# Data Cleaning
### Parsing, cleaning and structuring the journal data
### 1. Read in the data

In [None]:
import pandas as pd
import numpy as np
import re
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is present locally
nltk.download('stopwords')

# Location of the raw workbook
file_path = "../data/raw/daily_sentences.xlsx"

# sheet_name=None loads every sheet, keyed by sheet name
sheets = pd.read_excel(file_path, sheet_name=None)
df_list = []
for sheet_name in sheets:
    sheet_df = sheets[sheet_name]
    # Split each entry at its first space: leading token -> 'date', remainder -> 'sentence'
    date_and_text = sheet_df[f'Daily Sentence {sheet_name}'].str.split(' ', n=1, expand=True)
    sheet_df[['date', 'sentence']] = date_and_text
    # Append the sheet name to the date token (adds the year to the date)
    sheet_df['date'] = sheet_df['date'].astype(str) + '/' + str(sheet_name)
    df_list.append(sheet_df)

# Stack the sheets into one DataFrame and keep only the two parsed columns
df = pd.concat(df_list, ignore_index=True)[['date', 'sentence']]

# Parse the 'date' strings as month/day/year; errors='raise' stops on any malformed date
df['date'] = pd.to_datetime(df['date'], errors='raise', format='%m/%d/%Y')
# Entries without any text after the date have no sentence; store them as ''
df['sentence'] = df['sentence'].fillna('')
# Calendar fields derived from the parsed date
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.to_period('M')
df['day'] = df['date'].dt.day

print("Length of DataFrame before cleaning:", len(df))

Length of DataFrame before cleaning: 1299


### 2. Clean the data
Check for duplicates/NaNs. I accidentally entered '02-26' instead of '03-26' and '11-27' instead of '11-28'.

NOTE: This step would need to be updated and improved before being used on a different dataset.

In [16]:
# Error checking for duplicate days and entries
duplicates = df[df.duplicated('date', keep=False)]

if len(duplicates) > 0:
    print(f"{len(duplicates)} duplicates found...")
    # There are two duplicate entries. One has the wrong day and one has the wrong month.
    # NOTE: these row labels and dates are specific to this dataset.
    # Timestamps (rather than strings) keep the column's datetime dtype explicit.
    df.loc[84, 'date'] = pd.Timestamp('2025-03-26')
    df.loc[922, 'date'] = pd.Timestamp('2021-11-28')
    # year/month/day were derived from the uncorrected dates, so rebuild them;
    # otherwise the two corrected rows keep a stale month/day in the output.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.to_period('M')
    df['day'] = df['date'].dt.day
    # Recheck for duplicates
    duplicates = df[df.duplicated('date', keep=False)]
    print(f"{len(duplicates)} duplicates remaining")
else:
    print("No duplicates found")


# Checking for any null dates
print(df['date'].isna().sum(), "null dates found")

4 duplicates found...
0 duplicates remaining
0 null dates found


Clean and standardize the text

In [18]:
# Stop words: NLTK's English list plus a few verbs/nouns specific to this journal
custom_stop_words = {'got', 'went', 'saw', 'made', 'played', 'home', 'drove', 'day', 'took'}
stop_words = set(stopwords.words('english')) | custom_stop_words

# A token is any run of word characters delimited by word boundaries
pattern = re.compile(r'\b\w+\b')


def tokenize_and_filter(text):
    """Lower-case ``text``, split it into word tokens and drop stop words.

    Returns the surviving tokens joined by single spaces, or an empty
    string when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        kept = []
        for word in pattern.findall(text.lower()):
            if word in stop_words:
                continue
            kept.append(word)
        return ' '.join(kept)
    return ""


df['cleaned_words'] = df['sentence'].apply(tokenize_and_filter)
df.head(3)

Unnamed: 0,date,sentence,year,month,day,cleaned_words
0,2025-01-01,Got back home at like 4 and slept in. Got cava...,2025,2025-01,1,back like 4 slept cava cookies
1,2025-01-02,Got taco chinoz and boba with guys and Oscar a...,2025,2025-01,2,taco chinoz boba guys oscar almost folded yurie
2,2025-01-03,Got new tires. Started snowing. Got Ko Hyang w...,2025,2025-01,3,new tires started snowing ko hyang yurie mall ...


Another variation of cleaned text

In [19]:
import spacy

# Load the small English pipeline with the NER and parser components disabled;
# only lemmas and the stop-word / alphabetic token flags are used below.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])


def clean_with_spacy(text):
    """Lower-case ``text``, run it through spaCy and return the kept lemmas.

    Tokens flagged as stop words or as non-alphabetic are dropped; the lemmas
    of the remaining tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)


df['cleaned_words2'] = df['sentence'].astype(str).apply(clean_with_spacy)



Output the results

In [20]:
# Write the cleaned frame to Excel; index=False leaves the row index out of the sheet
cleaned_path = "../data/cleaned/daily_sentences_cleaned.xlsx"
df.to_excel(cleaned_path, index=False)