In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
import nltk
import re
import string

In [2]:
# Load the 2022-2023 Reddit threads and keep an untouched copy of every comment
# next to the column that is cleaned below.
reddit_df = pd.read_csv("Reddit-Threads_2022-2023.csv")
reddit_df['original text'] = reddit_df['text'].copy()

# remove stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# The English list comes from NLTK (a, about, above, ..., yourself, yourselves) and
# contains contractions such as "don't" / "isn't"; its exact contents depend on the
# installed NLTK data.

#add Singlish stop words
# NOTE(review): the multi-word entries ('ah beng', 'can lah', ...) can never match,
# because filtering below compares one whitespace-separated token at a time.
singlish_stopwords = {'ah', 'ah beng', 'ah boh', 'ahbeng', 'ahlian', 'ahyo', 'aiya', 'aiyo', 'aiyoh', 'alamak', 'also', 'already', 
    'ang moh', 'angkat', 'anyhow', 'arbo', 'atas', 'bah', 'balik', 'bao', 'bo', 'bochap', 'boh eng', 'boh liao', 
    'boleh', 'bro', 'brudder', 'bruh', 'buay', 'can', 'can lah', 'chao', 'cheem', 'chey', 'chiobu', 'chio', 
    'chiong', 'chop chop', 'dai ji', 'den', 'dude', 'dun', 'eh', 'ehh', 'even', 'gao', 'got meh', 'guai', 
    'hao lian', 'heng', 'hor', 'huh', 'is it', 'issit', 'izzit', 'jalan jalan', 'jialat', 'jin', 'just', 'kena', 
    'kiasi', 'keke', 'kena', 'kopi', 'kwai', 'la', 'la kopi', 'lah', 'lahh', 'lahz', 'laks', 'liao', 'liaoz', 
    'liaozz', 'like', 'lor', 'lor lor', 'lorh', 'lorz', 'mai', 'makan', 'makan makan', 'machiam', 'mah', 'malu', 
    'maybe', 'meh', 'nah', 'nia', 'no lah', 'nope', 'not', 'ok', 'ok lah', 'okie', 'omg', 'onz', 'pai seh', 
    'paiseh', 'really', 'relak', 'right', 'sial', 'sialz', 'sia lah', 'siaoz', 'sian', 'sianz', 'shiok', 
    'shiok lah', 'simi', 'sis', 'steady', 'still', 'suka', 'suka suka', 'suay', 'tsk', 'tahan', 'tapao', 
    'terok', 'then', 'tink', 'tio', 'tok', 'true', 'wah', 'wah lao', 'wah piang', 'wahhh', 'waseh', 'what', 
    'when', 'where', 'who', 'yah', 'yet', 'yeap', 'yea', 'yup', 'zai', 'zhun'
}

stop_words.update(singlish_stopwords)

# To lowercase; convert all values to strings, replace NaNs with an empty string
reddit_df['text'] = reddit_df['text'].str.lower().fillna('').astype(str)

# Remove '&gt;' from the 'text' column.
# This must run BEFORE punctuation is stripped: '&' and ';' are punctuation, so
# stripping first turns '&gt;' into a stray 'gt' token that this replace can no
# longer find.
reddit_df['text'] = reddit_df['text'].str.replace('&gt;', '', regex=False)

# Map the typographic apostrophe to the ASCII one so contractions written as
# "don’t" are recognised by the stop-word pass below.
reddit_df['text'] = reddit_df['text'].str.replace('\u2019', "'", regex=False)

# First stop-word pass, while apostrophes are still present. Contractions in the
# stop-word list ("don't", "isn't", ...) can only match here; once punctuation is
# stripped they become "dont", "isnt", ... which are not in the list. Punctuation
# on the edges of a token (e.g. "don't,") is ignored for the comparison only.
reddit_df['text'] = reddit_df['text'].apply(
    lambda text: ' '.join(
        token for token in text.split()
        if token.strip(string.punctuation) not in stop_words
    )
)

# Remove punctuation using str.replace() and string.punctuation. re.escape keeps
# the character class valid regardless of the order of characters in
# string.punctuation (it contains ']', '\\', '^' and '-').
reddit_df['text'] = reddit_df['text'].str.replace(
    f'[{re.escape(string.punctuation)}]', '', regex=True
)

# Second stop-word pass: catches tokens that only become stop words once inner
# punctuation is gone (e.g. "o.k." -> "ok").
reddit_df['text'] = reddit_df['text'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_words)
)

# remove chinese characters
reddit_df['text'] = reddit_df['text'].str.replace(r'[\u4e00-\u9fff]+', '', regex=True)

# remove emoji and other characters
def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range (0-127) deleted."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

# Strip emoji / other non-ASCII characters, then collapse the whitespace they leave
# behind. A token made only of removed characters would otherwise leave double
# spaces (or a whitespace-only string) that slips past the empty-string filter
# below and is counted as extra "words" by any later split on ' '.
reddit_df['text'] = (
    reddit_df['text']
    .astype(str)
    .apply(remove_non_ascii)
    .str.split()
    .str.join(' ')
)

# remove '[deleted]', '.', '[removed]', ''
# (punctuation was stripped earlier in this cell, so the placeholders appear here
# without brackets and '.' appears as the empty string)
toremove = ['deleted', 'removed', '']
reddit_df = reddit_df[~reddit_df['text'].isin(toremove)]

[nltk_data] Downloading package stopwords to /Users/eushae-
[nltk_data]     anne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Write the cleaned 2022-2023 frame to CSV (pandas writes the index as the first column by default).
reddit_df.to_csv(path_or_buf="cleaned_data_2223.csv")

In [6]:
# Read the 2020-2021 export in 1000-row chunks with the Python parser engine,
# skipping malformed lines, then stitch the chunks into one frame with a fresh index.
chunks = list(
    pd.read_csv(
        "Reddit-Threads_2020-2021.csv",
        engine='python',
        chunksize=1000,
        on_bad_lines='skip',
    )
)

reddit_df = pd.concat(chunks, ignore_index=True)

In [7]:
# Keep an untouched copy of every comment next to the column that is cleaned below.
reddit_df['original text'] = reddit_df['text'].copy()

# remove stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# The English list comes from NLTK (a, about, above, ..., yourself, yourselves) and
# contains contractions such as "don't" / "isn't"; its exact contents depend on the
# installed NLTK data.

#add Singlish stop words
# NOTE(review): the multi-word entries ('ah beng', 'can lah', ...) can never match,
# because filtering below compares one whitespace-separated token at a time.
singlish_stopwords = {'ah', 'ah beng', 'ah boh', 'ahbeng', 'ahlian', 'ahyo', 'aiya', 'aiyo', 'aiyoh', 'alamak', 'also', 'already', 
    'ang moh', 'angkat', 'anyhow', 'arbo', 'atas', 'bah', 'balik', 'bao', 'bo', 'bochap', 'boh eng', 'boh liao', 
    'boleh', 'bro', 'brudder', 'bruh', 'buay', 'can', 'can lah', 'chao', 'cheem', 'chey', 'chiobu', 'chio', 
    'chiong', 'chop chop', 'dai ji', 'den', 'dude', 'dun', 'eh', 'ehh', 'even', 'gao', 'got meh', 'guai', 
    'hao lian', 'heng', 'hor', 'huh', 'is it', 'issit', 'izzit', 'jalan jalan', 'jialat', 'jin', 'just', 'kena', 
    'kiasi', 'keke', 'kena', 'kopi', 'kwai', 'la', 'la kopi', 'lah', 'lahh', 'lahz', 'laks', 'liao', 'liaoz', 
    'liaozz', 'like', 'lor', 'lor lor', 'lorh', 'lorz', 'mai', 'makan', 'makan makan', 'machiam', 'mah', 'malu', 
    'maybe', 'meh', 'nah', 'nia', 'no lah', 'nope', 'not', 'ok', 'ok lah', 'okie', 'omg', 'onz', 'pai seh', 
    'paiseh', 'really', 'relak', 'right', 'sial', 'sialz', 'sia lah', 'siaoz', 'sian', 'sianz', 'shiok', 
    'shiok lah', 'simi', 'sis', 'steady', 'still', 'suka', 'suka suka', 'suay', 'tsk', 'tahan', 'tapao', 
    'terok', 'then', 'tink', 'tio', 'tok', 'true', 'wah', 'wah lao', 'wah piang', 'wahhh', 'waseh', 'what', 
    'when', 'where', 'who', 'yah', 'yet', 'yeap', 'yea', 'yup', 'zai', 'zhun'
}

stop_words.update(singlish_stopwords)

# To lowercase; convert all values to strings, replace NaNs with an empty string
reddit_df['text'] = reddit_df['text'].str.lower().fillna('').astype(str)

# Remove '&gt;' from the 'text' column.
# This must run BEFORE punctuation is stripped: '&' and ';' are punctuation, so
# stripping first turns '&gt;' into a stray 'gt' token that this replace can no
# longer find.
reddit_df['text'] = reddit_df['text'].str.replace('&gt;', '', regex=False)

# Map the typographic apostrophe to the ASCII one so contractions written as
# "don’t" are recognised by the stop-word pass below.
reddit_df['text'] = reddit_df['text'].str.replace('\u2019', "'", regex=False)

# First stop-word pass, while apostrophes are still present. Contractions in the
# stop-word list ("don't", "isn't", ...) can only match here; once punctuation is
# stripped they become "dont", "isnt", ... which are not in the list. Punctuation
# on the edges of a token (e.g. "don't,") is ignored for the comparison only.
reddit_df['text'] = reddit_df['text'].apply(
    lambda text: ' '.join(
        token for token in text.split()
        if token.strip(string.punctuation) not in stop_words
    )
)

# Remove punctuation using str.replace() and string.punctuation. re.escape keeps
# the character class valid regardless of the order of characters in
# string.punctuation (it contains ']', '\\', '^' and '-').
reddit_df['text'] = reddit_df['text'].str.replace(
    f'[{re.escape(string.punctuation)}]', '', regex=True
)

# Second stop-word pass: catches tokens that only become stop words once inner
# punctuation is gone (e.g. "o.k." -> "ok").
reddit_df['text'] = reddit_df['text'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_words)
)

# remove chinese characters
reddit_df['text'] = reddit_df['text'].str.replace(r'[\u4e00-\u9fff]+', '', regex=True)

# remove emoji and other characters
def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range (0-127) deleted."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

# Strip emoji / other non-ASCII characters, then collapse the whitespace they leave
# behind. A token made only of removed characters would otherwise leave double
# spaces (or a whitespace-only string) that slips past the empty-string filter
# below and is counted as extra "words" by any later split on ' '.
reddit_df['text'] = (
    reddit_df['text']
    .astype(str)
    .apply(remove_non_ascii)
    .str.split()
    .str.join(' ')
)

# remove '[deleted]', '.', '[removed]', ''
# (punctuation was stripped earlier in this cell, so the placeholders appear here
# without brackets and '.' appears as the empty string)
toremove = ['deleted', 'removed', '']
reddit_df = reddit_df[~reddit_df['text'].isin(toremove)]

[nltk_data] Downloading package stopwords to /Users/eushae-
[nltk_data]     anne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Write the cleaned 2020-2021 frame to CSV (pandas writes the index as the first column by default).
reddit_df.to_csv(path_or_buf="cleaned_data_2021.csv")

## Filter for comments longer than 8 words 

In [18]:
# Reload the cleaned 2020-2021 comments; the first CSV column is used as the index.
data_2020 = pd.read_csv("cleaned_data_2021.csv", index_col=0)
data_2020['timestamp'] = pd.to_datetime(data_2020['timestamp'])

# Count whitespace-separated tokens. split() with no separator collapses runs of
# spaces and ignores leading/trailing ones, whereas split(' ') counts an empty
# string for every extra space and so over-counts comments whose emoji or other
# stripped characters left gaps behind.
# Rows whose text was read back as NaN get a NaN length and fail the > 8 test below.
data_2020['comment_length'] = data_2020['text'].str.split().str.len()
data_2020['text'] = data_2020['text'].astype(str)

# Keep only comments with more than 8 words.
data_2020_long = data_2020[data_2020['comment_length'] > 8]

data_2020_long.to_csv("data_2020_long.csv")

In [19]:
# Reload the cleaned 2022-2023 comments; the first CSV column is used as the index.
data_2022 = pd.read_csv("cleaned_data_2223.csv", index_col=0)
data_2022['timestamp'] = pd.to_datetime(data_2022['timestamp'])

# Count whitespace-separated tokens. split() with no separator collapses runs of
# spaces and ignores leading/trailing ones, whereas split(' ') counts an empty
# string for every extra space and so over-counts comments whose emoji or other
# stripped characters left gaps behind.
# Rows whose text was read back as NaN get a NaN length and fail the > 8 test below.
data_2022['comment_length'] = data_2022['text'].str.split().str.len()
data_2022['text'] = data_2022['text'].astype(str)

# Keep only comments with more than 8 words.
data_2022_long = data_2022[data_2022['comment_length'] > 8]

data_2022_long.to_csv("data_2022_long.csv")