In [1]:
import pandas as pd
from datetime import datetime

# clean text
import re
import contractions
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import spacy
nlp = spacy.load('en_core_web_sm')

# Import and Clean Data

In [2]:
# Extra tokens to drop on top of NLTK's English stop words.
# NOTE(review): "x" and "b" are presumably leftovers of entities such as
# `&#x200B;` once non-letters are stripped — TODO confirm.
# Only missing entries are added, so re-running this cell does not keep
# growing the list with duplicates.
stop_words.extend([extra for extra in ("'s", "b", "x") if extra not in stop_words])
def clean_text(text):
    """Normalize a raw post into a space-separated string of lemmas.

    Steps, in order:
      1. drop whitespace-delimited tokens that start with ``@`` (mentions);
      2. drop tokens that contain ``http`` (URLs);
      3. expand contractions, token by token, with ``contractions.fix``;
      4. replace every non-letter with a space and lowercase the result;
      5. squeeze any character repeated 3+ times down to two occurrences;
      6. lemmatize with the module-level spaCy pipeline ``nlp``;
      7. drop tokens present in the module-level ``stop_words`` list.

    Args:
        text: the raw post body as a string.

    Returns:
        The cleaned text as a single string; the empty string when no token
        survives the filters.
    """
    # str.split() never yields empty tokens, so indexing [0] is safe.
    tokens = [tok for tok in text.split() if tok[0] != '@']

    # remove URLs
    tokens = [tok for tok in tokens if 'http' not in tok]

    # Expand contractions while the apostrophes are still in place. Doing this
    # after the non-letter strip (as before) meant contractions.fix never saw
    # a form like "won't" — only the fragments "won" and "t".
    expanded = " ".join(contractions.fix(tok) for tok in tokens)

    # remove special characters/numbers and convert to lowercase
    letters_only = re.sub("[^a-zA-Z]", ' ', expanded).lower()

    # Collapse runs of 3+ identical characters to two. The pattern also
    # matches runs of exactly two, but replaces them with themselves.
    squeezed = re.sub(r'(.)\1+', r'\1\1', letters_only)

    # Normalize whitespace so the lemmatizer receives single-spaced text.
    squeezed = " ".join(squeezed.split())

    # transform words into root words
    doc = nlp(squeezed)
    lemmas = " ".join(token.lemma_.lower() for token in doc)

    # Remove stop words; a set gives constant-time membership checks.
    blocked = set(stop_words)
    return " ".join(word for word in lemmas.split() if word not in blocked)

In [3]:
# Load the Opiates export from disk.
df1 = pd.read_csv('SubredditDatasets/opiates.csv')

# Keep rows that have a body and whose `created` epoch value is at or before
# the cutoff, drop exact duplicate rows, and renumber the index from zero.
opiates = (
    df1.loc[lambda d: d['body'].notnull() & d['created'].le(1622519999)]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

# Derive a day-resolution `date` column from the epoch timestamp.
# NOTE(review): datetime.fromtimestamp() converts using the local timezone of
# the machine running the notebook — confirm that is the intended reference.
opiates['date'] = pd.to_datetime(
    opiates['created'].map(lambda ts: datetime.fromtimestamp(ts).strftime('%Y-%m-%d'))
)

# These columns are not used after this point.
opiates = opiates.drop(columns=['title', 'created', 'id', 'url'])

# Run the text-cleaning pipeline on every post body.
opiates['cleaned_text'] = opiates['body'].map(clean_text)

# Whitespace-split tokens of the cleaned text, and how many there are per post.
opiates['tokens'] = opiates['cleaned_text'].map(str.split)
opiates['post_length'] = opiates['tokens'].map(len)

In [4]:
# Load the Opiates Recovery export from disk.
df2 = pd.read_csv('SubredditDatasets/recovery.csv')

# Keep rows that have a body and whose `created` epoch value is at or before
# the cutoff, drop exact duplicate rows, and renumber the index from zero.
recovery = (
    df2.loc[lambda d: d['body'].notnull() & d['created'].le(1622519999)]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

# Derive a day-resolution `date` column from the epoch timestamp.
# NOTE(review): datetime.fromtimestamp() converts using the local timezone of
# the machine running the notebook — confirm that is the intended reference.
recovery['date'] = pd.to_datetime(
    recovery['created'].map(lambda ts: datetime.fromtimestamp(ts).strftime('%Y-%m-%d'))
)

# These columns are not used after this point.
recovery = recovery.drop(columns=['title','created', 'id', 'url'])

# Run the text-cleaning pipeline on every post body.
recovery['cleaned_text'] = recovery['body'].map(clean_text)

# Whitespace-split tokens of the cleaned text, and how many there are per post.
recovery['tokens'] = recovery['cleaned_text'].map(str.split)
recovery['post_length'] = recovery['tokens'].map(len)

In [5]:
# Discard posts whose cleaned text ended up empty.
opiates = opiates.loc[opiates['cleaned_text'].ne('')]
recovery = recovery.loc[recovery['cleaned_text'].ne('')]

In [6]:
# Write both cleaned frames to CSV without the index column.
# The `tokens` column holds Python lists, which CSV stores as their text form.
opiates.to_csv(path_or_buf='SubredditDatasets/opiates_cleaned.csv', index=False)
recovery.to_csv(path_or_buf='SubredditDatasets/recovery_cleaned.csv', index=False)