## Preprocessing

In [1]:
# Doing the necessary import to carry out data cleaning. 
import pandas as pd
import re
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, \
                            accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read df.
df = pd.read_csv('../data/hf_df_no_outliers.csv')

print(df.shape)
df.head()

(211214, 32)


Unnamed: 0,text,author,subreddit,admiration,amusement,anger,annoyance,approval,caring,confusion,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,post_length
0,That game hurt.,Brdd9,nrl,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,15
1,>sexuality shouldn’t be a grouping category I...,TheGreen888,unpopularopinion,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,127
2,"You do right, if you don't care then fuck 'em!",Labalool,confessions,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,46
3,Man I love reddit.,MrsRobertshaw,facepalm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,18
4,"[NAME] was nowhere near them, he was by the Fa...",American_Fascist713,starwarsspeculation,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,52


In [3]:
# Sieving out only the emotion columns. 
df_temp = df.copy()
df_temp.drop(['text','author', 'subreddit', 'post_length'], axis=1, inplace=True)

In [4]:
# To create a new column that consolidates the emotion the post displays.
df['emotion']=df_temp.idxmax(axis=1)

In [5]:
# Check df.
df

Unnamed: 0,text,author,subreddit,admiration,amusement,anger,annoyance,approval,caring,confusion,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,post_length,emotion
0,That game hurt.,Brdd9,nrl,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,15,sadness
1,>sexuality shouldn’t be a grouping category I...,TheGreen888,unpopularopinion,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,127,admiration
2,"You do right, if you don't care then fuck 'em!",Labalool,confessions,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,46,neutral
3,Man I love reddit.,MrsRobertshaw,facepalm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18,love
4,"[NAME] was nowhere near them, he was by the Fa...",American_Fascist713,starwarsspeculation,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,52,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211209,Everyone likes [NAME].,Senshado,heroesofthestorm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,22,love
211210,Well when you’ve imported about a gazillion of...,5inchloser,nottheonion,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,88,caring
211211,That looks amazing,springt1me,shittyfoodporn,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18,admiration
211212,The FDA has plenty to criticize. But like here...,enamedata,medicine,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,148,anger


In [6]:
# Checking the emotions I have so I can assign them numerical values accordingly.
df['emotion'].unique()

array(['sadness', 'admiration', 'neutral', 'love', 'gratitude',
       'disapproval', 'amusement', 'disappointment', 'realization',
       'annoyance', 'confusion', 'optimism', 'curiosity', 'excitement',
       'caring', 'disgust', 'remorse', 'joy', 'approval', 'embarrassment',
       'surprise', 'anger', 'grief', 'pride', 'desire', 'relief', 'fear',
       'nervousness'], dtype=object)

**Assigning numbers to classes**

In [7]:
# Assigning the negative emotions the smaller numbers, with neutral taking 13 and rest of the positive >=14.
df['emotion_num'] = df['emotion'].map({'grief' : 1, 'remorse' : 2, 'disgust': 3, 'anger': 4, 'fear': 5,
       'annoyance': 6, 'confusion': 7, 'nervousness': 8, 'disapproval': 9,
       'disappointment': 10, 'sadness': 11, 'embarrassment': 12, 'neutral': 13, 'realization': 14,
       'relief': 15, 'approval': 16, 'curiousity': 17, 'amusement': 18, 'caring': 19, 'joy': 20,
       'admiration': 21, 'surprise': 22, 'gratitude': 23, 'love': 24, 'optimism': 25, 'pride': 26, 'desire': 27,
       'excitement': 28})

In [8]:
# Checking the new column for missing values since some rows may not have had emotions assigned. 
df[df['emotion_num'].isna()]

Unnamed: 0,text,author,subreddit,admiration,amusement,anger,annoyance,approval,caring,confusion,...,pride,realization,relief,remorse,sadness,surprise,neutral,post_length,emotion,emotion_num
23,Now I'm wondering on what I've been missing ou...,JonJonRegayov,morbidquestions,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,74,curiosity,
66,Damn. That’s profound. Did you graduate from H...,Mahgugu,gatekeeping,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,53,curiosity,
71,"The service made it 20 years ago (1999, yeah i...",NOT_the_MI6,facepalm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,73,curiosity,
74,Who exactly are their ‘volunteers’? The IRA?,TrooperNI,northernireland,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,44,curiosity,
87,I wanted to know if there’s was another like m...,ZeroDeadlySin,askwomenadvice,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,105,curiosity,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211063,"Tell me it's real, the feeling that we feel",TeopEvol,HadToHurt,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,43,curiosity,
211068,That guy who said that it's a good idea to nev...,Plays-0-Cost-Cards,seduction,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,114,curiosity,
211133,"Ok I get that, like I said, different morals f...",WakeoftheStorm,datingoverthirty,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,123,curiosity,
211163,"Ok I get that, like I said, different morals f...",WakeoftheStorm,datingoverthirty,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,123,curiosity,


In [9]:
# Dropping those 7k+ rows with no emotions. My dataset is sufficient so no problem dropping these.
df.dropna(subset=['emotion_num'],inplace=True)

In [10]:
# Making my category an int instead of the float it was.
df['emotion_num']=df['emotion_num'].astype('int32')

**Clean up text**

In [11]:
# Creating a function to clean my text later. 
def cleaner(text):
    
    # Remove non ASCII characters.
    text = text.encode('ascii',errors='ignore').decode()
    
    # Remove [deleted] and [removed] tags
    text = re.sub(r'\[[^()]*\]', "", text)

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'http\S+', "", text)
    
    # Remove punctuation and split 's, 't, 've with a space for filter
    text = re.sub("[^\w\s]", "", text)
    
    # Remove non-standard characters eg. chinese/tamil characters
    text = re.sub(r'[^0-9a-zA-Z\s]','', text)
    
    # Make lowercase
    text = text.lower()
    
    return text

In [12]:
# apply cleaner function to clean text, save results in new column called cleaned_text.
df['cleaned_text'] = df['text'].map(cleaner)

**Dealing with stopwords**

In [13]:
stop = stopwords.words("english") # declare stopword

In [14]:
# Create new column with stopword removed
df['no_stopword_text'] = df['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [15]:
# Instantiating.
lemmatizer = WordNetLemmatizer()

In [16]:
# Lemmatize my text column.
df['lemm_text'] = [lemmatizer.lemmatize(w) for w in df['no_stopword_text']]

In [17]:
# Initializing tfidf.
tvec = TfidfVectorizer()

In [19]:
# Define my X.
X = pd.DataFrame(
    tvec.fit_transform(df['lemm_text']).todense(),
    columns = tvec.get_feature_names()
)

In [20]:
# Define my y.
y = df['emotion_num']

In [22]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

MemoryError: Unable to allocate 37.1 GiB for an array with shape (152630, 32649) and data type float64