# Sentiment Classification on Text Data 

**Authors:** <br>
Ian Colson (colso031) <br>
Maitrayee Deka (deka0031)<br>
Sneha Patri(patri316) <br>
Aditi Patil (patil112) <br>
Benjamin Swenson(swen0754) <br>

---

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("combined_data.csv")
df

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
61454,fear,Melissa stared at her friend in dism
61455,happy,Successive state elections have seen the gover...
61456,fear,Vincent was irritated but not dismay
61457,happy,Kendall-Hume turned back to face the dismayed ...


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stopWords = stopwords.words("english")

def cleanText(text):
    
    output = ""
    
    
    for word in text.split():
        cleanWord = re.sub("[^a-zA-Z]",  # Search for all non-letters
                          " ",          # Replace all non-letters with spaces
                          str(word))
        if cleanWord in stopWords:
            pass
        else:
            output = output + " " + cleanWord
    return output

In [4]:
df.Text = df.Text.apply(lambda x: cleanText(x))

In [5]:
import re
df['Text'] = df['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))
df.dropna(inplace=True)

In [6]:
import snowballstemmer
ss = snowballstemmer.stemmer('english')
def replace(x):
    words = x.split()
    newtext = ''
    for w in words:
        n = ss.stemWord(w)
        newtext += n
        newtext += " "
    return newtext
# Applying the stemmer
df['Text'] = df['Text'].apply(lambda x: replace(x))
df.head()

Unnamed: 0,sentiment,Text
0,sadness,tiffanylu know listenin bad habit earlier star...
1,sadness,Layin n bed headach ughhhh waitin call
2,sadness,Funer ceremoni gloomi friday
3,happy,want hang friend SOON
4,neutral,dannycastillo We want trade someon Houston tic...


In [7]:
# function for stratifying the dataset (oversampling)
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

#### Train test split

In [8]:
from sklearn.model_selection import train_test_split
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
# Balanced_Train, Balanced_Test = train_test_split(df, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(df['Text'], 
                                                    df['sentiment'],test_size=0.20, 
                                                    random_state=42)
# Notice here that We are splitting the X and y components

In [9]:
# join test and train X and y's together to stratify
data_train = pd.concat([X_train,y_train],axis=1)
data_test =  pd.concat([X_test,y_test],axis=1)
# balanced data frame with X and y
train_balanced = stratify(data_train, 8000)
test_balanced  = stratify(data_test, 2000)

In [10]:
# create X and y vectors for balanced training set
strat_X_train = train_balanced["Text"]
strat_y_train = train_balanced["sentiment"]
# create X and y vectors for balanced test set
strat_X_test = test_balanced["Text"]
strat_y_test = test_balanced["sentiment"]

### Creating a document matrix

#### Train set

In [11]:
# Applying the count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english', max_features = 10000)
# unbalanced train
X_train_counts = count_vect.fit_transform(X_train)
# balanced train
X_train_strat_counts = count_vect.fit_transform(strat_X_train)

# Applying the TFidf transformer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# unbalanced train
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# balanced train
X_train_strat_tfidf = tfidf_transformer.fit_transform(X_train_strat_counts)

#### Test set

In [12]:
# unbalanced testset
# count vectorizer
X_test_counts = count_vect.transform(X_test)
# Tfidf transformer
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# balanced testset
# count vectorizer
X_test_strat_counts = count_vect.transform(strat_X_test)
# Tfidf transformer
X_test_strat_tfidf = tfidf_transformer.transform(X_test_strat_counts)

## ANN

## Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
#Training on unbalanced data set¶
clf1 = MultinomialNB().fit(X_train_tfidf, y_train)
# Training on a balanced dataset
clf2 = MultinomialNB().fit(X_train_strat_tfidf, strat_y_train)

In [15]:
# training all models
prediction1 = clf1.predict(X_test_tfidf)
prediction2 = clf2.predict(X_test_strat_tfidf)
prediction3 = clf1.predict(X_test_strat_tfidf)
prediction4 = clf2.predict(X_test_tfidf)

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Skewed training and skewed test data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction1)))
print('precision score: {}'.format(precision_score(y_test, prediction1, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction1,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction1,average='macro')))

print('Balanced training and balanced test data:')
print('Accuracy score: {}'.format(accuracy_score(strat_y_test, prediction2)))
print('precision score: {}'.format(precision_score(strat_y_test, prediction2, average='macro')))
print('recall score: {}'.format(recall_score(strat_y_test, prediction2,average='macro')))
print('f1 score: {}'.format(f1_score(strat_y_test, prediction2,average='macro')))

print('Skewed training and balanced test data:')
print('Accuracy score: {}'.format(accuracy_score(strat_y_test, prediction3)))
print('precision score: {}'.format(precision_score(strat_y_test, prediction3, average='macro')))
print('recall score: {}'.format(recall_score(strat_y_test, prediction3,average='macro')))
print('f1 score: {}'.format(f1_score(strat_y_test, prediction3,average='macro')))

print('Balanced training and skewed test data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction4)))
print('precision score: {}'.format(precision_score(y_test, prediction4, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction4,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction4,average='macro')))

Skewed training and skewed test data:
Accuracy score: 0.18841522941750732
precision score: 0.12013698308344199
recall score: 0.1239743724620403
f1 score: 0.11669090904506084
Balanced training and balanced test data:
Accuracy score: 0.3285625
precision score: 0.32875242640183305
recall score: 0.3285625
f1 score: 0.32573458777531394
Skewed training and balanced test data:
Accuracy score: 0.119625
precision score: 0.09734440435696556
recall score: 0.119625
f1 score: 0.09118239321505286
Balanced training and skewed test data:
Accuracy score: 0.31727953140253823
precision score: 0.2857202742345694
recall score: 0.3195375270229954
f1 score: 0.28544041763898


#### Real world data

In [22]:
dfReal = pd.read_csv('realWorldEmotions.csv')
# Removing punctuation, URL, and tags
dfReal["Text"] = dfReal.Text.apply(lambda x: cleanText(x))
dfReal['Text'] = dfReal['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))
#drop na values
dfReal.dropna(inplace=True)
#stem the text
dfReal['Text'] = dfReal['Text'].apply(lambda x: replace(x))
dfReal['Sentiment'] = np.where((dfReal.Sentiment == 'joy'),'happy', dfReal.Sentiment)

In [23]:
# Conversion to document matrix 
X = dfReal["Text"]
y = dfReal["Sentiment"]
# count vectorizer
X_count = count_vect.transform(X)
# Tfidf transformer
X_tfidf = tfidf_transformer.transform(X_count)

In [24]:
# prediction on unbalanced trained model
prediction_real1 = clf1.predict(X_tfidf)
# prediction on balanced trained model
prediction_real2 = clf2.predict(X_tfidf)

In [25]:
print('Skewed training model tested on real data:')
print('Accuracy score: {}'.format(accuracy_score(y, prediction_real1)))
print('precision score: {}'.format(precision_score(y, prediction_real1, average='macro')))
print('recall score: {}'.format(recall_score(y, prediction_real1,average='macro')))
print('f1 score: {}'.format(f1_score(y, prediction_real1,average='macro')))

print('Balanced training model tested on real data:')
print('Accuracy score: {}'.format(accuracy_score(y, prediction_real2)))
print('precision score: {}'.format(precision_score(y, prediction_real2, average='macro')))
print('recall score: {}'.format(recall_score(y, prediction_real2,average='macro')))
print('f1 score: {}'.format(f1_score(y, prediction_real2,average='macro')))

Skewed training model tested on real data:
Accuracy score: 0.1875
precision score: 0.14949568091452797
recall score: 0.10520160022368999
f1 score: 0.11379822978248186
Balanced training model tested on real data:
Accuracy score: 0.6135
precision score: 0.4795261846978647
recall score: 0.4398708967753452
f1 score: 0.42573535010877617


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

### One vs Rest

### One vs One