In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import random
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zroy1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def clean_text(text):
    text = re.sub(r"[!@#$(),\n%^*?.'\:;~`0-9]", '', str(text))
    text = text.lower()

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text, language='english', preserve_line=True)
    text = " ".join(word for word in word_tokens if not word.lower() in stop_words)
    
    return text

In [4]:
def prediction(text, model, cv):
    text = clean_text(text)
    X = cv.transform([text]).toarray()
    return model.predict_proba(X)

# Positivity Dataset

In [82]:
filename = "positivity.csv"
positivity = pd.read_csv(filename, encoding="ISO-8859-1")
positivity

Unnamed: 0.1,Unnamed: 0,sentiment,id,date,flag,user,text
0,0,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
1,1,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
2,2,0,1467812964,Mon Apr 06 22:20:22 PDT 2009,NO_QUERY,lovesongwriter,Hollis' death scene will hurt me severely to w...
3,3,0,1467825863,Mon Apr 06 22:23:43 PDT 2009,NO_QUERY,BrookeAmanda,i really hate how people diss my bands! Trace...
4,4,0,1467826052,Mon Apr 06 22:23:45 PDT 2009,NO_QUERY,paulseverio,Why won't you show my location?! http://twit...
...,...,...,...,...,...,...,...
49995,49995,1,2193552981,Tue Jun 16 08:36:49 PDT 2009,NO_QUERY,JDenouden,@NLPride08 that's a perfectly reasonable time ...
49996,49996,1,2193554410,Tue Jun 16 08:36:56 PDT 2009,NO_QUERY,Arr0wsmith,@Jihav actually my other PSN ID is GUNSnSPEEDO...
49997,49997,1,2193576655,Tue Jun 16 08:38:46 PDT 2009,NO_QUERY,eratyptin,"@siahoney I am good thanks! How is #Eric, I..."
49998,49998,1,2193577315,Tue Jun 16 08:38:49 PDT 2009,NO_QUERY,jamie_ox,doing another piece of media homework is that ...


In [7]:
positivity_X = positivity["text"]
positivity_y = positivity["sentiment"]

In [8]:
positivity_texts = []
for text in positivity_X:
    positivity_texts.append(clean_text(text))
positivity_texts

['whole body feels itchy like fire',
 'loltrish hey long time see yes rains bit bit lol im fine thanks hows',
 'hollis death scene hurt severely watch film wry directors cut',
 'really hate people diss bands trace clearly ugly',
 'wont show location http//twitpiccom/yes',
 'kpreyes remember bum leg strikes back time serious',
 'ozesteph shame hear stephan',
 'caitlinoconnor want tacos margarhitas telll gay say hello & lt',
 'missing bff watching home away reminds & lt shout u courts',
 'chriscantore congrats im totally jealous wish xm working',
 'marge_inovera tried tweetdeck hated passion hated im sure',
 'danadearmond',
 'trolley packed like sardines padre game remove car good call mts',
 'mamasvan lol - nope complete camerafail',
 'b_barnett really see coming',
 'nicolerichie oh yes miss',
 'eazydoesit negative lost vote confidence',
 'sofii_noel thatï¿½s bad',
 'crap need dresses',
 'huuuge headache omg feel like crap',
 'dammit need stop buying furniture',
 'ill cant go cinema',
 

In [9]:
# vectorize
positivity_cv = CountVectorizer(token_pattern=r'[^\s]+', max_features=7500)
positivity_X = positivity_cv.fit_transform(positivity_texts).toarray()
positivity_X.shape

(50000, 7500)

In [11]:
positivity_x_train, positivity_x_test, positivity_y_train, positivity_y_test = train_test_split(positivity_X, positivity_y, test_size = 0.25)

In [13]:
positivity_model = MultinomialNB()
positivity_model.fit(positivity_x_train, positivity_y_train)

In [15]:
positivity_y_pred = positivity_model.predict(positivity_x_test)
accuracy = accuracy_score(positivity_y_test, positivity_y_pred)
accuracy

0.75096

In [18]:
message = "This sucks"
prediction(message, positivity_model, positivity_cv)

array([[0.88535247, 0.11464753]])

In [23]:
positivity_model_filename = "positivity_model.pickle"
pickle.dump(positivity_model, open(positivity_model_filename, "wb"))
positivity_cv_filename = "positivity_cv.pickle"
pickle.dump(positivity_cv, open(positivity_cv_filename, "wb"))

# Emotions dataset

In [70]:
n = 416809 #number of records in file
s = 50000 #desired sample size
filename = "emotions.csv"
skip = sorted(random.sample(range(n),n-s))
labels = ["index", "text", "label"]
emotions = pd.read_csv(filename, skiprows=skip, encoding="ISO-8859-1", names=labels).drop(columns="index")
emotions

Unnamed: 0,text,label
0,i need to feel dangerous and pretty so here a ...,3
1,i don t feel submissive and for the time being...,0
2,i remember feeling after the third bong hit th...,4
3,i have a feeling that has to do with the unfor...,0
4,i was feeling extremely generous last night so...,1
...,...,...
49996,i chose vain beauty initially i wasnt very kee...,1
49997,i feel that i must confess even though it kill...,5
49998,i don t know why today i feel like it looks ve...,1
49999,i don t even feel like i fully resolved it but...,1


In [71]:
emotions_X = emotions["text"]
emotions_y = emotions["label"]

In [72]:
emotions["label"].value_counts()

1    16914
0    14491
3     6999
4     5635
2     4092
5     1870
Name: label, dtype: int64

In [73]:
emotions_text = []
for text in emotions_X:
    emotions_text.append(clean_text(text))
emotions_text

['need feel dangerous pretty striking dance pick deep vogue minutes ago',
 'feel submissive time lost interest bdsm stuff',
 'remember feeling third bong hit pressured',
 'feeling unfortunate fact family cat recently passed away',
 'feeling extremely generous last night companion ate double told lion stomach pipe',
 'feel strong confident intelligent ready step real world',
 'know im going disappoint ive decided volunteer year fear ill end feeling resentful',
 'able lose three kilo feeling energetic even month old things seem track',
 'id good days weeks months kept feeling like cant last life cant sweet',
 'feeling glamourous wear something glamourous feeling relaxed put feel relaxed basically fashion expression person',
 'feel awfully isolated one talk things',
 'really love reading bible feel presence lord jesus feel every single word says also amazed happened jesus sacrifice save us sin also looking back created everything world',
 'feeling bit sceptical',
 'ive come back montreal 

In [74]:
emotions_cv = CountVectorizer(token_pattern=r'[^\s]+', max_features=7500)
emotions_X = emotions_cv.fit_transform(emotions_text).toarray()
emotions_X.shape

(50001, 7500)

In [75]:
emotions_x_train, emotions_x_test, emotions_y_train, emotions_y_test = train_test_split(emotions_X, emotions_y, test_size = 0.25)

In [76]:
emotions_model = MultinomialNB()
emotions_model.fit(emotions_x_train, emotions_y_train)

In [77]:
emotions_y_pred = emotions_model.predict(emotions_x_test)
accuracy = accuracy_score(emotions_y_test, emotions_y_pred)
accuracy

0.863290936725062

In [78]:
# Six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5).
message = "This Sucks!"
prediction(message, emotions_model, emotions_cv)

array([[0.43022887, 0.16808946, 0.10338223, 0.15828618, 0.1120438 ,
        0.02796946]])

In [80]:
emotion_model_filename = "emotion_model.pickle"
pickle.dump(emotions_model, open(emotion_model_filename, "wb"))
emotion_cv_filename = "emotion_cv.pickle"
pickle.dump(emotions_cv, open(emotion_cv_filename, "wb"))