In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import random
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zroy1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime Anywhere Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My Ass Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The Ass",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My Ass Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age Sex Location",
    "THX": "Thank You",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The Fuck",
    "WTG": "Way To Go",
    "WUF": "Where Are You From",
    "W8": "Wait",
    "7K": "Sick Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I dont care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping bored tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Cant stop laughing"
}

In [3]:
def clean_text(text):
    
    text = re.sub(r"[!@#$(),\n%^*?.'\:;~`0-9]", '', str(text))
    text = text.split(" ")
    for i, word in enumerate(text):
        if word.upper() in chat_words:
            text[i] = chat_words[word.upper()]

    text = " ".join(word for word in text).lower()

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text, language='english', preserve_line=True)
    text = " ".join(word for word in word_tokens if not word.lower() in stop_words)

    
    return text

In [4]:
def prediction(text, model, cv):
    text = clean_text(text)
    X = cv.transform([text]).toarray()
    return model.predict_proba(X)

# Positivity Dataset

In [5]:
filename = "positivity.csv"
positivity = pd.read_csv(filename, encoding="ISO-8859-1")
positivity

Unnamed: 0.1,Unnamed: 0,sentiment,id,date,flag,user,text
0,0,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
1,1,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
2,2,0,1467812964,Mon Apr 06 22:20:22 PDT 2009,NO_QUERY,lovesongwriter,Hollis' death scene will hurt me severely to w...
3,3,0,1467825863,Mon Apr 06 22:23:43 PDT 2009,NO_QUERY,BrookeAmanda,i really hate how people diss my bands! Trace...
4,4,0,1467826052,Mon Apr 06 22:23:45 PDT 2009,NO_QUERY,paulseverio,Why won't you show my location?! http://twit...
...,...,...,...,...,...,...,...
49995,49995,1,2193552981,Tue Jun 16 08:36:49 PDT 2009,NO_QUERY,JDenouden,@NLPride08 that's a perfectly reasonable time ...
49996,49996,1,2193554410,Tue Jun 16 08:36:56 PDT 2009,NO_QUERY,Arr0wsmith,@Jihav actually my other PSN ID is GUNSnSPEEDO...
49997,49997,1,2193576655,Tue Jun 16 08:38:46 PDT 2009,NO_QUERY,eratyptin,"@siahoney I am good thanks! How is #Eric, I..."
49998,49998,1,2193577315,Tue Jun 16 08:38:49 PDT 2009,NO_QUERY,jamie_ox,doing another piece of media homework is that ...


In [6]:
positivity_X = positivity["text"]
positivity_y = positivity["sentiment"]

In [7]:
positivity_texts = []
for text in positivity_X:
    positivity_texts.append(clean_text(text))
positivity_texts

['whole body feels itchy like fire',
 'loltrish hey long tears eyes see yes rains bit bit laughing loud im fine thanks hows',
 'hollis death scene hurt severely watch film wry directors cut',
 'really hate people diss bands trace clearly ugly',
 'wont show location http//twitpiccom/yes',
 'kpreyes remember bum leg strikes back tears eyes serious',
 'ozesteph shame hear stephan',
 'caitlinoconnor want tacos margarhitas telll gay say hello & lt',
 'missing best friends forever watching home away reminds & lt shout courts',
 'chriscantore congrats im totally jealous wish xm working',
 'marge_inovera tried tweetdeck hated passion hated im sure',
 'danadearmond',
 'trolley packed like sardines padre game remove car good call mts',
 'mamasvan laughing loud - nope complete camerafail',
 'b_barnett really see coming',
 'nicolerichie oh yes miss',
 'eazydoesit negative lost vote confidence',
 'sofii_noel thatã¯â¿â½s bad',
 'crap need dresses',
 'huuuge headache omg feel like crap',
 'dammit nee

In [8]:
# vectorize
positivity_cv = CountVectorizer(token_pattern=r'[^\s]+', max_features=7500)
positivity_X = positivity_cv.fit_transform(positivity_texts).toarray()
positivity_X.shape

(50000, 7500)

In [9]:
positivity_x_train, positivity_x_test, positivity_y_train, positivity_y_test = train_test_split(positivity_X, positivity_y, test_size = 0.25)

In [10]:
positivity_model = MultinomialNB()
positivity_model.fit(positivity_x_train, positivity_y_train)

In [11]:
positivity_y_pred = positivity_model.predict(positivity_x_test)
accuracy = accuracy_score(positivity_y_test, positivity_y_pred)
accuracy

0.75368

In [12]:
message = "This sucks"
prediction(message, positivity_model, positivity_cv)

array([[0.90922377, 0.09077623]])

In [23]:
positivity_model_filename = "positivity_model.pickle"
pickle.dump(positivity_model, open(positivity_model_filename, "wb"))
positivity_cv_filename = "positivity_cv.pickle"
pickle.dump(positivity_cv, open(positivity_cv_filename, "wb"))

# Emotions dataset

In [13]:
n = 416809 #number of records in file
s = 50000 #desired sample size
filename = "emotions.csv"
skip = sorted(random.sample(range(n),n-s))
labels = ["index", "text", "label"]
emotions = pd.read_csv(filename, skiprows=skip, encoding="ISO-8859-1", names=labels).drop(columns="index")
emotions

Unnamed: 0,text,label
0,i dont know i feel so lost,0
1,i take a shower i feel wonderful energetic and...,1
2,i feel assured that i m doing so much right an...,1
3,i havent done my eye make up i hate anyone see...,0
4,i have wished her the best and i truly feel as...,1
...,...,...
49996,i choose to voice my feelings people are offen...,3
49997,im used to feeling isolated and a million mile...,0
49998,im feeling inspired by their drama today,1
49999,i feel like telling these horny devils to find...,2


In [14]:
emotions_X = emotions["text"]
emotions_y = emotions["label"]

In [15]:
emotions["label"].value_counts()

1    16997
0    14554
3     6844
4     5687
2     4074
5     1845
Name: label, dtype: int64

In [16]:
emotions_text = []
for text in emotions_X:
    emotions_text.append(clean_text(text))
emotions_text

['dont know feel lost',
 'take shower feel wonderful energetic previous feelings life turn awesome feeling creating life like happiest life world',
 'feel assured much right alone',
 'havent done eye make hate anyone seeing feel nekkid sounds really vain think look horrendous without',
 'wished best truly feel though sincere',
 'feeling lousy wondering able keep vlcd part hcg cycle',
 'feel lot less hostile',
 'miss feeling special girl hed take date',
 'know im going disappoint ive decided volunteer year fear ill end feeling resentful',
 'feel lauterbach victimized',
 'put mask come work suppress emptiness feel inside pain loneliness bitter jaded woman really',
 'able lose three kilo feeling energetic even month old things seem track',
 'feel free express without inhibition',
 'realizing school soon',
 'feel stupid felt way feel even stupider even word',
 'feel better worse',
 'feel like ive pressured tell',
 'feel many artistic motivations hard keep',
 'took mg thinking took already 

In [17]:
emotions_cv = CountVectorizer(token_pattern=r'[^\s]+', max_features=7500)
emotions_X = emotions_cv.fit_transform(emotions_text).toarray()
emotions_X.shape

(50001, 7500)

In [18]:
emotions_x_train, emotions_x_test, emotions_y_train, emotions_y_test = train_test_split(emotions_X, emotions_y, test_size = 0.25)

In [19]:
emotions_model = MultinomialNB()
emotions_model.fit(emotions_x_train, emotions_y_train)

In [20]:
emotions_y_pred = emotions_model.predict(emotions_x_test)
accuracy = accuracy_score(emotions_y_test, emotions_y_pred)
accuracy

0.8618510519158468

In [23]:
# Six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5).
message = "This Sucks!"
prediction(message, emotions_model, emotions_cv)

array([[0.24956348, 0.22524736, 0.09779449, 0.26053409, 0.1554596 ,
        0.01140098]])

In [80]:
emotion_model_filename = "emotion_model.pickle"
pickle.dump(emotions_model, open(emotion_model_filename, "wb"))
emotion_cv_filename = "emotion_cv.pickle"
pickle.dump(emotions_cv, open(emotion_cv_filename, "wb"))