# Twitter Sentiment Analysis

## Importing necessary libraries

In [1]:
# loading the data
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt

# data preprocessing
from html import unescape
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# building the model
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# serialization
import gzip
import dill

## Loading the data

In [2]:
# load the data
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# on_bad_lines='warn' is its replacement for the old default combination
# (error_bad_lines=False, warn_bad_lines=True): rows with the wrong number of
# fields are skipped and a warning is emitted for each one.
df = pd.read_csv('data/Sentiment-Analysis-Dataset.zip', on_bad_lines='warn')

df.head()

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


## Creating functions for data preprocessing

In [None]:
# alternative: from nltk import TweetTokenizer
def preprocessor(doc):
    """Normalise a raw document before tokenisation.

    HTML character references (e.g. ``&amp;``, ``&lt;``) are decoded to the
    characters they stand for, then the whole string is lower-cased.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The decoded, lower-cased text.
    """
    decoded = unescape(doc)
    lowered = decoded.lower()
    return lowered

In [None]:
# Load the small English spaCy pipeline with the NER, parser and tagger
# components switched off.
# NOTE(review): how lemmas are produced with the tagger disabled depends on
# the installed spaCy version -- confirm `lemma_` is populated as expected.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser', 'tagger'],
)


def lemmatizer(doc):
    """Tokenise ``doc`` with the module-level ``nlp`` pipeline and return lemmas.

    Parameters
    ----------
    doc : str
        Text of a single document.

    Returns
    -------
    list
        The ``lemma_`` attribute of every token, in document order.
    """
    lemmas = []
    for token in nlp(doc):
        lemmas.append(token.lemma_)
    return lemmas

## Building the model

In [None]:
# Feature extraction: each document is run through `preprocessor`, tokenised
# and hashed into a fixed-width sparse vector. alternate_sign=False keeps all
# feature values non-negative, which MultinomialNB requires of its input.
vectorizer = HashingVectorizer(preprocessor=preprocessor,
                               alternate_sign=False,
                               # Optional experiments, currently disabled:
                               #tokenizer=lemmatizer,
                               #ngram_range=(1,2),
                               # spaCy ships STOP_WORDS as a set, but
                               # scikit-learn (>= 1.2) only accepts 'english',
                               # a list or None here. sorted() converts it to
                               # a list with a stable order.
                               stop_words=sorted(STOP_WORDS))

# Naive Bayes classifier chained after the vectorizer, so that raw text can be
# passed straight to pipe.fit / pipe.predict / pipe.score.
clf = MultinomialNB()
pipe = Pipeline([('vectorizer', vectorizer), ('classifier', clf)])

In [3]:
# Features are the text column, targets are the sentiment column.
X, y = df['SentimentText'], df['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit every step of the pipeline on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Pipeline.score delegates to the final estimator's score(); for a classifier
# that is mean accuracy on the given data, not the R^2 coefficient (which is
# what regressors report). The labels say "accuracy" accordingly.
print('Training accuracy: {}'.format(pipe.score(X_train, y_train)))
print('Testing accuracy: {}'.format(pipe.score(X_test, y_test)))

## Serialize the model to persistent storage

In [None]:
%%bash
# Long, human-readable listing of the serialized model file (fails if the
# file does not exist in the current working directory).
ls -alh sentiment_model.dill.gz

In [4]:
# Read the gzip-compressed, dill-serialized model back from disk.
# NOTE(review): no visible cell in this notebook writes
# sentiment_model.dill.gz -- confirm where the file is produced so that a
# fresh "Restart & Run All" can recreate it.
# SECURITY: dill.load, like pickle.load, can execute arbitrary code while
# deserializing; only load model files from a trusted source.
with gzip.open('sentiment_model.dill.gz', 'rb') as f:
    model = dill.load(f)

# score() on a classifier reports mean accuracy, not R^2, so label it as such.
print('Persistent model testing accuracy: {}'.format(model.score(X_test, y_test)))

Persistent model testing R^2: 0.7699090658583632
