# Challenge 3 - Sentiment Analysis

data: https://www.kaggle.com/kazanova/sentiment140

example: https://www.kaggle.com/ngyptr/python-nltk-sentiment-analysis

https://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost

https://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

from nltk.sentiment import SentimentAnalyzer

NUM_FEATURES = 5000
TEST_SET_SIZE = 20000

## Load Data & Data Exploration

In [2]:
raw_data = pd.read_csv('Sentiment140.csv', encoding="ISO-8859-1")

# data = raw_data.head(10000).append(raw_data.tail(10000))

data = raw_data.sample(TEST_SET_SIZE)

In [3]:
data.target.value_counts()

4    10008
0     9992
Name: target, dtype: int64

In [4]:
data.isna().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

## Cleaning

* Generic text
    * Missing values
    * Special characters
    * Numbers
* Web text
    * URLs
    * HTML tags
* Twitter-specific
    * Twitter usernames with handler (optional)
    * Hashtags (optional)


### References

* Replace regex in substring: https://stackoverflow.com/questions/5658369/how-to-input-a-regex-in-string-replace

In [5]:
import re

def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", " ", tweet)#add space placeholder

def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

clean_up("@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")

'ironhack s  q website  is'

In [6]:
data.text = data.text.apply(lambda tweet: clean_up(tweet))

data.head()

Unnamed: 0,target,id,date,flag,user,text
268623,0,1989444030,Mon Jun 01 01:33:48 PDT 2009,NO_QUERY,csummers1610,is hacked off that he s just found the perfect...
279503,0,1991806787,Mon Jun 01 07:41:52 PDT 2009,NO_QUERY,enviromaverick,overslept again today
730925,0,2263695952,Sun Jun 21 01:32:37 PDT 2009,NO_QUERY,ginnielizz,a garbage truck just tried to run me over
1493559,4,2069529931,Sun Jun 07 16:02:47 PDT 2009,NO_QUERY,PrincessofNY,if u can tell me what i m doin right no ill fo...
162603,0,1957583820,Fri May 29 00:57:36 PDT 2009,NO_QUERY,shriggles,can t believe he has to wait till october to s...


## Tokenization

In [7]:
from nltk.tokenize import word_tokenize

# https://www.nltk.org/api/nltk.tokenize.html
# tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) 

data['text_processed'] = data.text.apply(lambda x: word_tokenize(x))

data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
268623,0,1989444030,Mon Jun 01 01:33:48 PDT 2009,NO_QUERY,csummers1610,is hacked off that he s just found the perfect...,"[is, hacked, off, that, he, s, just, found, th..."
279503,0,1991806787,Mon Jun 01 07:41:52 PDT 2009,NO_QUERY,enviromaverick,overslept again today,"[overslept, again, today]"
730925,0,2263695952,Sun Jun 21 01:32:37 PDT 2009,NO_QUERY,ginnielizz,a garbage truck just tried to run me over,"[a, garbage, truck, just, tried, to, run, me, ..."
1493559,4,2069529931,Sun Jun 07 16:02:47 PDT 2009,NO_QUERY,PrincessofNY,if u can tell me what i m doin right no ill fo...,"[if, u, can, tell, me, what, i, m, doin, right..."
162603,0,1957583820,Fri May 29 00:57:36 PDT 2009,NO_QUERY,shriggles,can t believe he has to wait till october to s...,"[can, t, believe, he, has, to, wait, till, oct..."


## Stemming & Lemmatization

In [8]:
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Porter vs Snowball vs Lancaster
#https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(tweet):
    return list(map(lambda s: lemmatizer.lemmatize(stemmer.stem(s)), tweet))

stem_and_lemmatize(['students', 'loves', 'tokenization', 'new york', 'ourselves', 'tuberculosis', 'was', 'liked'])

['student',
 'love',
 'token',
 'new york',
 'ourselv',
 'tuberculosi',
 'wa',
 'like']

In [9]:
data.text_processed = data.text_processed.apply(lambda s: stem_and_lemmatize(s))

data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
268623,0,1989444030,Mon Jun 01 01:33:48 PDT 2009,NO_QUERY,csummers1610,is hacked off that he s just found the perfect...,"[is, hack, off, that, he, s, just, found, the,..."
279503,0,1991806787,Mon Jun 01 07:41:52 PDT 2009,NO_QUERY,enviromaverick,overslept again today,"[overslept, again, today]"
730925,0,2263695952,Sun Jun 21 01:32:37 PDT 2009,NO_QUERY,ginnielizz,a garbage truck just tried to run me over,"[a, garbag, truck, just, tri, to, run, me, over]"
1493559,4,2069529931,Sun Jun 07 16:02:47 PDT 2009,NO_QUERY,PrincessofNY,if u can tell me what i m doin right no ill fo...,"[if, u, can, tell, me, what, i, m, doin, right..."
162603,0,1957583820,Fri May 29 00:57:36 PDT 2009,NO_QUERY,shriggles,can t believe he has to wait till october to s...,"[can, t, believ, he, ha, to, wait, till, octob..."


## Removing Stopwords

In [10]:
stopwords_list = stopwords.words("english")

# stopwords_list

In [11]:
def remove_stopwords(tweet):
    return [x for x in tweet if x not in stopwords_list]

data.text_processed = data.text_processed.apply(lambda s: remove_stopwords(s))

data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
268623,0,1989444030,Mon Jun 01 01:33:48 PDT 2009,NO_QUERY,csummers1610,is hacked off that he s just found the perfect...,"[hack, found, perfect, job, relev, degre, dead..."
279503,0,1991806787,Mon Jun 01 07:41:52 PDT 2009,NO_QUERY,enviromaverick,overslept again today,"[overslept, today]"
730925,0,2263695952,Sun Jun 21 01:32:37 PDT 2009,NO_QUERY,ginnielizz,a garbage truck just tried to run me over,"[garbag, truck, tri, run]"
1493559,4,2069529931,Sun Jun 07 16:02:47 PDT 2009,NO_QUERY,PrincessofNY,if u can tell me what i m doin right no ill fo...,"[u, tell, doin, right, ill, follow, u]"
162603,0,1957583820,Fri May 29 00:57:36 PDT 2009,NO_QUERY,shriggles,can t believe he has to wait till october to s...,"[believ, ha, wait, till, octob, see, quot, quo..."


In [12]:
all_words = []

for index, value in data.text_processed.iteritems():
    if value not in all_words:
        all_words += value

top_features = list(nltk.FreqDist(all_words).keys())[:NUM_FEATURES]

In [13]:
top_features

['hack',
 'found',
 'perfect',
 'job',
 'relev',
 'degre',
 'deadlin',
 'wa',
 'week',
 'ago',
 'overslept',
 'today',
 'garbag',
 'truck',
 'tri',
 'run',
 'u',
 'tell',
 'doin',
 'right',
 'ill',
 'follow',
 'believ',
 'ha',
 'wait',
 'till',
 'octob',
 'see',
 'quot',
 'american',
 'get',
 'weekend',
 'wish',
 'new',
 'sim',
 'wasnt',
 'expens',
 'realli',
 'need',
 'play',
 'prep',
 'audit',
 'time',
 'go',
 'home',
 'dinner',
 'someon',
 'veri',
 'special',
 'bonddiiiieee',
 'gooooo',
 'stephda',
 'bonestodust',
 'yo',
 'dude',
 'feel',
 'lone',
 'hous',
 'leav',
 'group',
 'love',
 'fellowship',
 'laptop',
 'yahoooo',
 'handshak',
 'know',
 'cassandra',
 'physic',
 'comput',
 'hang',
 'girl',
 'guy',
 'jessicahaley',
 'trey',
 'cinespac',
 'rt',
 'bbgirl',
 'whi',
 'tomorrrrrrooww',
 'caus',
 'like',
 'dryforkpastor',
 'hope',
 'everyth',
 'okay',
 'oh',
 'got',
 'bye',
 'twitter',
 'tom',
 'behind',
 'mishab',
 'heh',
 'grab',
 'could',
 'probabl',
 'becaus',
 'fit',
 'manschutz

## Building Features

https://pythonprogramming.net/words-as-features-nltk-tutorial/

https://pythonprogramming.net/naive-bayes-classifier-nltk-tutorial/

In [14]:
def build_features(words):
    features = {}
    for w in top_features:
        features[w] = (w in words)
    return features

In [15]:
featuresets = []

for index, row in data.iterrows():
    featuresets.append((build_features(row['text_processed']), row['target']==4))

In [16]:
featuresets[:10]

[({'hack': True,
   'found': True,
   'perfect': True,
   'job': True,
   'relev': True,
   'degre': True,
   'deadlin': True,
   'wa': True,
   'week': True,
   'ago': True,
   'overslept': False,
   'today': False,
   'garbag': False,
   'truck': False,
   'tri': False,
   'run': False,
   'u': False,
   'tell': False,
   'doin': False,
   'right': False,
   'ill': False,
   'follow': False,
   'believ': False,
   'ha': False,
   'wait': False,
   'till': False,
   'octob': False,
   'see': False,
   'quot': False,
   'american': False,
   'get': False,
   'weekend': False,
   'wish': False,
   'new': False,
   'sim': False,
   'wasnt': False,
   'expens': False,
   'realli': False,
   'need': False,
   'play': False,
   'prep': False,
   'audit': False,
   'time': False,
   'go': False,
   'home': False,
   'dinner': False,
   'someon': False,
   'veri': False,
   'special': False,
   'bonddiiiieee': False,
   'gooooo': False,
   'stephda': False,
   'bonestodust': False,
   'yo': F

In [17]:
data['target'].value_counts()

4    10008
0     9992
Name: target, dtype: int64

## Model Training

In [19]:
training, test = train_test_split(featuresets, test_size=0.2)

In [20]:
classifier = nltk.NaiveBayesClassifier.train(training)

In [21]:
classifier.show_most_informative_features()

Most Informative Features
            followfriday = True             True : False  =     16.7 : 1.0
                 horribl = True            False : True   =     16.6 : 1.0
                     sad = True            False : True   =     15.2 : 1.0
                   fever = True            False : True   =     13.9 : 1.0
                  welcom = True             True : False  =     13.4 : 1.0
                  broken = True            False : True   =     12.8 : 1.0
                    kill = True            False : True   =     11.6 : 1.0
                 headach = True            False : True   =     10.8 : 1.0
               recommend = True             True : False  =     10.2 : 1.0
                   worst = True            False : True   =     10.0 : 1.0


## Model Evaluation

In [22]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, test))*100.00)

Classifier accuracy percent: 72.85000000000001


## Prediction