# Challenge 3 - Sentiment Analysis

data: https://www.kaggle.com/kazanova/sentiment140

example: https://www.kaggle.com/ngyptr/python-nltk-sentiment-analysis

https://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost

https://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

from nltk.sentiment import SentimentAnalyzer

NUM_FEATURES = 5000
TEST_SET_SIZE = 20000

## Load Data & Data Exploration

In [2]:
raw_data = pd.read_csv('Sentiment140.csv', encoding="ISO-8859-1")

# data = raw_data.head(10000).append(raw_data.tail(10000))

data = raw_data.sample(TEST_SET_SIZE)

In [3]:
data.target.value_counts()

0    10044
4     9956
Name: target, dtype: int64

In [4]:
data.isna().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

## Cleaning

* Generic text
    * Missing values
    * Special characters
    * Numbers
* Web text
    * URLs
    * HTML tags
* Twitter-specific
    * Twitter usernames with handler (optional)
    * Hashtags (optional)


### References

* Replace regex in substring: https://stackoverflow.com/questions/5658369/how-to-input-a-regex-in-string-replace

In [5]:
import re

def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", " ", tweet)#add space placeholder

def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

clean_up("@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")

'ironhack s  q website  is'

In [6]:
data.text = data.text.apply(lambda tweet: clean_up(tweet))

data.head()

Unnamed: 0,target,id,date,flag,user,text
747160,0,2283470553,Mon Jun 22 12:40:08 PDT 2009,NO_QUERY,Gibbo2910,timelord uke awww why not its cool
905846,4,1695220322,Mon May 04 05:33:32 PDT 2009,NO_QUERY,nadiaishere,finally im back online i miss my lappy so m...
131359,0,1835516465,Mon May 18 06:10:11 PDT 2009,NO_QUERY,eleni__,one more hour of being yrs old ur only once...
1444993,4,2062327669,Sat Jun 06 22:55:34 PDT 2009,NO_QUERY,natwebb,mmmmmm i had one chicken rissole with little ...
127208,0,1834675017,Mon May 18 03:49:21 PDT 2009,NO_QUERY,bartTC,wants that clevercss is supported native by ev...


## Tokenization

In [7]:
from nltk.tokenize import word_tokenize

# https://www.nltk.org/api/nltk.tokenize.html
# tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) 

data['text_processed'] = data.text.apply(lambda x: word_tokenize(x))

data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
747160,0,2283470553,Mon Jun 22 12:40:08 PDT 2009,NO_QUERY,Gibbo2910,timelord uke awww why not its cool,"[timelord, uke, awww, why, not, its, cool]"
905846,4,1695220322,Mon May 04 05:33:32 PDT 2009,NO_QUERY,nadiaishere,finally im back online i miss my lappy so m...,"[finally, im, back, online, i, miss, my, lappy..."
131359,0,1835516465,Mon May 18 06:10:11 PDT 2009,NO_QUERY,eleni__,one more hour of being yrs old ur only once...,"[one, more, hour, of, being, yrs, old, ur, onl..."
1444993,4,2062327669,Sat Jun 06 22:55:34 PDT 2009,NO_QUERY,natwebb,mmmmmm i had one chicken rissole with little ...,"[mmmmmm, i, had, one, chicken, rissole, with, ..."
127208,0,1834675017,Mon May 18 03:49:21 PDT 2009,NO_QUERY,bartTC,wants that clevercss is supported native by ev...,"[wants, that, clevercss, is, supported, native..."


## Stemming & Lemmatization

In [8]:
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Porter vs Snowball vs Lancaster
#https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(tweet):
    return list(map(lambda s: lemmatizer.lemmatize(stemmer.stem(s)), tweet))

stem_and_lemmatize(['students', 'loves', 'tokenization', 'new york', 'ourselves', 'tuberculosis', 'was', 'liked'])

['student',
 'love',
 'token',
 'new york',
 'ourselv',
 'tuberculosi',
 'wa',
 'like']

In [9]:
data.text_processed = data.text_processed.apply(lambda s: stem_and_lemmatize(s))

data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
747160,0,2283470553,Mon Jun 22 12:40:08 PDT 2009,NO_QUERY,Gibbo2910,timelord uke awww why not its cool,"[timelord, uke, awww, whi, not, it, cool]"
905846,4,1695220322,Mon May 04 05:33:32 PDT 2009,NO_QUERY,nadiaishere,finally im back online i miss my lappy so m...,"[final, im, back, onlin, i, miss, my, lappi, s..."
131359,0,1835516465,Mon May 18 06:10:11 PDT 2009,NO_QUERY,eleni__,one more hour of being yrs old ur only once...,"[one, more, hour, of, be, yr, old, ur, onli, o..."
1444993,4,2062327669,Sat Jun 06 22:55:34 PDT 2009,NO_QUERY,natwebb,mmmmmm i had one chicken rissole with little ...,"[mmmmmm, i, had, one, chicken, rissol, with, l..."
127208,0,1834675017,Mon May 18 03:49:21 PDT 2009,NO_QUERY,bartTC,wants that clevercss is supported native by ev...,"[want, that, clevercss, is, support, nativ, by..."


## Removing Stopwords

In [10]:
stopwords_list = stopwords.words("english")

# stopwords_list

In [11]:
def remove_stopwords(tweet):
    return [x for x in tweet if x not in stopwords_list]

data.text_processed = data.text_processed.apply(lambda s: remove_stopwords(s))

data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
747160,0,2283470553,Mon Jun 22 12:40:08 PDT 2009,NO_QUERY,Gibbo2910,timelord uke awww why not its cool,"[timelord, uke, awww, whi, cool]"
905846,4,1695220322,Mon May 04 05:33:32 PDT 2009,NO_QUERY,nadiaishere,finally im back online i miss my lappy so m...,"[final, im, back, onlin, miss, lappi, mani, th..."
131359,0,1835516465,Mon May 18 06:10:11 PDT 2009,NO_QUERY,eleni__,one more hour of being yrs old ur only once...,"[one, hour, yr, old, ur, onli, onc, went, quick]"
1444993,4,2062327669,Sat Jun 06 22:55:34 PDT 2009,NO_QUERY,natwebb,mmmmmm i had one chicken rissole with little ...,"[mmmmmm, one, chicken, rissol, littl, babi, po..."
127208,0,1834675017,Mon May 18 03:49:21 PDT 2009,NO_QUERY,bartTC,wants that clevercss is supported native by ev...,"[want, clevercss, support, nativ, everi, webbr..."


In [14]:
all_words = []

for index, value in data.text_processed.iteritems():
    if value not in all_words:
        all_words += value

top_features = [x[0] for x in nltk.FreqDist(all_words).most_common(NUM_FEATURES)]

# top_features = list(nltk.FreqDist(all_words).keys())[:NUM_FEATURES]

In [15]:
top_features

['go',
 'get',
 'wa',
 'day',
 'good',
 'work',
 'like',
 'love',
 'got',
 'quot',
 'today',
 'time',
 'u',
 'thank',
 'back',
 'miss',
 'want',
 'know',
 'lol',
 'one',
 'realli',
 'see',
 'feel',
 'think',
 'im',
 'amp',
 'still',
 'night',
 'hope',
 'well',
 'watch',
 'need',
 'new',
 'make',
 'home',
 'na',
 'oh',
 'ha',
 'twitter',
 'come',
 'last',
 'look',
 'much',
 'wish',
 'great',
 'tomorrow',
 'sad',
 'morn',
 'sleep',
 'wait',
 'tri',
 'haha',
 'veri',
 'fun',
 'whi',
 'follow',
 'sorri',
 'bad',
 'right',
 'happi',
 'friend',
 'would',
 'onli',
 'tonight',
 'week',
 'way',
 'say',
 'though',
 'nice',
 'hate',
 'gon',
 'bed',
 'yeah',
 'guy',
 'take',
 'better',
 'school',
 'could',
 'thing',
 'hour',
 'even',
 'start',
 'show',
 'tweet',
 'use',
 'awesom',
 'lt',
 'peopl',
 'weekend',
 'hey',
 'ok',
 'play',
 'x',
 'next',
 'yes',
 'final',
 'everyon',
 'plea',
 'let',
 'first',
 'long',
 'best',
 'tire',
 'help',
 'movi',
 'never',
 'cant',
 'year',
 'sure',
 'soon',
 'ca

## Building Features

https://pythonprogramming.net/words-as-features-nltk-tutorial/

https://pythonprogramming.net/naive-bayes-classifier-nltk-tutorial/

In [16]:
def build_features(words):
    features = {}
    for w in top_features:
        features[w] = (w in words)
    return features

In [17]:
featuresets = []

for index, row in data.iterrows():
    featuresets.append((build_features(row['text_processed']), row['target']==4))

In [18]:
featuresets[:10]

[({'go': False,
   'get': False,
   'wa': False,
   'day': False,
   'good': False,
   'work': False,
   'like': False,
   'love': False,
   'got': False,
   'quot': False,
   'today': False,
   'time': False,
   'u': False,
   'thank': False,
   'back': False,
   'miss': False,
   'want': False,
   'know': False,
   'lol': False,
   'one': False,
   'realli': False,
   'see': False,
   'feel': False,
   'think': False,
   'im': False,
   'amp': False,
   'still': False,
   'night': False,
   'hope': False,
   'well': False,
   'watch': False,
   'need': False,
   'new': False,
   'make': False,
   'home': False,
   'na': False,
   'oh': False,
   'ha': False,
   'twitter': False,
   'come': False,
   'last': False,
   'look': False,
   'much': False,
   'wish': False,
   'great': False,
   'tomorrow': False,
   'sad': False,
   'morn': False,
   'sleep': False,
   'wait': False,
   'tri': False,
   'haha': False,
   'veri': False,
   'fun': False,
   'whi': True,
   'follow': False,
 

In [19]:
data['target'].value_counts()

0    10044
4     9956
Name: target, dtype: int64

## Model Training

In [20]:
training, test = train_test_split(featuresets, test_size=0.2)

In [21]:
classifier = nltk.NaiveBayesClassifier.train(training)

In [22]:
classifier.show_most_informative_features()

Most Informative Features
                   upset = True            False : True   =     24.1 : 1.0
                  throat = True            False : True   =     20.8 : 1.0
                     ugh = True            False : True   =     19.5 : 1.0
                  cancel = True            False : True   =     16.8 : 1.0
                 terribl = True            False : True   =     14.8 : 1.0
                  welcom = True             True : False  =     14.4 : 1.0
                     www = True             True : False  =     14.3 : 1.0
                unfortun = True            False : True   =     13.6 : 1.0
                   stuck = True            False : True   =     13.4 : 1.0
                     sad = True            False : True   =     13.3 : 1.0


## Model Evaluation

In [23]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, test))*100.00)

Classifier accuracy percent: 72.39999999999999


## Prediction