# Preprocessing

In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw tweet CSV. The file has no header row (header=None), so pandas
# labels the six columns 0-5.
# The location can be overridden through the SENTIMENT140_CSV environment
# variable; the fallback is the original local path, so existing setups keep
# working unchanged.
DATA_PATH = os.environ.get(
    'SENTIMENT140_CSV',
    r'C:\Users\Yasmin\AppData\Roaming\nltk_data\corpora\twitter_data\training.1600000.processed.noemoticon.csv',
)
df = pd.read_csv(DATA_PATH, encoding='latin', header=None)
print(df.shape)
df.head()

(1600000, 6)


Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Replace the integer column labels with descriptive names, in file order.
COLUMN_NAMES = ['Sentiment', 'Id', 'Date', 'Query', 'User_name', 'Text']
df.columns = COLUMN_NAMES

In [4]:
# Keep only the label and the tweet text; Id, Date, Query and User_name are
# not used further in this notebook.
# Selecting the wanted columns (instead of dropping the unwanted ones) makes
# the cell safe to re-run: a second drop() would raise KeyError because the
# columns are already gone. .copy() gives an independent frame so the later
# in-place column assignments do not trigger SettingWithCopyWarning.
df = df[['Sentiment', 'Text']].copy()
df.head()

Unnamed: 0,Sentiment,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Normalise case: cast every entry to str, then lower-case it with the
# vectorised string accessor.
df['Text'] = df['Text'].astype('str').str.lower()

In [6]:
# Preview the lower-cased tweet text.
df['Text']

0          @switchfoot http://twitpic.com/2y1zl - awww, t...
1          is upset that he can't update his facebook by ...
2          @kenichan i dived many times for the ball. man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1599995    just woke up. having no school is the best fee...
1599996    thewdb.com - very cool to hear old walt interv...
1599997    are you ready for your mojo makeover? ask me f...
1599998    happy 38th birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @thenspcc @sparkscharity...
Name: Text, Length: 1600000, dtype: object

In [7]:
# Strip punctuation: remove every character that is neither a word character
# (\w) nor whitespace (\s).
# regex=True is required: since pandas 2.0, str.replace treats the pattern as
# a literal string by default, which would leave the text untouched.
# The raw-string prefix stops \w and \s from being parsed as (invalid) string
# escape sequences.
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)

In [8]:
# Preview the text after punctuation removal.
df['Text']

0          switchfoot httptwitpiccom2y1zl  awww thats a b...
1          is upset that he cant update his facebook by t...
2          kenichan i dived many times for the ball manag...
3            my whole body feels itchy and like its on fire 
4          nationwideclass no its not behaving at all im ...
                                 ...                        
1599995    just woke up having no school is the best feel...
1599996    thewdbcom  very cool to hear old walt intervie...
1599997    are you ready for your mojo makeover ask me fo...
1599998    happy 38th birthday to my boo of alll time tup...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: Text, Length: 1600000, dtype: object

## Tokenization

In [9]:
# Turn each tweet string into a list of tokens with NLTK's word tokenizer.
df['Text'] = df['Text'].map(nltk.word_tokenize)

In [10]:
# Preview the token lists.
df['Text']

0          [switchfoot, httptwitpiccom2y1zl, awww, thats,...
1          [is, upset, that, he, cant, update, his, faceb...
2          [kenichan, i, dived, many, times, for, the, ba...
3          [my, whole, body, feels, itchy, and, like, its...
4          [nationwideclass, no, its, not, behaving, at, ...
                                 ...                        
1599995    [just, woke, up, having, no, school, is, the, ...
1599996    [thewdbcom, very, cool, to, hear, old, walt, i...
1599997    [are, you, ready, for, your, mojo, makeover, a...
1599998    [happy, 38th, birthday, to, my, boo, of, alll,...
1599999    [happy, charitytuesday, thenspcc, sparkscharit...
Name: Text, Length: 1600000, dtype: object

## Stemming

In [11]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of every token in ``tokens``, preserving order."""
    return [stemmer.stem(token) for token in tokens]


# Replace each token list with the list of its stems.
df['Text'] = df['Text'].apply(stem_tokens)

In [12]:
# Preview the stemmed token lists.
df['Text']

0          [switchfoot, httptwitpiccom2y1zl, awww, that, ...
1          [is, upset, that, he, cant, updat, hi, faceboo...
2          [kenichan, i, dive, mani, time, for, the, ball...
3          [my, whole, bodi, feel, itchi, and, like, it, ...
4          [nationwideclass, no, it, not, behav, at, all,...
                                 ...                        
1599995    [just, woke, up, have, no, school, is, the, be...
1599996    [thewdbcom, veri, cool, to, hear, old, walt, i...
1599997    [are, you, readi, for, your, mojo, makeov, ask...
1599998    [happi, 38th, birthday, to, my, boo, of, alll,...
1599999    [happi, charitytuesday, thenspcc, sparkschar, ...
Name: Text, Length: 1600000, dtype: object

## Feature Extraction

In [13]:
# Glue each list of stems back into one space-separated string, which is the
# input form passed to the vectorizers below.
df['Text'] = df['Text'].map(' '.join)

In [14]:
# Preview the re-joined, stemmed text.
df['Text']

0          switchfoot httptwitpiccom2y1zl awww that a bum...
1          is upset that he cant updat hi facebook by tex...
2          kenichan i dive mani time for the ball manag t...
3               my whole bodi feel itchi and like it on fire
4          nationwideclass no it not behav at all im mad ...
                                 ...                        
1599995    just woke up have no school is the best feel ever
1599996    thewdbcom veri cool to hear old walt interview...
1599997    are you readi for your mojo makeov ask me for ...
1599998    happi 38th birthday to my boo of alll time tup...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: Text, Length: 1600000, dtype: object

### Binary Encoding

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: binary=True asks the vectorizer for presence/absence
# indicators rather than raw term counts.
# Note: `vec` and `features` are reassigned by the later feature-extraction
# cells.
vec = CountVectorizer(binary=True)
corpus = df['Text']
features = vec.fit_transform(corpus)

In [16]:
import pandas as pd

# Wrap the sparse feature matrix in a sparse DataFrame for inspection.
# Columns are labelled with the alphabetically sorted vocabulary terms
# (assumes the vectorizer's column indices follow that order -- TODO confirm).
term_labels = sorted(vec.vocabulary_)
pd.DataFrame.sparse.from_spmatrix(features, columns=term_labels)

Unnamed: 0,00,000,0000,00000,0000000000,00000001,0000001,0000014,000009260gb,0000abcd,...,úøùøû,úù,úùø,úùøªù,úùù,úû,ûø,ûøùøø³øªù,ûúøøù,ûúù
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting with the vectorizer's default settings.
# Note: this reassigns `vec` and `features` from the previous section.
vec = TfidfVectorizer()
corpus = df['Text']
features = vec.fit_transform(corpus)

In [25]:
# Wrap the sparse TF-IDF matrix in a sparse DataFrame for inspection.
# Columns are labelled with the alphabetically sorted vocabulary terms
# (assumes the vectorizer's column indices follow that order -- TODO confirm).
term_labels = sorted(vec.vocabulary_)
pd.DataFrame.sparse.from_spmatrix(features, columns=term_labels)

Unnamed: 0,00,000,0000,00000,0000000000,00000001,0000001,0000014,000009260gb,0000abcd,...,úøùøû,úù,úùø,úùøªù,úùù,úû,ûø,ûøùøø³øªù,ûúøøù,ûúù
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Counting

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

# Plain term counts (binary=False). This is the last assignment to `features`
# before the train/test split, so these are the features the classifier uses.
vec = CountVectorizer(binary=False)
corpus = df['Text']
features = vec.fit_transform(corpus)

In [27]:
# Wrap the sparse count matrix in a sparse DataFrame for inspection.
# Columns are labelled with the alphabetically sorted vocabulary terms
# (assumes the vectorizer's column indices follow that order -- TODO confirm).
term_labels = sorted(vec.vocabulary_)
pd.DataFrame.sparse.from_spmatrix(features, columns=term_labels)

Unnamed: 0,00,000,0000,00000,0000000000,00000001,0000001,0000014,000009260gb,0000abcd,...,úøùøû,úù,úùø,úùøªù,úùù,úû,ûø,ûøùøø³øªù,ûúøøù,ûúù
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Naive Bayes classifier

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the shuffle -- and therefore every metric
# reported below -- reproducible across kernel restarts; without it each run
# draws a different split.
# NOTE(review): `features` was fitted on the full corpus before this split, so
# the vocabulary already contains test-set terms -- consider fitting the
# vectorizer on the training rows only.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    features, df['Sentiment'], test_size=0.2, random_state=SPLIT_SEED
)

In [29]:
# Report the (rows, features) shape of the training and test matrices,
# one per line.
print(x_train.shape, x_test.shape, sep='\n')

(1280000, 791743)
(320000, 791743)


In [30]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters.
model = MultinomialNB()
# Fit on the training split; the cell displays the fitted estimator.
model.fit(x_train, y_train)

MultinomialNB()

In [31]:
# Predict a sentiment label for every row of the held-out test matrix.
predicted = model.predict(x_test)
print(predicted)

[0 0 0 ... 0 0 0]


## Accuracy

In [32]:
# Accuracy = fraction of test rows whose predicted label equals the true one.
is_correct = predicted == y_test
print(is_correct.mean())

0.7793375


## Another way to compute accuracy

In [33]:
from sklearn.metrics import accuracy_score

# Same accuracy figure, computed with scikit-learn's helper.
accuracy_score(y_true=y_test, y_pred=predicted)

0.7793375

## Confusion matrix

In [34]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels (first argument) against the
# predictions (second argument).
conf_matrix = confusion_matrix(y_test, predicted)
print(conf_matrix)

[[131060  29003]
 [ 41609 118328]]
