# Importing Basic Libraries

In [1]:
#importing libraries

import numpy as np
import pandas as pd

# Loading the dataset

In [2]:
#loading dataset into data frame

data = pd.read_csv("data/data.csv", encoding = 'latin-1', header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
data.tail()

Unnamed: 0,0,1,2,3,4,5
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [5]:
data.shape

(1600000, 6)

# Data Preprocessing

In [6]:
#shuffling the data frame so that model remains general and overfit less

from sklearn.utils import shuffle

data = shuffle(data)

In [7]:
#give column names to data

data.columns = ["sentiment", "id", "date_time", "query", "username", "tweet"]

In [8]:
#removing id, date_time, query and username from dataframe

data =data.drop(["id","date_time","query","username"],axis =1)

In [9]:
data.head()

Unnamed: 0,sentiment,tweet
46820,0,rejected.... again 2 supermarkets within a mo...
1385369,4,we did it. The show opened to an audience. The...
201022,0,@sweetflor really!? youre so lucky that theyre...
987956,4,@scotdoc You not working today?
1501045,4,"Homeeeee, just chillin! call me."


In [10]:
data.tail()

Unnamed: 0,sentiment,tweet
1340103,4,Just arrived at taekwondo tweet ya later x
133520,0,"@ComcastMelissa That it is. Permits, digging, ..."
1566012,4,@xxBeckeh so is alone with the sun rise.
561815,0,Exhausted
1482026,4,@TheBeautyChick3 makes you wonder of the filli...


In [11]:
# count the number of sentiments in the data set
# 4 stands for positive tweet
# 0 stands for negative tweet
 
data.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [12]:
# add a new cloumn tweet_len to store the length of each tweets

data["tweet_len"] = [len(t) for t in data.tweet]

In [13]:
# check for tweets greater than 140 characters

data[data.tweet_len>140].head(4)

Unnamed: 0,sentiment,tweet,tweet_len
519840,0,Just watching my videos from the Jonas Brother...,144
1308653,4,@roshnimo i think half-marathons fall into the...,145
1061801,4,@quickenonline Heh... a &quot;reconcile this t...,148
1097254,4,Reading &quot;Klokken i Makedonia&quot; by Knu...,144


In [14]:
import re

pattern1 = r'@[A-Za-z0-9_]+' # remove @ mention from tweets
pattern2 = r'https?://[^ ]+' #remove urls from tweets
combined_pattern = r'|'.join((pattern1,pattern2)) # joining pattern1 and pattern2
www_pattern = r'www.[^ ]+' #remove urls from tweets

In [15]:
#converting words like isn't to is not

negations_dic = {"isn't":"is not", "aren't":"are not","wasn't":"was not","weren't":"were not","haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not", "wouldn't":"would not","don't":"do not","doesn't":"does not","didn't":"did not","can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not","mustn't":"must not" }

In [16]:
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys())+r')\b')

In [17]:
#using scraping tools to extract required texts

from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

In [18]:
# defining a function to clean the tweets

def clean_tweets(tweet):
    soup = BeautifulSoup(tweet,'lxml')    # create and call an object of BeautifulSoup class
    souped = soup.get_text()  # get only texts from tweets
    # use exception handling as the data frame is not properly structured
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd","?") #remove utf-8-sig encoding
    except:
        bom_removed = souped
    stripped = re.sub(combined_pattern,' ',bom_removed) #calling combined_pattern
    stripped = re.sub(www_pattern,' ',stripped) #remove URLs
    lower_case = stripped.lower() #converting all tweets to lower case so that the compiler don't treat the same word in different cases as different
    neg_handled = neg_pattern.sub(lambda x:negations_dic[x.group()],lower_case) # converting words like isn't to is not
    letters_only = re.sub("[^a-zA-Z]"," ",neg_handled)
    words = [x for x in tok.tokenize(letters_only) if len(x)>1] #tokenize words whose length is greater than 1
    return (" ".join(words)).strip() # join the words and remove any whitespaces at the  beginning and end of the string
    

In [19]:
#clean_tweet_texts = [ ] #initialise an empty list 
#for i in range (0,1600000):
 #   clean_tweet_texts.append(clean_tweets(data['tweet'][i]))

In [20]:
nums = [0,400000,800000,1200000,1600000] #for batch processing
clean_tweet_texts = [ ] #initialise an empty list
for i in range (nums[0],nums[4]):
    clean_tweet_texts.append(clean_tweets(data['tweet'][i]))

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [21]:
#print clean_tweet_texts list

clean_tweet_texts

['awww that bummer you shoulda got david carr of third day to do it',
 'is upset that he can not update his facebook by texting it and might cry as result school today also blah',
 'dived many times for the ball managed to save the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 'no it not behaving at all mad why am here because can not see you all over there',
 'not the whole crew',
 'need hug',
 'hey long time no see yes rains bit only bit lol fine thanks how you',
 'nope they did not have it',
 'que me muera',
 'spring break in plain city it snowing',
 'just re pierced my ears',
 'could not bear to watch it and thought the ua loss was embarrassing',
 'it it counts idk why did either you never talk to me anymore',
 'would ve been the first but did not have gun not really though zac snyder just doucheclown',
 'wish got to watch it with you miss you and how was the premiere',
 'hollis death scene will hurt me severely to watch on film wry is directors cut no

## Tokenization

In [22]:
#tokenize words in clean_tweet_texts

from nltk import word_tokenize

word_tokens = [ ] #initialise an empty list 
for word in clean_tweet_texts:
    word_tokens.append(word_tokenize(word))

In [23]:
#print word_tokens list

word_tokens

[['awww',
  'that',
  'bummer',
  'you',
  'shoulda',
  'got',
  'david',
  'carr',
  'of',
  'third',
  'day',
  'to',
  'do',
  'it'],
 ['is',
  'upset',
  'that',
  'he',
  'can',
  'not',
  'update',
  'his',
  'facebook',
  'by',
  'texting',
  'it',
  'and',
  'might',
  'cry',
  'as',
  'result',
  'school',
  'today',
  'also',
  'blah'],
 ['dived',
  'many',
  'times',
  'for',
  'the',
  'ball',
  'managed',
  'to',
  'save',
  'the',
  'rest',
  'go',
  'out',
  'of',
  'bounds'],
 ['my', 'whole', 'body', 'feels', 'itchy', 'and', 'like', 'its', 'on', 'fire'],
 ['no',
  'it',
  'not',
  'behaving',
  'at',
  'all',
  'mad',
  'why',
  'am',
  'here',
  'because',
  'can',
  'not',
  'see',
  'you',
  'all',
  'over',
  'there'],
 ['not', 'the', 'whole', 'crew'],
 ['need', 'hug'],
 ['hey',
  'long',
  'time',
  'no',
  'see',
  'yes',
  'rains',
  'bit',
  'only',
  'bit',
  'lol',
  'fine',
  'thanks',
  'how',
  'you'],
 ['nope', 'they', 'did', 'not', 'have', 'it'],
 ['que',

## Stop Words Removal

In [24]:
#stop words removal

from nltk.corpus import stopwords

stop = set(stopwords.words('english'))

clean_data = [ ]

for m in word_tokens:
    a = [w for w in m if not w in stop]
    clean_data.append(a)

In [25]:
# print clean_data

clean_data

[['awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day'],
 ['upset',
  'update',
  'facebook',
  'texting',
  'might',
  'cry',
  'result',
  'school',
  'today',
  'also',
  'blah'],
 ['dived', 'many', 'times', 'ball', 'managed', 'save', 'rest', 'go', 'bounds'],
 ['whole', 'body', 'feels', 'itchy', 'like', 'fire'],
 ['behaving', 'mad', 'see'],
 ['whole', 'crew'],
 ['need', 'hug'],
 ['hey',
  'long',
  'time',
  'see',
  'yes',
  'rains',
  'bit',
  'bit',
  'lol',
  'fine',
  'thanks'],
 ['nope'],
 ['que', 'muera'],
 ['spring', 'break', 'plain', 'city', 'snowing'],
 ['pierced', 'ears'],
 ['could', 'bear', 'watch', 'thought', 'ua', 'loss', 'embarrassing'],
 ['counts', 'idk', 'either', 'never', 'talk', 'anymore'],
 ['would', 'first', 'gun', 'really', 'though', 'zac', 'snyder', 'doucheclown'],
 ['wish', 'got', 'watch', 'miss', 'premiere'],
 ['hollis',
  'death',
  'scene',
  'hurt',
  'severely',
  'watch',
  'film',
  'wry',
  'directors',
  'cut'],
 ['file', 'taxes'],
 ['

## Lemmatization

In [26]:
#lemmatization

from nltk.stem import WordNetLemmatizer

In [27]:
lemmatizer = WordNetLemmatizer() #create an object of WordNetLemmatizer class
data1 = [ ]
for l in word_tokens:
    b = [lemmatizer.lemmatize(q) for q in l]
    data1.append(b)

In [28]:
#print data1
data1

[['awww',
  'that',
  'bummer',
  'you',
  'shoulda',
  'got',
  'david',
  'carr',
  'of',
  'third',
  'day',
  'to',
  'do',
  'it'],
 ['is',
  'upset',
  'that',
  'he',
  'can',
  'not',
  'update',
  'his',
  'facebook',
  'by',
  'texting',
  'it',
  'and',
  'might',
  'cry',
  'a',
  'result',
  'school',
  'today',
  'also',
  'blah'],
 ['dived',
  'many',
  'time',
  'for',
  'the',
  'ball',
  'managed',
  'to',
  'save',
  'the',
  'rest',
  'go',
  'out',
  'of',
  'bound'],
 ['my', 'whole', 'body', 'feel', 'itchy', 'and', 'like', 'it', 'on', 'fire'],
 ['no',
  'it',
  'not',
  'behaving',
  'at',
  'all',
  'mad',
  'why',
  'am',
  'here',
  'because',
  'can',
  'not',
  'see',
  'you',
  'all',
  'over',
  'there'],
 ['not', 'the', 'whole', 'crew'],
 ['need', 'hug'],
 ['hey',
  'long',
  'time',
  'no',
  'see',
  'yes',
  'rain',
  'bit',
  'only',
  'bit',
  'lol',
  'fine',
  'thanks',
  'how',
  'you'],
 ['nope', 'they', 'did', 'not', 'have', 'it'],
 ['que', 'me',

In [29]:
#joining words after tokenization

clean_data1 = [ ]
for c in data1:
    a = " ".join(c)
    clean_data1.append(a)

In [30]:
#print clean_data1

clean_data1

['awww that bummer you shoulda got david carr of third day to do it',
 'is upset that he can not update his facebook by texting it and might cry a result school today also blah',
 'dived many time for the ball managed to save the rest go out of bound',
 'my whole body feel itchy and like it on fire',
 'no it not behaving at all mad why am here because can not see you all over there',
 'not the whole crew',
 'need hug',
 'hey long time no see yes rain bit only bit lol fine thanks how you',
 'nope they did not have it',
 'que me muera',
 'spring break in plain city it snowing',
 'just re pierced my ear',
 'could not bear to watch it and thought the ua loss wa embarrassing',
 'it it count idk why did either you never talk to me anymore',
 'would ve been the first but did not have gun not really though zac snyder just doucheclown',
 'wish got to watch it with you miss you and how wa the premiere',
 'hollis death scene will hurt me severely to watch on film wry is director cut not out now',

In [31]:
#constructing a dataframe of clean_data

clean_df = pd.DataFrame(clean_data1,columns=["tweet"])
clean_df["target"] = data.sentiment

In [32]:
clean_df.head()

Unnamed: 0,tweet,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many time for the ball managed to save t...,0
3,my whole body feel itchy and like it on fire,0
4,no it not behaving at all mad why am here beca...,0


In [33]:
clean_df.tail()

Unnamed: 0,tweet,target
1599995,just woke up having no school is the best feel...,4
1599996,thewdb com very cool to hear old walt interview,4
1599997,are you ready for your mojo makeover ask me fo...,4
1599998,happy th birthday to my boo of alll time tupac...,4
1599999,happy charitytuesday,4


In [34]:
#create a new column in the data frame to store the length of each tweet

clean_df['tweet_length']=[len(t) for t in clean_df.tweet]

In [35]:
clean_df

Unnamed: 0,tweet,target,tweet_length
0,awww that bummer you shoulda got david carr of...,0,65
1,is upset that he can not update his facebook b...,0,104
2,dived many time for the ball managed to save t...,0,69
3,my whole body feel itchy and like it on fire,0,44
4,no it not behaving at all mad why am here beca...,0,80
...,...,...,...
1599995,just woke up having no school is the best feel...,4,54
1599996,thewdb com very cool to hear old walt interview,4,47
1599997,are you ready for your mojo makeover ask me fo...,4,54
1599998,happy th birthday to my boo of alll time tupac...,4,59


In [36]:
clean_df.head()

Unnamed: 0,tweet,target,tweet_length
0,awww that bummer you shoulda got david carr of...,0,65
1,is upset that he can not update his facebook b...,0,104
2,dived many time for the ball managed to save t...,0,69
3,my whole body feel itchy and like it on fire,0,44
4,no it not behaving at all mad why am here beca...,0,80


In [37]:
clean_df[clean_df.tweet_length>140]

Unnamed: 0,tweet,target,tweet_length


In [38]:
#dividing features and targets in variables

X = clean_df.tweet
Y = clean_df.target
print(X.shape)
print(Y.shape)

(1600000,)
(1600000,)


# Train Test Split 

In [39]:
#train test split

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3,random_state=1)

# TF-IDF Vectorizer

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(analyzer = "word", ngram_range=(1,3)) #create Tfidf object and save it as vect

In [41]:
vect.fit(X_train) #fitting training tweets into vect
X_train_dtm = vect.transform(X_train) #transforming our training data

In [42]:
X_test_dtm = vect.transform(X_test) #transforming our test data

In [43]:
X_train_dtm

<1120000x8989489 sparse matrix of type '<class 'numpy.float64'>'
	with 35241599 stored elements in Compressed Sparse Row format>

In [44]:
import pickle
vect_file="vect.sav"
pickle.dump(vect,open(vect_file,'wb'))

# Naive Bayes

In [43]:
#naive bayes
#model building

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB(alpha=10)

In [44]:
nb.fit(X_train_dtm,Y_train)

MultinomialNB(alpha=10, class_prior=None, fit_prior=True)

In [45]:
from sklearn.model_selection import cross_val_score

In [46]:
#training accuracy
accuracies = cross_val_score(estimator = nb, X = X_train_dtm, y = Y_train, cv = 10) #K-fold cross validation with K=10


In [47]:
print("Mean accuracy of train data using Naive Bayes:",accuracies.mean()*100,"%")

Mean accuracy of train data using Naive Bayes: 79.43401800171665 %


In [48]:
#test set accuracy
y_pred_nb= nb.predict(X_test_dtm)

In [49]:
from sklearn import metrics
print("Test data accuracy using Naive Bayes: ",metrics.accuracy_score(Y_test,y_pred_nb)*100,"%")

Test data accuracy using Naive Bayes:  79.56354166666667 %


In [50]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,y_pred_nb)

array([[207920,  32076],
       [ 66019, 173985]], dtype=int64)

# Logistic Regression

In [51]:
#logistic regression
#model building
from sklearn.linear_model import LogisticRegression
logisticRegression = LogisticRegression(C=1.1) #C is the cost parameter

In [52]:
#model fitting
logisticRegression.fit(X_train_dtm,Y_train)



LogisticRegression(C=1.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = logisticRegression, X = X_train_dtm, y = Y_train, cv = 10) #K-fold cross validation with K=10
print("Mean accuracy of train data using Logistic Regression:",accuracies.mean()*100,"%")



Mean accuracy of train data using Logistic Regression: 82.05214290940872 %


In [54]:
#test set accuracy
y_pred_lr = logisticRegression.predict(X_test_dtm)

In [55]:
from sklearn import metrics
print("Test data accuracy using Logistic Regression: ",metrics.accuracy_score(Y_test,y_pred_lr)*100,"%")

Test data accuracy using Logistic Regression:  82.341875 %


In [56]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,y_pred_lr)

array([[200533,  39463],
       [ 45296, 194708]], dtype=int64)

# Support Vector Machine

In [57]:
#support vector machine
#model building
from sklearn.svm import LinearSVC
svm = LinearSVC(random_state=0)

In [58]:
#model fitting
svm.fit(X_train_dtm,Y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

In [59]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = svm, X = X_train_dtm, y = Y_train, cv = 10) #K-fold cross validation with K=10
print("Mean accuracy of train data using Support Vector Machine:",accuracies.mean()*100,"%")



Mean accuracy of train data using Support Vector Machine: 82.24821446713855 %


In [60]:
#test set accuracy
y_pred_svm = svm.predict(X_test_dtm)

In [61]:
from sklearn import metrics
print("Test data accuracy using Support Vector: ",metrics.accuracy_score(Y_test,y_pred_svm)*100,"%")

Test data accuracy using Support Vector:  82.44687499999999 %


In [62]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,y_pred_svm)

array([[200554,  39442],
       [ 44813, 195191]], dtype=int64)

# Pickle files

In [65]:
import pickle

### Naive Bayes

In [66]:
naive_bayes_file = 'nb_model.sav'

In [67]:
pickle.dump(nb,open(naive_bayes_file,'wb'))

### Logistic Regression

In [68]:
logistic_regression_file = 'lr_model.sav'

In [69]:
pickle.dump(logisticRegression,open(logistic_regression_file,'wb'))

### Support Vector Machine

In [70]:
support_vector_machine_file = 'svm_model.sav'

In [71]:
pickle.dump(svm,open(support_vector_machine_file,'wb'))

In [74]:
X_test[0]

'awww that bummer you shoulda got david carr of third day to do it'

In [91]:
logisticRegression.predict_proba(vect.transform(["aww that bummer you"]))

array([[0.9703955, 0.0296045]])

In [89]:
vect.transform(["hi i am harsh"])

<1x8989489 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [None]:
#  s = 'I am very sarcastic'
# s = clean_tweet(s)
# logisticRegression.predict_proba(vect.transform(s))

### testing