In [6]:
# this program is a test implementation of twitter sentiment analysis

# importing important libraries

import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings
from nltk.stem.porter import * 
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

In [7]:
# Load the labelled training tweets and the unlabelled test tweets.

train = pd.read_csv('Dataset/train_tweets.csv')
test = pd.read_csv('Dataset/test_tweets.csv')

# Stack train on top of test so both go through the same preprocessing.
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported equivalent. sort=False keeps the columns in order of first
# appearance and avoids the "sort by default" FutureWarning on older pandas.
total_data = pd.concat([train, test], ignore_index=True, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [8]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [9]:
#function to remove @word pattern from the tweets as they do not add any value

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` deleted.

    Notes
    -----
    The substitution is done directly with ``pattern``. Re-using each matched
    string as a new regular expression would misinterpret regex
    metacharacters inside the match, and deleting a short match everywhere
    could damage a longer one (e.g. removing "@a" first would turn "@ab"
    into "b").
    """
    return re.sub(pattern, '', input_txt)

In [10]:
# Strip @username handles from every tweet with remove_pattern and store the
# result in a new 'tidy_tweet' column.

print('\n\nRemoving  Twitter Handles \n\n')
# Raw string: "\w" in a normal string literal is an invalid escape sequence
# that newer Python versions warn about; r"@[\w]*" is the same pattern.
total_data['tidy_tweet'] = np.vectorize(remove_pattern)(total_data['tweet'], r"@[\w]*")
total_data.head()



Removing  Twitter Handles 




Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


In [6]:
# Very short words (three characters or fewer) hardly hold any sentiment,
# so only words longer than three characters are kept.

print('\n\nRemoving Short Words\n\n')


def _drop_short_words(text):
    """Rejoin, with single spaces, the words of ``text`` longer than 3 characters."""
    kept = [word for word in text.split() if len(word) > 3]
    return ' '.join(kept)


total_data['tidy_tweet'] = total_data['tidy_tweet'].apply(_drop_short_words)
total_data.head()



Removing Short Words




Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit can't cause they don't off...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time urð±!!! ððð...
4,5,0.0,factsguide: society now #motivation,factsguide: society #motivation


In [11]:
# Split every cleaned tweet into a list of whitespace-separated tokens.

print('\n\nTweet Tokenization\n\n')
tokenized_tweet = total_data['tidy_tweet'].apply(lambda tweet: tweet.split())

# Disabled experiment, kept for reference. Removing punctuation such as
# period, comma and semi-colon looked like a good idea, but the original
# author reports that it decreased the accuracy.
# NOTE(review): str.replace treats '[.,;:]' as a literal substring, not as a
# character class, so this loop would not actually strip punctuation.
#
#   len(tokenized_tweet)
#   for i in range(len(tokenized_tweet)):
#       for j in range(len(tokenized_tweet[i])):
#           tokenized_tweet[i][j]=tokenized_tweet[i][j].replace('[.,;:]','')

tokenized_tweet.head()



Tweet Tokenization




0    [when, a, father, is, dysfunctional, and, is, ...
1    [thanks, for, #lyft, credit, i, can't, use, ca...
2                              [bihday, your, majesty]
3    [#model, i, love, u, take, with, u, all, the, ...
4             [factsguide:, society, now, #motivation]
Name: tidy_tweet, dtype: object

In [8]:
# Stemming maps inflected forms of a word (play, playing, played) onto a
# common stem so they are treated as the same token.

print('\n\nStemming\n\n')
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
tokenized_tweet.head()



Stemming




0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, can't, caus, they, don'...
2                              [bihday, your, majesti]
3    [#model, love, take, with, time, urð±!!!, ð...
4                       [factsguide:, societi, #motiv]
Name: tidy_tweet, dtype: object

In [12]:
# Stitch each token list back into one space-separated string.
#
# A single apply() builds a new Series instead of overwriting tokenized_tweet
# row by row. The token lists therefore stay intact, so re-running this cell
# gives the same result (the in-place loop would join the characters of the
# already-joined strings on a second run), and no label-based `series[i]`
# assignment is needed.
total_data['tidy_tweet'] = tokenized_tweet.apply(lambda tokens: ' '.join(tokens))
total_data.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause they...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


In [10]:
# List the column names of the combined total_data frame, one per line.

for column_name in total_data.columns.tolist():
    print(column_name)

id
label
tweet
tidy_tweet


In [13]:
# Fetch the 'punkt' tokenizer data; nltk.word_tokenize is called in a later cell.
# NOTE(review): newer NLTK releases may need 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/roshan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
# Convert the tidy_tweet column to numerical features with a hand-written
# bag-of-words implementation.
#
# Disabled scikit-learn variant, kept for reference:
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   bow_vectorizer = CountVectorizer(max_df=0.90,min_df=2,max_features=1000, stop_words='english')
#
#   # bag-of-words feature matrix
#   bow = bow_vectorizer.fit_transform(total_data['tidy_tweet'])
#
#   bow=bow.toarray()
#   print(bow.shape)
#
#   # the bow matrix is hard to visualise in its normal form,
#   # hence it is converted to a numpy array

# Step 1: tokenize every tweet once and keep the token lists for later reuse.
wordlist = [nltk.word_tokenize(tweet) for tweet in total_data['tidy_tweet'].values]

# Step 2: count how often each token occurs across all tweets.
word2count = {}
for tokens in wordlist:
    for token in tokens:
        word2count[token] = word2count.get(token, 0) + 1
            

In [16]:
import heapq

# Vocabulary: the (up to) 1000 most frequent words over all tweets.
freq_words = heapq.nlargest(1000, word2count, key=word2count.get)

# Binary bag-of-words matrix: one row per tweet, one column per vocabulary
# word, 1 if the word occurs in the tweet and 0 otherwise.
bow = []
for lword in wordlist:
    # A set gives constant-time membership tests; testing against the token
    # list would rescan the whole tweet for each of the vocabulary words.
    tokens_in_tweet = set(lword)
    bow.append([1 if word in tokens_in_tweet else 0 for word in freq_words])
bow = np.asarray(bow)

In [14]:
# Sanity check: one row per tweet and one column per vocabulary word.
print(bow.shape)

(49159, 1000)


In [17]:
# import classes for logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [18]:
# Split the combined feature matrix back into its train and test parts.
# total_data was built as train followed by test, so the first len(train)
# rows are the training tweets. Deriving the boundary from the data avoids
# a hard-coded row count that silently breaks if the dataset changes.

n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

In [19]:
# Hold out 30% of the labelled tweets as a validation set; the fixed
# random_state makes the split repeatable.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow,
    train['label'],
    test_size=0.3,
    random_state=42,
)

In [20]:
# Train a logistic-regression model, score it on the validation set and write
# the test-set predictions to a submission file.

# A tweet is labelled 1 when the predicted probability of class 1 is at
# least this value, otherwise 0.
threshold = 0.3

lreg = LogisticRegression()  # creating an object of Logistic regression class
lreg.fit(xtrain_bow, ytrain)  # fitting on training data

prediction = lreg.predict_proba(xvalid_bow)  # class probabilities on the validation set
prediction_int = prediction[:, 1] >= threshold
# np.int was only an alias for the builtin int and has been removed from
# NumPy; the builtin gives the same result on every NumPy version.
prediction_int = prediction_int.astype(int)

print(f1_score(yvalid, prediction_int))  # calculating f1 score

test_pred = lreg.predict_proba(test_bow)
test_pred_int = test_pred[:, 1] >= threshold
test_pred_int = test_pred_int.astype(int)
test['label'] = test_pred_int
submission = test[['id', 'label']]
submission.to_csv('sub_lreg_bow.csv', index=False)  # writing data to a CSV file



0.5650438946528332
