### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
%matplotlib inline

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/future warnings
# raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV of tweets to be scored into `dataset`.
# NOTE(review): 'unicode_escape' interprets backslash escape sequences in the
# file's bytes — confirm this matches how the CSV was actually written.
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Sentiment Analysis/test_data_100.csv', encoding='unicode_escape')

In [None]:
# Preview up to the first 20 rows of the loaded data.
dataset.head(20)

Unnamed: 0,id,tweet
0,7922.0,currently shitting my fucking pants. #apple #i...
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t..."
2,7924.0,My ipod is officially dead. I lost all my pict...
3,7925.0,Been fighting iTunes all night! I only want th...
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...
5,7927.0,This new apple software update is really doing...
6,7928.0,BABY #iPhone #iphone6s #gold #new #apple #appl...
7,7929.0,I'm confused...why did I have to take the time...
8,7930.0,Fruit just tastes better when you pick it your...
9,7931.0,Con mi buddy#edgar #buddy #friend #viviendo #t...


### Data Cleaning

In [None]:
def remove_pattern(input_txt, pattern):
  """Return ``input_txt`` with every match of the regex ``pattern`` removed.

  Args:
    input_txt: Text to clean.
    pattern: Regular expression whose matches are deleted.

  Returns:
    A new string in which all non-overlapping matches of ``pattern`` have
    been replaced by the empty string.
  """
  # Substitute on the pattern itself. Re-using each matched substring as a
  # new regex misbehaves when the match contains regex metacharacters, and
  # removing a short match first (e.g. "@a") corrupts longer ones ("@ab").
  return re.sub(pattern, "", input_txt)

In [None]:
# Strip Twitter handles (@user) from every tweet. The pattern is a raw string
# so that "\w" reaches the regex engine as written instead of depending on
# Python passing an unrecognised string escape through unchanged.
dataset['clean_tweet'] = np.vectorize(remove_pattern)(dataset['tweet'], r"@[\w]*")

In [None]:
# Replace URLs with a single space.
# `regex=True` must be explicit: from pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave every URL in place.
# The pattern is now a raw string; its contents are unchanged.
dataset['clean_tweet'] = dataset['clean_tweet'].str.replace(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    ' ',
    regex=True,
)

In [None]:
# Compare the raw `tweet` column with `clean_tweet` after handle/URL removal.
dataset.head(20)

Unnamed: 0,id,tweet,clean_tweet
0,7922.0,currently shitting my fucking pants. #apple #i...,currently shitting my fucking pants. #apple #i...
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...","I'd like to puts some CD-ROMS on my iPad, is t..."
2,7924.0,My ipod is officially dead. I lost all my pict...,My ipod is officially dead. I lost all my pict...
3,7925.0,Been fighting iTunes all night! I only want th...,Been fighting iTunes all night! I only want th...
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#Repost with repostapp ··· to announce we wil...
5,7927.0,This new apple software update is really doing...,This new apple software update is really doing...
6,7928.0,BABY #iPhone #iphone6s #gold #new #apple #appl...,BABY #iPhone #iphone6s #gold #new #apple #appl...
7,7929.0,I'm confused...why did I have to take the time...,I'm confused...why did I have to take the time...
8,7930.0,Fruit just tastes better when you pick it your...,Fruit just tastes better when you pick it your...
9,7931.0,Con mi buddy#edgar #buddy #friend #viviendo #t...,Con mi buddy#edgar #buddy #friend #viviendo #t...


In [None]:
# remove special characters, numbers and punctuations
# Every character other than an ASCII letter or '#' becomes a space.
# `regex=True` must be explicit: from pandas 2.0 the pattern would otherwise be
# matched as a literal string and nothing would be replaced.
dataset['clean_tweet'] = dataset['clean_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)
dataset.head()

Unnamed: 0,id,tweet,clean_tweet
0,7922.0,currently shitting my fucking pants. #apple #i...,currently shitting my fucking pants #apple #i...
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...",I d like to puts some CD ROMS on my iPad is t...
2,7924.0,My ipod is officially dead. I lost all my pict...,My ipod is officially dead I lost all my pict...
3,7925.0,Been fighting iTunes all night! I only want th...,Been fighting iTunes all night I only want th...
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#Repost with repostapp to announce we wil...


In [None]:
# remove short words: keep only whitespace-separated tokens longer than 3 characters
dataset['clean_tweet'] = dataset['clean_tweet'].apply(
    lambda tweet: " ".join(token for token in tweet.split() if len(token) > 3)
)
dataset.head()

Unnamed: 0,id,tweet,clean_tweet
0,7922.0,currently shitting my fucking pants. #apple #i...,currently shitting fucking pants #apple #iMac ...
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...",like puts some ROMS iPad that possible wouldn ...
2,7924.0,My ipod is officially dead. I lost all my pict...,ipod officially dead lost pictures videos from...
3,7925.0,Been fighting iTunes all night! I only want th...,Been fighting iTunes night only want music paid
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#Repost with repostapp announce will have #app...


### Tokenization of clean tweets

Each cleaned tweet is split on whitespace into a list of individual word tokens, so that later steps can process the words one at a time.



In [None]:
# individual words considered as tokens: split each cleaned tweet on whitespace
tokenized_tweet = dataset['clean_tweet'].apply(str.split)
tokenized_tweet.head()

0    [currently, shitting, fucking, pants, #apple, ...
1    [like, puts, some, ROMS, iPad, that, possible,...
2    [ipod, officially, dead, lost, pictures, video...
3    [Been, fighting, iTunes, night, only, want, mu...
4    [#Repost, with, repostapp, announce, will, hav...
Name: clean_tweet, dtype: object

### Stemming

`PorterStemmer.stem()` reduces each word to its stem, a shortened base form (for example, "officially" becomes "offici" and "pictures" becomes "pictur").

In [None]:
# stem the words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Apply the Porter stemmer to every token of every tweet.
# Note: this rebinds `tokenized_tweet`, so re-running the cell stems the
# already-stemmed tokens again.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: list(map(stemmer.stem, tokens)))
tokenized_tweet.head()


0    [current, shit, fuck, pant, #appl, #imac, #cas...
1    [like, put, some, rom, ipad, that, possibl, wo...
2    [ipod, offici, dead, lost, pictur, video, from...
3    [been, fight, itun, night, onli, want, music, ...
4    [#repost, with, repostapp, announc, will, have...
Name: clean_tweet, dtype: object

In [None]:
# combine words into single sentence
# Join each token list with Series.apply instead of a positional `for` loop
# with item assignment: the loop addressed rows by the labels 0..n-1 and so
# only worked when the index was exactly that range.
tokenized_tweet = tokenized_tweet.apply(" ".join)

dataset['clean_tweet'] = tokenized_tweet
dataset.head()

Unnamed: 0,id,tweet,clean_tweet
0,7922.0,currently shitting my fucking pants. #apple #i...,current shit fuck pant #appl #imac #cashmoney ...
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...",like put some rom ipad that possibl wouldn tha...
2,7924.0,My ipod is officially dead. I lost all my pict...,ipod offici dead lost pictur video from concer...
3,7925.0,Been fighting iTunes all night! I only want th...,been fight itun night onli want music paid
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#repost with repostapp announc will have #appl...


### Data Transformation

In [None]:
# Loading BoW dictionary
from sklearn.feature_extraction.text import CountVectorizer
import pickle
cvFile='/content/drive/MyDrive/Colab Notebooks/Sentiment Analysis/BoW_Sentiment_Model.pkl'
# Open the file in a `with` block so the handle is closed once loading finishes.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open(cvFile, "rb") as cv_handle:
    cv = pickle.load(cv_handle)


In [None]:
# Encode the fresh tweets as bag-of-words count vectors.
# Use transform(), not fit_transform(): re-fitting would rebuild the vocabulary
# from this dataset, so a given column would no longer stand for the word the
# classifier saw at training time.
# (Assumes the pickled vectorizer was already fitted during training.)
X_fresh = cv.transform(dataset['clean_tweet']).toarray()
X_fresh.shape

(929, 958)

In [None]:
# Display the dense feature matrix.
X_fresh

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Predictions (via sentiment model)

In [None]:
import joblib
# Load the persisted sentiment classifier from disk.
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source.
model = joblib.load('/content/drive/MyDrive/Colab Notebooks/Sentiment Analysis/Sentiment_Model')

In [None]:
# Predict a label for every row of the feature matrix.
y_pred = model.predict(X_fresh)
# Show how many tweets fall in each predicted class instead of printing every
# individual label; the per-row predictions are attached to `dataset` below.
pd.Series(y_pred).value_counts()

[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1
 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 0 1 0 1 0 1 0 0 0 

In [None]:
# Store the model's predictions next to the tweets as a new column, then
# preview up to the first 20 rows.
dataset['predicted_label'] = y_pred.tolist()
dataset.head(20)

Unnamed: 0,id,tweet,clean_tweet,predicted_label
0,7922.0,currently shitting my fucking pants. #apple #i...,current shit fuck pant #appl #imac #cashmoney ...,0
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...",like put some rom ipad that possibl wouldn tha...,0
2,7924.0,My ipod is officially dead. I lost all my pict...,ipod offici dead lost pictur video from concer...,0
3,7925.0,Been fighting iTunes all night! I only want th...,been fight itun night onli want music paid,0
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#repost with repostapp announc will have #appl...,0
5,7927.0,This new apple software update is really doing...,thi appl softwar updat realli do thing phone t...,0
6,7928.0,BABY #iPhone #iphone6s #gold #new #apple #appl...,babi #iphon #iphon #gold #new #appl #appleisbe...,0
7,7929.0,I'm confused...why did I have to take the time...,confus have take time appoint still wait minut...,0
8,7930.0,Fruit just tastes better when you pick it your...,fruit just tast better when pick yourself #app...,0
9,7931.0,Con mi buddy#edgar #buddy #friend #viviendo #t...,buddy#edgar #buddi #friend #viviendo #taller #...,0


In [None]:
# Preview the first rows of the frame, including `predicted_label`.
dataset.head()

Unnamed: 0,id,tweet,clean_tweet,predicted_label
0,7922.0,currently shitting my fucking pants. #apple #i...,current shit fuck pant #appl #imac #cashmoney ...,0
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...",like put some rom ipad that possibl wouldn tha...,0
2,7924.0,My ipod is officially dead. I lost all my pict...,ipod offici dead lost pictur video from concer...,0
3,7925.0,Been fighting iTunes all night! I only want th...,been fight itun night onli want music paid,0
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#repost with repostapp announc will have #appl...,0


In [None]:
def label_sentiment(row):
  """Translate a row's numeric ``predicted_label`` into a readable sentiment.

  Returns 'positive emotion' when ``row['predicted_label']`` equals 0 and
  'negative emotion' for any other value.
  """
  return 'positive emotion' if row['predicted_label'] == 0 else 'negative emotion'

#dataset['sentiment_predicted'] = dataset.apply(lambda row: label_sentiment(row), axis=1)

#df['race_label'] = df.apply (lambda row: label_race(row), axis=1)


In [None]:
# Add a human-readable sentiment column by applying label_sentiment to each row.
dataset['sentiment_predicted'] = dataset.apply(label_sentiment, axis=1)

In [None]:
# Preview up to the first 50 rows, including the new `sentiment_predicted` column.
dataset.head(50)

Unnamed: 0,id,tweet,clean_tweet,predicted_label,sentiment_predicted
0,7922.0,currently shitting my fucking pants. #apple #i...,current shit fuck pant #appl #imac #cashmoney ...,0,positive emotion
1,7923.0,"I'd like to puts some CD-ROMS on my iPad, is t...",like put some rom ipad that possibl wouldn tha...,0,positive emotion
2,7924.0,My ipod is officially dead. I lost all my pict...,ipod offici dead lost pictur video from concer...,0,positive emotion
3,7925.0,Been fighting iTunes all night! I only want th...,been fight itun night onli want music paid,0,positive emotion
4,7926.0,#Repost @getbakednfried with repostapp ··· to ...,#repost with repostapp announc will have #appl...,0,positive emotion
5,7927.0,This new apple software update is really doing...,thi appl softwar updat realli do thing phone t...,0,positive emotion
6,7928.0,BABY #iPhone #iphone6s #gold #new #apple #appl...,babi #iphon #iphon #gold #new #appl #appleisbe...,0,positive emotion
7,7929.0,I'm confused...why did I have to take the time...,confus have take time appoint still wait minut...,0,positive emotion
8,7930.0,Fruit just tastes better when you pick it your...,fruit just tast better when pick yourself #app...,0,positive emotion
9,7931.0,Con mi buddy#edgar #buddy #friend #viviendo #t...,buddy#edgar #buddi #friend #viviendo #taller #...,0,positive emotion


In [None]:
# Write the frame with all of its columns to CSV; index=False omits the
# DataFrame index from the output file.
dataset.to_csv(r'/content/drive/MyDrive/Colab Notebooks/Sentiment Analysis/predicted_data.csv',index=False)