## Preprocessing the data



In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
data=pd.read_csv('/content/sentiment_analysis.csv')
data.drop(['id'],axis=1,inplace=True)
data

In [9]:
import re
import string
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [10]:
data["tweet"] = data['tweet'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

In [11]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["tweet"] = data["tweet"].apply(remove_punctuations)
data["tweet"].tail(10)

Unnamed: 0,tweet
7910,perfect match instagood applewatch red instagr...
7911,i am completely in love with the new iphone em...
7912,tune in turn on drop out gtd in one app mobi...
7913,ok so my galaxy crashed after one day now i ha...
7914,gain followers rt this must follow me i follow...
7915,live out loud lol liveoutloud selfie smile son...
7916,we would like to wish you an amazing day make ...
7917,helping my lovely 90 year old neighbor with he...
7918,finally got my smart pocket wifi stay connecte...
7919,apple barcelona apple store bcn barcelona trav...


In [12]:
import nltk
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read()
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
data["tweet"].tail()

Unnamed: 0,tweet
7915,live loud lol liveoutloud selfie smile sony mu...
7916,like wish amazing day make every minute count ...
7917,helping lovely 90 year old neighbor ipad morni...
7918,finally got smart pocket wifi stay connected a...
7919,apple barcelona apple store bcn barcelona trav...


In [14]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
data["tweet"] = data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))
data["tweet"].head()

Unnamed: 0,tweet
0,fingerprint pregnanc test android app beauti c...
1,final transpar silicon case thank uncl yay son...
2,love go talk makememori unplug relax iphon sma...
3,wire know georg made way iphon cute daventri home
4,amaz servic appl wont even talk question unles...


## Train test split followed by conversion of text data

In [39]:
X = data['tweet']
y = data['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectoriser=TfidfVectorizer()
X_train_lower=vectoriser.fit_transform(X_train_lower)
X_test=vectoriser.transform(X_test)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [36]:
from collections import Counter
vocab = Counter()
for sentence in data['tweet']:
    vocab.update(sentence.split())
tokens = [key for key in vocab if vocab[key] > 10]

In [37]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens, '../static/model/vocabulary.txt')

In [40]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []

    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))

        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1

        vectorized_lst.append(sentence_lst)

    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)

    return vectorized_lst_new
vectorized_x_train = vectorizer(X_train, tokens)
vectorized_x_test = vectorizer(X_test, tokens)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Oversampling

In [41]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)



(9430, 1160) (9430,)


## Model Testing

In [44]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n')

def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n')

In [45]:
lr = LogisticRegression()
lr.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = lr.predict(vectorized_x_train_smote)

y_test_pred = lr.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.94
	Precision = 0.914
	Recall = 0.971

Testing Scores:
	Accuracy = 0.873
	Precision = 0.709
	Recall = 0.854



## Saving model

In [47]:
import pickle

with open('/content/model.pickle', 'wb') as file:
    pickle.dump(lr, file)