In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the raw sentiment dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path (and the directory is spelled
# 'artifcats'); consider a path relative to the project so the notebook runs on other machines.
data = pd.read_csv('/Users/hasangaranasinghe/Documents/Projects/Sentiment-Analysis/artifcats/sentiment_analysis.csv')

In [4]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


#  Data Preprocessing

In [5]:
data.shape

(7920, 3)

In [6]:
data.duplicated().sum()  # Count rows that are exact duplicates of an earlier row

np.int64(0)

In [7]:
data.isnull().sum() # Count the missing values in each column

id       0
label    0
tweet    0
dtype: int64

# Text Preprocessing

In [8]:
import re
import string

convert uppercase to lowercase

In [9]:
# Lowercase every tweet token by token; splitting and re-joining also collapses
# any run of whitespace into a single space.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(map(str.lower, tweet.split()))
)

In [10]:
data["tweet"].head()

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove Links

In [11]:
# Remove links: drop every whitespace-separated token that starts with http:// or https://.
# The previous per-token regex (which carried a stray 'r' in its trailing character class)
# replaced such tokens with an empty string, leaving a double space where the link had been.
# Dropping the token outright removes the same links without the leftover gap.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(
        token for token in tweet.split()
        if not token.startswith(("http://", "https://"))
    )
)

In [12]:
data["tweet"].head()

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove punctuations

In [13]:
def remove_punctuations(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted.

    All other characters, including whitespace, are left exactly as they are.
    """
    # A translation table whose third argument lists the characters to delete
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

data["tweet"] = data["tweet"].apply(remove_punctuations)

In [14]:
data["tweet"].head()

0    fingerprint pregnancy test  android apps beaut...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

In [15]:
# Remove every run of digits from the tweets.
# The pattern is a raw string: in a normal string literal '\d' is an invalid escape
# sequence, which Python reports with a warning when the cell is compiled.
data["tweet"] = data['tweet'].str.replace(r'\d+', '', regex=True)

  data["tweet"] = data['tweet'].str.replace('\d+', '', regex=True)


In [16]:
data["tweet"].tail()

7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

Remove stopwords

In [17]:
!pip3 install nltk



In [18]:
import nltk

In [19]:
# Download the NLTK 'stopwords' package into ../static/model (passed as download_dir).
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Read the English stopword file from the download directory; every line of the file
# becomes one entry of the list `sw`.
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [22]:
# Drop stopwords from every tweet. The stopword list is converted to a set once, so each
# membership test is a hash lookup instead of a linear scan of `sw` for every token.
stopword_set = set(sw)
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stopword_set)
)

In [23]:
data["tweet"].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    im wired know im george made way iphone cute d...
4    amazing service apple wont even talk question ...
Name: tweet, dtype: object

Stemming

In [24]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [25]:
# Replace each token of every tweet with its Porter stem, then re-join with single spaces.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(map(ps.stem, tweet.split()))
)

In [26]:
data["tweet"].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

In [27]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,im wire know im georg made way iphon cute dave...
4,5,1,amaz servic appl wont even talk question unles...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0,would like wish amaz day make everi minut coun...
7917,7918,0,help love year old neighbor ipad morn made rea...
7918,7919,0,final got smart pocket wifi stay connect anyti...


# Building Vocabulary

In [28]:
from collections import Counter
vocab = Counter()

In [29]:
# Count how many times each token occurs across all preprocessed tweets.
vocab.update(token for tweet in data["tweet"] for token in tweet.split())

In [30]:
len(vocab)

15949

In [31]:
# Keep only the tokens that occur more than 10 times in the corpus.
tokens = [token for token, count in vocab.items() if count > 10]

In [32]:
len(tokens)

1145

In [33]:
def save_voacabulary(lines, filename):
    """Write the vocabulary to ``filename`` as UTF-8 text, one entry per line.

    Parameters
    ----------
    lines : iterable of str
        Vocabulary entries; they are joined with newline characters
        (no trailing newline is written).
    filename : str or path-like
        Destination file. It is created, or overwritten if it already exists.
    """
    # The context manager closes the handle even if write() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(filename, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write('\n'.join(lines))

save_voacabulary(tokens, '../static/model/vocabulary.txt')

# Train-Test

In [34]:
X = data['tweet']
y = data['label']

In [35]:
!pip install scikit-learn



In [36]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing. A fixed random_state makes the split, and
# therefore every count and metric derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization

In [37]:
def vectorizer(ds, vocabulary):
    """Encode sentences as binary bag-of-words vectors over ``vocabulary``.

    Parameters
    ----------
    ds : iterable of str
        Sentences whose tokens are separated by whitespace.
    vocabulary : sequence of str
        Vocabulary tokens; position ``i`` in the sequence is column ``i`` of the output.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(number of sentences, len(vocabulary))``.
        Entry ``[r, i]`` is 1 when ``vocabulary[i]`` occurs as a whole token in
        sentence ``r`` and 0 otherwise. An empty ``ds`` yields shape
        ``(0, len(vocabulary))``.
    """
    vocab_size = len(vocabulary)
    rows = []

    for sentence in ds:
        # Tokenize once per sentence (previously re-split for every vocabulary entry);
        # a set turns each membership test into a hash lookup.
        words = set(sentence.split())
        row = np.zeros(vocab_size, dtype=np.float32)

        for i, token in enumerate(vocabulary):
            if token in words:
                row[i] = 1

        rows.append(row)

    if not rows:
        # Keep the result two-dimensional even when there is nothing to encode
        return np.zeros((0, vocab_size), dtype=np.float32)

    return np.asarray(rows, dtype=np.float32)


In [38]:
vectorized_x_train = vectorizer(X_train, tokens)

In [39]:
vectorized_x_test = vectorizer(X_test, tokens)

In [40]:
y_train.value_counts()

label
0    4748
1    1588
Name: count, dtype: int64

# Handle Imbalanced Dataset

In [41]:
!pip install imbalanced-learn



In [42]:
from imblearn.over_sampling import SMOTE

# Oversample the training vectors with SMOTE so both labels end up with the same count.
# Only the training split is resampled. A fixed random_state makes the synthetic
# samples, and the models trained on them, reproducible between runs.
smote = SMOTE(random_state=42)
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote)

(9496, 1145) 0       0
1       0
2       0
3       1
4       0
       ..
9491    1
9492    1
9493    1
9494    1
9495    1
Name: label, Length: 9496, dtype: int64


In [43]:
y_train_smote.value_counts()

label
0    4748
1    4748
Name: count, dtype: int64

# Model Training and Evaluation

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [45]:
# Candidate estimators and the hyper-parameter values to search for each of them.
# Every entry maps a display name to {'model': estimator instance, 'params': search space}.
model_params = {
    'logistic_regression' : {
        # max_iter is raised above the default of 100 because the solver stopped at its
        # iteration limit during the search (convergence warnings in the cell output).
        'model' : LogisticRegression(max_iter=1000),
        'params' : {
            'C' : [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000]
        }
    },
    'naive_bayes' : {
        'model' : MultinomialNB(),
        'params' : {
            'alpha': [0.1, 0.5, 1.0, 1e-2, 1e-3],
            'fit_prior': [True, False]
        }
    },
    'decision_tree' : {
        'model' : DecisionTreeClassifier(),
        'params' : {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 'auto' is omitted: it is no longer an accepted max_features value for
            # DecisionTreeClassifier (for classifiers it was an alias of 'sqrt'), so
            # every sampled candidate that used it failed to fit.
            'max_features': [None, 'sqrt', 'log2'],
            'max_leaf_nodes': [None, 10, 20, 30],
        }
    },
    'svm' : {
        'model' : SVC(),
        'params' : {
            'C': [0.1, 1, 10, 100, 1000],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
        }
    }
}

In [143]:
from sklearn.model_selection import RandomizedSearchCV

scores = []

# Run a 5-fold randomized hyper-parameter search for every candidate model on the
# SMOTE-balanced training data and record its best cross-validated score and parameters.
# random_state fixes which parameter combinations are sampled, so repeated runs
# evaluate the same candidates.
for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        return_train_score=False,
        n_iter=20,
        random_state=42,
    )
    clf.fit(vectorized_x_train_smote, y_train_smote)
    scores.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [46]:
score_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
score_df

NameError: name 'scores' is not defined

In [108]:
# Fit the final logistic-regression classifier on the SMOTE-balanced training vectors.
# NOTE(review): C=0.2 is not one of the candidate 'C' values listed in model_params —
# confirm how this value was chosen.
lr = LogisticRegression(C=0.2)
lr.fit(vectorized_x_train_smote, y_train_smote)

In [109]:
import pickle

with open('../static/model/model.pickle', 'wb') as file:
    pickle.dump(lr, file)

In [48]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def test_scores(y_act, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Each metric is computed from the true labels ``y_act`` and the predicted labels
    ``y_pred`` and rounded to three decimals. Nothing is returned.
    """
    # Evaluated in the same order as they are reported
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    acc, pr, rec, f1 = (round(metric(y_act, y_pred), 3) for metric in metric_functions)
    print(f'Accuracy = {acc}\n\tPrecison = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

In [94]:
test_scores(y_test, lr.predict(vectorized_x_test))

Accuracy = 0.87
	Precison = 0.711
	Recall = 0.893
	F1-Score = 0.791
