### Importing Necessary Libraries



In [2]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import string

## Loading SpaCy's small english model

To get more details regarding SpaCy models check here : https://spacy.io/usage/models

In [3]:
# Loading Spacy small model as nlp
nlp = spacy.load("en_core_web_sm")

## Gathering all the Stop words which does not convey much meaning in the Sentiment

In [4]:
# Gathering all the stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


In [5]:
# Loading yelp dataset
data_yelp = pd.read_csv('dataset/yelp_labelled.txt',
                        sep='\t', header= None)
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
# Adding column names to the dataframe
columnName = ['Review','Sentiment']
data_yelp.columns = columnName
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## So here we can deduce that Sentiment 1 is Positive and 0 is negative

In [7]:
print(data_yelp.shape)

(1000, 2)


In [16]:
# Adding Amazon dataset and adding its column name
data_amz = pd.read_csv("dataset/amazon_cells_labelled.txt",
                        sep='\t', header= None)
data_amz.columns = columnName
data_amz.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [17]:
print(data_amz.shape)

(1000, 2)


In [18]:
# Adding IMdB dataset and adding its column name
data_imdb = pd.read_csv("dataset/imdb_labelled.txt",
                        sep='\t', header= None)
data_imdb.columns = columnName
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [19]:
print(data_imdb.shape)

(748, 2)


## Appending all the Datasets

In [20]:
# Merging all the three dataframes
data = data_yelp.append([data_amz, data_imdb], ignore_index=True)
print(data.shape)

(2748, 2)


In [21]:
# Sentiment ditribution in the dataset
data.Sentiment.value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [22]:
# Getting information regarding the null entries in the dataset
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [23]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~




```
Here in the reviews we will find many stop words which do not add any meaning to the review.
Also punctuations will be encountered in the review which which will be considered as a seperate token by our model
So removing all the stop words and punctuation so that our model can train efficiently
```



In [24]:
def dataCleaning(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    clean_tokens = []
    for token in tokens:
        if token not in punct and token not in stopwords:
            clean_tokens.append(token)
    return clean_tokens

## Here after passing a particular sentence in dataCleaning method we are returned with relevant words which contribute to the sentiments

In [25]:
dataCleaning("Today we are having heavy rainfall, We recommend you to stay at your home and be safe, Do not start running here and there")
# All the useful words are returned, no punctuations no stop words and in the lemmatized form

['today',
 'heavy',
 'rainfall',
 'recommend',
 'stay',
 'home',
 'safe',
 'start',
 'run']

In [26]:
# Spillting the train and test data
X = data['Review']
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print(X_train.shape,y_test.shape)

(2198,) (550,)


## Preparing Model

In [37]:
# Creating the model and pipeline
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [38]:
# Training the model
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x000001FC13CAD430>)),
                ('svm', LinearSVC())])

In [39]:
# Testing on the test dataset
y_pred = pipe.predict(X_test)

In [40]:
# Printing the classification report and the confusion matrix
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.74      0.78       301
           1       0.72      0.81      0.76       249

    accuracy                           0.77       550
   macro avg       0.77      0.77      0.77       550
weighted avg       0.78      0.77      0.77       550




[[223  78]
 [ 48 201]]


## Testing on the Random Manual Examples

**Here '1' represent that the input is positive sentiment**

In [31]:
# Testing on random inputs
pipe.predict(["Wow you are an amazing person"])

array([1], dtype=int64)

**Here '0' represent that input is negative sentiment**

In [32]:
pipe.predict(["you suck"])

array([0], dtype=int64)

### Footnotes
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976
