### Importing Necessary Libraries



In [1]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import string

## Loading SpaCy's small english model

To get more details regarding SpaCy models check here : https://spacy.io/usage/models

In [2]:
# Loading Spacy small model as nlp
nlp = spacy.load("en_core_web_sm")

## Gathering all the Stop words which does not convey much meaning in the Sentiment

In [3]:
# Gathering all the stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


In [70]:
# Loading yelp dataset
data_yelp = pd.read_csv('dataset/all-data.csv',delimiter=',', encoding='latin-1', header=None)
data_yelp = data_yelp.rename(columns=lambda x: ['Sentiment', 'Sentence'][x])
data_yelp.head()

Unnamed: 0,Sentiment,Sentence
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [71]:
data_yelp['Sentiment'] = data_yelp['Sentiment'].map({'neutral':0,'positive':1,'negative':-1})

In [72]:
# data_yelp_small['Sentiment'] = [1,-1,-1,0,0,1,1,-1,0,-1]

In [73]:
# data_yelp_small.head()

In [74]:
data = data_yelp[['Sentence','Sentiment']]

In [75]:
data.head()

Unnamed: 0,Sentence,Sentiment
0,"According to Gran , the company has no plans t...",0
1,Technopolis plans to develop in stages an area...,0
2,The international electronic industry company ...,-1
3,With the new production plant the company woul...,1
4,According to the company 's updated strategy f...,1


In [76]:
# # Adding column names to the dataframe
# columnName = ['Review','Sentiment']
# data_yelp.columns = columnName
# data_yelp.head()

## So here we can deduce that Sentiment 1 is Positive and 0 is negative

In [77]:
print(data_yelp_small.shape)

(10, 5)


In [78]:
# # Adding Amazon dataset and adding its column name
# data_amz = pd.read_csv("dataset/amazon_cells_labelled.txt",
#                         sep='\t', header= None)
# data_amz.columns = columnName
# data_amz.head()

In [79]:
# print(data_amz.shape)

In [80]:
# # Adding IMdB dataset and adding its column name
# data_imdb = pd.read_csv("dataset/imdb_labelled.txt",
#                         sep='\t', header= None)
# data_imdb.columns = columnName
# data_imdb.head()

In [81]:
# print(data_imdb.shape)

## Appending all the Datasets

In [82]:
# # Merging all the three dataframes
# data = data_yelp.append([data_amz, data_imdb], ignore_index=True)
# print(data.shape)

In [83]:
# Sentiment ditribution in the dataset
data.Sentiment.value_counts()

 0    2879
 1    1363
-1     604
Name: Sentiment, dtype: int64

In [84]:
# Getting information regarding the null entries in the dataset
data.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

In [85]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~




```
Here in the reviews we will find many stop words which do not add any meaning to the review.
Also punctuations will be encountered in the review which which will be considered as a seperate token by our model
So removing all the stop words and punctuation so that our model can train efficiently
```



In [86]:
def dataCleaning(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    clean_tokens = []
    for token in tokens:
        if token not in punct and token not in stopwords:
            clean_tokens.append(token)
    return clean_tokens

## Here after passing a particular sentence in dataCleaning method we are returned with relevant words which contribute to the sentiments

In [87]:
dataCleaning("Today we are having heavy rainfall, We recommend you to stay at your home and be safe, Do not start running here and there")
# All the useful words are returned, no punctuations no stop words and in the lemmatized form

['today',
 'heavy',
 'rainfall',
 'recommend',
 'stay',
 'home',
 'safe',
 'start',
 'run']

In [88]:
# Spillting the train and test data
X = data['Sentence']
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
print(X_train.shape,y_test.shape)

(4361,) (485,)


## Preparing Model

In [89]:
# Creating the model and pipeline
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = RandomForestClassifier()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [90]:
# Training the model
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x000001DDB2EF6040>)),
                ('svm', RandomForestClassifier())])

In [91]:
# Testing on the test dataset
y_pred = pipe.predict(X_test)

array([ 0,  1,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0, -1,  1,  0,  0,  0,
        0,  0, -1,  1,  0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  0, -1,  0,  0,  1,  0,  1,  0,  0,
        0,  1,  0,  1, -1,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
        0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,
        1,  1,  0,  0,  1,  1,  0,  0, -1,  0,  0,  1,  1,  0,  0,  0,  0,
        1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
        0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  1,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0, -1,  0,  0,  0,  0,  0,
        0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  1,  1,  0,  0,  0, -1,  0,
        0,  0,  0,  0,  0

In [95]:
# y_test

In [96]:
accuracy_score(y_test, y_pred)

0.7463917525773196

In [97]:
# # Printing the classification report and the confusion matrix
# print(classification_report(y_test,y_pred))
# print("\n\n")
# print(confusion_matrix(y_test,y_pred))

## Testing on the Random Manual Examples

**Here '1' represent that the input is positive sentiment**

In [98]:
# Testing on random inputs
pipe.predict(["Wow you are an amazing person"])

array([0], dtype=int64)

**Here '0' represent that input is negative sentiment**

In [99]:
pipe.predict(["you suck"])

array([0], dtype=int64)

### Footnotes
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976
