<a href="https://colab.research.google.com/github/krutarth45/Sentiment-Classification-using-SpaCy/blob/master/Sentiment_Classification_Using_SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Necessary Libraries



In [0]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
import string
# Importing Dataset from the Github Repository.
!git clone https://github.com/laxmimerit/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset

Cloning into 'NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 16 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (16/16), done.


In [0]:
# Loading Spacy small model as nlp
nlp = spacy.load("en_core_web_sm")

In [0]:
# Gathering all the stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


In [0]:
# Loading yelp dataset
data_yelp = pd.read_csv("/content/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset/datasets/yelp_labelled.txt",
                        sep='\t', header= None)
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [0]:
# Adding column names to the dataframe
columnName = ['Review','Sentiment']
data_yelp.columns = columnName
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [0]:
print(data_yelp.shape)

(1000, 2)


In [0]:
# Adding Amazon dataset and adding its column name
data_amz = pd.read_csv("/content/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset/datasets/amazon_cells_labelled.txt",
                        sep='\t', header= None)
data_amz.columns = columnName
data_amz.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [0]:
print(data_amz.shape)

(1000, 2)


In [0]:
# Adding IMdB dataset and adding its column name
data_imdb = pd.read_csv("/content/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset/datasets/imdb_labelled.txt",
                        sep='\t', header= None)
data_imdb.columns = columnName
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [0]:
print(data_imdb.shape)

(748, 2)


In [0]:
# Merging all the three dataframes
data = data_yelp.append([data_amz, data_imdb], ignore_index=True)
print(data.shape)

(2748, 2)


In [0]:
# Sentiment ditribution in the dataset
data.Sentiment.value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [0]:
# Getting information regarding the null entries in the dataset
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [0]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~




```
Here in the reviews we will find many stop words which do not add any meaning to the review.
Also punctuations will be encountered in the review which which will be considered as a seperate token by our model
So removing all the stop words and punctuation so that our model can train efficiently
```



In [0]:
def dataCleaning(sentence):
  doc = nlp(sentence)
  tokens = []
  for token in doc:
    if token.lemma_ != '-PRON-':
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)
  clean_tokens = []
  for token in tokens:
    if token not in punct and token not in stopwords:
      clean_tokens.append(token)
  return clean_tokens

In [0]:
dataCleaning("Today we are having heavy rainfall, We recommend you to stay at your home and be safe, Do not start running here and there")
# All the useful words are returned, no punctuations no stop words and in the lemmatized form

['today',
 'heavy',
 'rainfall',
 'recommend',
 'stay',
 'home',
 'safe',
 'start',
 'run']

In [0]:
# Spillting the train and test data
X = data['Review']
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print(X_train.shape,y_test.shape)

(2198,) (550,)


In [0]:
# Creating the model and pipeline
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [0]:
# Training the model
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function dataCleaning at 0x7f1831926048>,
                                 use_idf=True, vocabulary=None)),
                ('svm',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True,

In [0]:
# Testing on the test dataset
y_pred = pipe.predict(X_test)

In [0]:
# Printing the classification report and the confusion matrix
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.77      0.80       278
           1       0.78      0.83      0.80       272

    accuracy                           0.80       550
   macro avg       0.80      0.80      0.80       550
weighted avg       0.80      0.80      0.80       550




[[214  64]
 [ 46 226]]


In [0]:
# Testing on random inputs
pipe.predict(["Wow you are an amazing person"])

array([1])

In [0]:
pipe.predict(["you suck"])

array([0])

### Footnotes
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976
