In [57]:
# !pip install -U spacy

In [58]:
#!python -m spacy download en

In [88]:
#!python -m spacy download en_core_web_sm

In [1]:
import spacy

In [2]:
import pandas as pd
data_yelp = pd.read_csv('yelp_labelled.txt', sep='\t',header = None)
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
data_yelp.columns = ['Review','Sentiment']
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
data_yelp.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [5]:
data_yelp['Sentiment'].value_counts()

1    500
0    500
Name: Sentiment, dtype: int64

In [6]:
nlp = spacy.load('en_core_web_sm')

In [7]:
sent = nlp.create_pipe('sentencizer')

In [8]:
nlp.add_pipe(sent, before='parser')

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS

In [10]:
stopwords = list(STOP_WORDS)

In [11]:
import string

In [12]:
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### This below function will clean the text data by removing stopwords, punctuations, white spaces with lowercasing in its lemmas

In [13]:
def text_data_preprocess(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punc:
            cleaned_tokens.append(token)
    return cleaned_tokens
            

In [16]:
data_yelp['head'] = data_yelp['Review'].head().apply(text_data_preprocess)

In [17]:
data_yelp.head()

Unnamed: 0,Review,Sentiment,head
0,Wow... Loved this place.,1,"[wow, ..., love, place]"
1,Crust is not good.,0,"[crust, good]"
2,Not tasty and the texture was just nasty.,0,"[tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,"[stop, late, bank, holiday, rick, steve, recom..."
4,The selection on the menu was great and so wer...,1,"[selection, menu, great, price]"


In [18]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [19]:
tfidf = TfidfVectorizer(tokenizer = text_data_preprocess)
classifier = LinearSVC()

In [20]:
X = data_yelp['Review']
y = data_yelp['Sentiment']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [22]:
clf = Pipeline([('tfidf', tfidf),('classifier', classifier)])

In [23]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [79]:
y_pred = clf.predict(X_test)

In [80]:
from sklearn.metrics import classification_report, confusion_matrix

In [81]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.84      0.81      0.82       104
           1       0.80      0.83      0.82        96

   micro avg       0.82      0.82      0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200



In [82]:
confusion_matrix(y_test,y_pred)

array([[84, 20],
       [16, 80]])

In [83]:
clf.predict(["I didn't liked the food"])

array([0])

In [84]:
clf.predict(['The price was good'])

array([1])

In [85]:
clf.predict(['good place to eat'])

array([1])

In [86]:
from sklearn.metrics import accuracy_score

In [87]:
accuracy_score(y_test,y_pred)

0.82