In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix

In [2]:

column_names = ['review', 'sentiment']

In [3]:
am = pd.read_csv("/content/drive/My Drive/amazon_yelp_imdb/amazon_cells_labelled.txt",sep = '\t', header = None)
im = pd.read_csv("/content/drive/My Drive/amazon_yelp_imdb/imdb_labelled.txt",sep = '\t', header = None)
yp = pd.read_csv("/content/drive/My Drive/amazon_yelp_imdb/yelp_labelled.txt",sep = '\t', header = None)

In [4]:
am.columns = column_names
im.columns = column_names
yp.columns = column_names

In [5]:
data = am.append([im, yp], ignore_index=True)

In [6]:
data.shape

(2748, 2)

In [7]:
data.head()

Unnamed: 0,review,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [8]:
import unicodedata

In [9]:
#Removal of accented text
def remove_accented_chars(x):
  removed=[]
  x = unicodedata.normalize('NFKD',x).encode('ascii','ignore').decode('utf-8','ignore')
  removed.append(x)

  return " ".join(removed)

In [10]:
data['review'] = data['review'].apply(remove_accented_chars)

In [11]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 3.6MB/s 
[?25hCollecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 17.6MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  

In [12]:
import contractions

In [13]:
#Contractions to expansions
data['review'] = data['review'].apply(lambda x: ' '.join([contractions.fix(t) for t in x.split()]))

In [14]:
import re

In [15]:
def remove_links(text):
    text = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','',text)
    return text

In [16]:
#Removal of http links
data['review'] = data['review'].apply(lambda x:remove_links(x))

In [17]:
!pip install beautifulsoup4



In [18]:
from bs4 import BeautifulSoup

In [19]:
#Removal of Html tags
data['review'] = data['review'].apply(lambda x: BeautifulSoup(x,'lxml').get_text())

In [20]:
#removal of special characters, numbers,white spaces
data['review'] = data['review'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",str(x)).split()))

In [21]:
data.head()

Unnamed: 0,review,sentiment
0,So there is no way for me to plug it in here i...,0
1,Good case Excellent value,1
2,Great for the jawbone,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great,1


In [22]:
#!pip install spacy

In [23]:
import spacy

In [24]:
#!python -m spacy download en_core_web_lg

In [25]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

In [26]:
nlp = spacy.load('en_core_web_lg')

In [27]:
sent = nlp.create_pipe('sentencizer')

In [28]:
nlp.add_pipe(sent, before='parser')

In [29]:
import string

In [30]:
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
def text_data_preprocess(sentence):
    doc = nlp(sentence)
    token = "  ".join(str(doc).split())
    cleaned_tokens = []
    if token not in stopwords and token not in punc:
        cleaned_tokens.append(token.lower())
    return "  ".join(cleaned_tokens)

In [32]:
#Removal of stopwords,punctuations
data['head'] = data['review'].apply(text_data_preprocess)

In [33]:
data.head()

Unnamed: 0,review,sentiment,head
0,So there is no way for me to plug it in here i...,0,so there is no way for me to plug it ...
1,Good case Excellent value,1,good case excellent value
2,Great for the jawbone,1,great for the jawbone
3,Tied to charger for conversations lasting more...,0,tied to charger for conversations lasting...
4,The mic is great,1,the mic is great


In [None]:
#data.to_csv('cleaned_amzn_ylp_imdb_data.csv',index=False)

In [2]:
#data = pd.read_csv('/content/drive/My Drive/amazon_yelp_imdb/cleaned_amzn_ylp_imdb_data.csv')

In [3]:
#data.head()

Unnamed: 0,review,sentiment,head
0,So there is no way for me to plug it in here i...,0,so there is no way for me to plug it ...
1,Good case Excellent value,1,good case excellent value
2,Great for the jawbone,1,great for the jawbone
3,Tied to charger for conversations lasting more...,0,tied to charger for conversations lasting...
4,The mic is great,1,the mic is great


In [34]:
X = data['head']
y = data['sentiment']

In [35]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,shuffle=True)

In [36]:
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=None)

In [37]:
count = CountVectorizer()
tfidf = TfidfVectorizer()

In [38]:
classifier = SVC(probability=True, kernel="linear", class_weight="balanced")


In [39]:
param_grid = {'svc__C': [0.01, 0.1, 1,10,100],'svc__gamma': [0.001, 0.01, 1]}


In [40]:
np.random.seed(1)

pipeline_svm = make_pipeline(count, classifier)

grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = param_grid, 
                    cv = 10,
                    scoring="f1",
                    verbose=1,n_jobs=-1) 

grid_svm.fit(X_train, y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.5min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('countvectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                

In [41]:
prediction = grid_svm.predict(X_test)

In [42]:
score = accuracy_score(prediction,y_test)
print(score)

0.8303030303030303


In [43]:
grid_svm.best_params_

{'svc__C': 1, 'svc__gamma': 0.001}

In [44]:

pipeline_svm = make_pipeline(count, SVC(C=1,gamma=0.001,probability=True, kernel="linear", class_weight="balanced"))


In [45]:
accuracy = []
for train_index, val_index in skf.split(X,y): 
    
    X1_train, X1_test = X.iloc[train_index], X.iloc[val_index] 
    y1_train, y1_test = y.iloc[train_index], y.iloc[val_index]

    pipeline_svm.fit(X1_train,y1_train)
    pred = pipeline_svm.predict(X1_test)
    score = accuracy_score(pred,y1_test)
    accuracy.append(score)

import numpy as np
acc_np = np.array(accuracy)
print(acc_np.mean())

0.8278959060615678


In [None]:
import pickle

In [None]:
#saving a ML model
with open('model_svm_amzn_ylp_imdb.pkl','wb')as f:
  pickle.dump(grid_svm,f)