In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Fetch the remaining NLTK resources, one after another in the original order.
for _pkg in ('wordnet', 'averaged_perceptron_tagger', 'stopwords'):
  nltk.download(_pkg)

# Shared text-processing helpers used by the cleaning function further down.
reg_token = RegexpTokenizer('[a-zA-Z]+')  # tokens are runs of ASCII letters only
sw = set(stopwords.words('english'))      # English stop words as a set
wnl = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import csv
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. `on_bad_lines='warn'`
# is its replacement and keeps this call's behaviour: malformed rows are dropped and a
# warning is emitted for each one instead of raising.
data=pd.read_csv('IMDB Dataset.csv',engine='c',on_bad_lines='warn')

In [None]:
# Display the loaded frame (the rendered output elides the middle rows).
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
from nltk.corpus import wordnet
def simple_pos(p):
  """Translate a POS tag string into the corresponding WordNet POS constant.

  The decision is made on the tag's leading letter: 'J' -> adjective,
  'V' -> verb, 'R' -> adverb. Tags starting with 'N', and every other tag,
  map to noun.

  Args:
    p: POS tag string (the cleaning function passes tags produced by `pos_tag`).

  Returns:
    One of wordnet.ADJ, wordnet.VERB, wordnet.ADV or wordnet.NOUN.
  """
  prefix_to_pos = (
    ('J', wordnet.ADJ),
    ('V', wordnet.VERB),
    ('R', wordnet.ADV),
  )
  for prefix, wn_pos in prefix_to_pos:
    if p.startswith(prefix):
      return wn_pos
  # Noun tags and anything unrecognised share the same result.
  return wordnet.NOUN

In [None]:
def clean_data(k):
  """Normalise one review into a space-separated string of lower-cased lemmas.

  The text is split into alphabetic tokens, POS-tagged, stripped of English
  stop words and lemmatised with WordNet.

  Args:
    k: Raw review text. Non-string values (e.g. NaN from a missing cell) are
      treated as empty text.

  Returns:
    The surviving lemmas joined by single spaces ("" when nothing survives).
  """
  if not isinstance(k,str):
    return ""
  tokens=reg_token.tokenize(k)
  cleaned_words=[]
  # Tag the full token sequence in a single call. Tagging words one at a time
  # hides the neighbouring words from the tagger and costs one tagger call per
  # token. Stop words are filtered only after tagging so they still provide
  # context for the tags of the words around them.
  for token,tag in pos_tag(tokens):
    lowered=token.lower()
    if lowered in sw:
      continue
    # Lemmatise the lower-cased form: the WordNet lookup is case-sensitive, so a
    # capitalised token such as "Movies" would otherwise come back unreduced.
    cleaned_words.append(wnl.lemmatize(lowered,pos=simple_pos(tag)))

  return " ".join(cleaned_words)

In [None]:
# Clean every review. This overwrites the raw `review` column in place, so re-running
# the cell feeds already-cleaned text back through `clean_data`.
data['review']=data['review'].apply(clean_data)

In [None]:
# Sanity check: summarise the missing-value mask of the cleaned `review` column.
check=data['review'].isna()
check.describe()

count     50000
unique        1
top       False
freq      50000
Name: review, dtype: object

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [None]:
# Select features and target by column name rather than by position, so an extra or
# reordered column in the CSV cannot silently swap or replace them.
y=data['sentiment']
x=data['review']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2)

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# TF-IDF features feeding a linear SVM. The step names ('tv', 'ls') are the prefixes
# used by the grid keys below. lowercase=False because the cleaning step already
# lower-cases every token.
pip=Pipeline(steps=[
  ('tv',TfidfVectorizer(lowercase=False)),
  ('ls',LinearSVC()),
])

# Hyper-parameter grid, keyed as <step name>__<parameter name>.
para=[{
  'tv__max_df':[0.1,0.2,0.3],
  'tv__binary':[True,False],
  'tv__ngram_range':[(1,1),(1,2)],
  'ls__dual':[True,False],
  'ls__C':[0.1,0.3,0.6,1],
  'ls__random_state':[42],
}]

In [None]:
# Fit the pipeline for every parameter combination in `para`. Neither `cv` nor `scoring`
# is passed, so GridSearchCV's defaults are used for both.
gs_model=GridSearchCV(pip,param_grid=para)
gs_model.fit(x_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('tv', TfidfVectorizer(lowercase=False)),
                                       ('ls', LinearSVC())]),
             param_grid=[{'ls__C': [0.1, 0.3, 0.6, 1],
                          'ls__dual': [True, False], 'ls__random_state': [42],
                          'tv__binary': [True, False],
                          'tv__max_df': [0.1, 0.2, 0.3],
                          'tv__ngram_range': [(1, 1), (1, 2)]}])

In [None]:
# Parameter combination selected by the grid search.
gs_model.best_params_

{'ls__C': 1,
 'ls__dual': True,
 'ls__random_state': 42,
 'tv__binary': False,
 'tv__max_df': 0.2,
 'tv__ngram_range': (1, 2)}

In [None]:
# Build a stand-alone vectoriser with fixed max_df / ngram_range values and fit it on the
# training reviews only.
# NOTE(review): these literals look hand-copied from the grid-search result; confirm they
# still match if the search is re-run.
vectorizer=TfidfVectorizer(lowercase=False,max_df=0.2,ngram_range=(1,2))
vectorizer.fit(x_train)

TfidfVectorizer(lowercase=False, max_df=0.2, ngram_range=(1, 2))

In [None]:
# Transform both splits with the vectoriser that was fitted on the training split.
x_train_vector=vectorizer.transform(x_train)
x_test_vector=vectorizer.transform(x_test)

In [None]:
# Refit the classifier with the settings reported by the grid search (C=1, dual=True) and
# the same seed it was tuned with. A bare LinearSVC() leaves random_state unset, so the
# dual solver's data shuffling, and therefore the saved model, could differ between runs.
model=LinearSVC(C=1,dual=True,random_state=42)
model.fit(x_train_vector,y_train)
# Accuracy on the data the model was trained on (not a measure of generalisation).
model.score(x_train_vector,y_train)

1.0

In [None]:
# Accuracy on the held-out test split.
model.score(x_test_vector,y_test)

0.9075333333333333

In [None]:
import joblib
# Persist the classifier and the fitted vectoriser as separate files; the model was trained
# on this vectoriser's output, so both are needed to score new text.
joblib.dump(model,'model.sav')
joblib.dump(vectorizer,'vectorizer.sav')

['vectorizer.sav']