In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['IMDB Dataset.csv']


In [2]:
file_path = '../input/IMDB Dataset.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
y = data.sentiment
y.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [4]:
label = {'positive':1, 'negative':-1}

def preprocess_y(sentiment):
    return label[sentiment]

y = y.apply(preprocess_y)
y.head()

0    1
1    1
2    1
3   -1
4    1
Name: sentiment, dtype: int64

In [5]:
X = data.review
X.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [6]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import re
def preprocess(review):
    #convert the tweet to lower case
    review.lower()
    #convert all urls to sting "URL"
    review = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',review)
    #convert all @username to "AT_USER"
    review = re.sub('@[^\s]+','AT_USER', review)
    #correct all multiple white spaces to a single white space
    review = re.sub('[\s]+', ' ', review)
    #convert "#topic" to just "topic"
    review = re.sub(r'#([^\s]+)', r'\1', review)
    tokens = word_tokenize(review)
    tokens = [w for w in tokens if not w in stop_words]
    return " ".join(tokens)

X = X.apply(preprocess)
X.head()

0    One reviewers mentioned watching 1 Oz episode ...
1    A wonderful little production . < br / > < br ...
2    I thought wonderful way spend time hot summer ...
3    Basically 's family little boy ( Jake ) thinks...
4    Petter Mattei 's `` Love Time Money '' visuall...
Name: review, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
def feature_extraction(data):
    tfv=TfidfVectorizer(sublinear_tf=True, stop_words = "english")
    features=tfv.fit_transform(data)
    pickle.dump(tfv.vocabulary_, open("svm_feature.pkl", "wb"))
    return features

data = np.array(X)
label = np.array(y)
features = feature_extraction(data)

print(features)

  (0, 75175)	0.07683783127650294
  (0, 57308)	0.06624866868047863
  (0, 97762)	0.06602816279570944
  (0, 65154)	0.25091002117171085
  (0, 29979)	0.09795330176273473
  (0, 52987)	0.09497914147180293
  (0, 42657)	0.08406289522607628
  (0, 75579)	0.07345438601129015
  (0, 30834)	0.05840115698708446
  (0, 40269)	0.058164466550608064
  (0, 11951)	0.0580768672598578
  (0, 89993)	0.039169060226213505
  (0, 86227)	0.14233085404173718
  (0, 12884)	0.0934397851970873
  (0, 94168)	0.11301369457806576
  (0, 78409)	0.038508427052991796
  (0, 96683)	0.14130718274380136
  (0, 79965)	0.04717873820228534
  (0, 99600)	0.10049092314544059
  (0, 92537)	0.07374935491777986
  (0, 31776)	0.0999580872999477
  (0, 40989)	0.07831507849669773
  (0, 90563)	0.10724962800570012
  (0, 71260)	0.08008437633828311
  (0, 71307)	0.09436601708316866
  :	:
  (49999, 94206)	0.1149017853264476
  (49999, 51790)	0.10313172784975949
  (49999, 45842)	0.20268706796083003
  (49999, 31177)	0.11845633689190246
  (49999, 7096)	0.1297

In [9]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.20) 

In [10]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='linear')  

svclassifier.fit(X_train, y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [11]:
from sklearn.metrics import accuracy_score
val_pred = svclassifier.predict(X_test)
#print(val_pred)
print(accuracy_score(y_test, val_pred))

0.8966


In [12]:
filename = 'svm_model.sav'
pickle.dump(svclassifier, open(filename, 'wb'))

Saving the model

In [13]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8966


Loading the saved model and testing

In [14]:
text = 'you are really beautiful'
text = preprocess(text)
print(text)
text = np.array([text])
print(text)

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfv_loaded = TfidfVectorizer(sublinear_tf=True, stop_words = "english", vocabulary=pickle.load(open("svm_feature.pkl", "rb")))
text = transformer.fit_transform(tfv_loaded.fit_transform(text))
print(text)
polarity = loaded_model.predict(text)
print(polarity)

really beautiful
['really beautiful']
  (0, 8827)	0.7071067811865476
  (0, 73032)	0.7071067811865476
[1]
