# Importing various libraries

In [42]:
%matplotlib inline
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
from scipy.io import loadmat
from sklearn.svm import SVC
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer,TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve
from sklearn.naive_bayes import MultinomialNB

In [43]:
df=pd.read_csv("Training.txt",sep="\t", names=['liked','text'],encoding="utf-8");
df.head(3)

Unnamed: 0,liked,text
0,1,India is developing countries
1,1,The Da Vinci Code book is just awesome.
2,1,this was the first clive cussler i've ever rea...


This dataset is downloaded from https://www.kaggle.com/c/si650winter11/data
this is a TSV ("tab separated values") file, where the first column is a label saying whether the given review
is positive or negative. The second column is the review itself.
Data is tab separeted and therefore "\t" is passed as separator parameter to function.

In [44]:
print(len(df))

6931


Total no of reviews.

In [45]:
df.groupby('liked').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
liked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2975,559,I hate Harry Potter.,85
1,3956,744,I love Harry Potter.,167


# Data preprocessing

In [46]:
def tokens(review):
    return TextBlob(review).words

In [47]:
df.head().text.apply(tokens)

0                   [India, is, developing, countries]
1      [The, Da, Vinci, Code, book, is, just, awesome]
2    [this, was, the, first, clive, cussler, i, 've...
3             [i, liked, the, Da, Vinci, Code, a, lot]
4             [i, liked, the, Da, Vinci, Code, a, lot]
Name: text, dtype: object

Function tokens() is created to parse data/review into words.

In [48]:
TextBlob("ready was not a good movie").tags
#nltk.help.upenn_tagset('JJ')

[('ready', 'NN'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('movie', 'NN')]

.tags is inbuilt function provided by TextBlob it is used to assign part of speech tags to the words in text.
It gives list of (word, POS) pairs.
To check meaning of particular tag nltk.help.upenn_tagset('tagname') can be used eg. nltk.help.upenn_tagset('JJ')

In [56]:
def split_into_lemmas(review):
    wordss = TextBlob(review.lower()).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in wordss]

df.text.head().apply(split_into_lemmas)

0                     [india, is, developing, country]
1      [the, da, vinci, code, book, is, just, awesome]
2    [this, wa, the, first, clive, cussler, i, 've,...
3             [i, liked, the, da, vinci, code, a, lot]
4             [i, liked, the, da, vinci, code, a, lot]
Name: text, dtype: object

Lemmatization is one of the important satge of data preprocessing in this step words are converted to their lemma(base form). For example "octopi" is converted to "octopus". similar method is stemming

NLTK also provide very powerful lemmatizer which make use of WORDNET eg.

In [67]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('octopi')
#nltk

'octopus'

# Converting text data into vectors 

In [66]:
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(df['text'])
print(len(bow_transformer.vocabulary_))

2122


In [63]:
review1=df['text'][3]
print(review1)

i liked the Da Vinci Code a lot.


In [64]:
bow1=bow_transformer.transform([review1])
print(bow1)
bow1.shape

  (0, 42)	1
  (0, 372)	1
  (0, 461)	1
  (0, 955)	1
  (0, 1127)	1
  (0, 1156)	1
  (0, 1844)	1
  (0, 1983)	1


(1, 2122)

In [31]:
print(bow_transformer.get_feature_names()[372])

code


In [32]:
review_bow = bow_transformer.transform(df['text'])
print( 'sparse matrix shape:', review_bow.shape)
print('number of non-zeros:', review_bow.nnz) #learn this
print( 'sparsity: %.2f%%' % (100.0 * review_bow.nnz))

sparse matrix shape: (6931, 2122)
number of non-zeros: 71287
sparsity: 7128700.00%


In [178]:
#operators=set(('no','not'))
#stopset=set(stopwords.words('english'))-operators
#vectorizer=TfidfVectorizer(use_idf=True,lowercase=True,strip_accents='ascii',stop_words=stopset)
#stopset

In [34]:
#rr=np.array([review])
tfidf_transformer =TfidfTransformer().fit(review_bow)
#print(tfidf1)

In [35]:
review_tfidf = tfidf_transformer.transform(review_bow)
review_tfidf.shape
#print(x)
#print(y)
#review_tfidf[3][0]

(6931, 2122)

In [36]:
text_train, text_test, liked_train, liked_test = train_test_split(df['text'], df['liked'], test_size=0.2)
#print(len(text_train), len(text_test), len(text_train) + len(text_test))


In [37]:
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),  # <== change here
])

# pipeline parameters to automatically explore and tune
param_svm = [
  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]

grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_svm,
    refit=True,  # fit using all data, on the best detected classifier
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',#optimizing
    cv=StratifiedKFold(liked_train, n_folds=5),
)

In [38]:
%time classifier = grid_svm.fit(text_train, liked_train) # find the best combination from param_svm
print(classifier.grid_scores_)

CPU times: user 2.28 s, sys: 67.5 ms, total: 2.35 s
Wall time: 1min 13s
[mean: 0.99134, std: 0.00218, params: {'classifier__C': 1, 'classifier__kernel': 'linear'}, mean: 0.99044, std: 0.00168, params: {'classifier__C': 10, 'classifier__kernel': 'linear'}, mean: 0.99044, std: 0.00168, params: {'classifier__C': 100, 'classifier__kernel': 'linear'}, mean: 0.99044, std: 0.00168, params: {'classifier__C': 1000, 'classifier__kernel': 'linear'}, mean: 0.57468, std: 0.00024, params: {'classifier__C': 1, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.57468, std: 0.00024, params: {'classifier__C': 1, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.97312, std: 0.00572, params: {'classifier__C': 10, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.57468, std: 0.00024, params: {'classifier__C': 10, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.98810, std: 0.00155, params: {'classifier__C': 100, 'classifier__gamma': 0.001, '

In [39]:
print(classification_report(liked_test, classifier.predict(text_test)))

             precision    recall  f1-score   support

          0       1.00      0.99      1.00       617
          1       0.99      1.00      1.00       770

avg / total       1.00      1.00      1.00      1387



In [20]:
print(classifier.predict(["the vinci code is awesome"])[0])

1


In [21]:
print(classifier.predict(["the vinci code is bad"])[0])

1


In [22]:
def gaussKernel(x1, x2, sigma):
    ss=np.power(sigma,2)
    norm= (x1-x2).T.dot(x1-x2)
    return np.exp(-norm/(2*ss))
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
gaussKernel(x1,x2,sigma)

0.32465246735834974