In [9]:
%matplotlib inline
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
from scipy.io import loadmat
from sklearn.svm import SVC
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve  # Perbaikan
from sklearn.naive_bayes import MultinomialNB


In [10]:
df = pd.read_csv("../Training.txt", sep="\t", names=['liked', 'text'], encoding="utf-8")
df.head(3)
#df

Unnamed: 0,liked,text
0,1,India is developing countries
1,1,The Da Vinci Code book is just awesome.
2,1,this was the first clive cussler i've ever rea...


In [11]:
print(len(df))

6931


In [12]:
df.groupby('liked').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
liked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2975,559,I hate Harry Potter.,85
1,3956,744,I love Harry Potter.,167


In [13]:
def tokens(review):
    return TextBlob(review).words

In [14]:
df.text.apply(tokens)

0                      [India, is, developing, countries]
1         [The, Da, Vinci, Code, book, is, just, awesome]
2       [this, was, the, first, clive, cussler, i, 've...
3                [i, liked, the, Da, Vinci, Code, a, lot]
4                [i, liked, the, Da, Vinci, Code, a, lot]
                              ...                        
6926                   [Brokeback, Mountain, was, boring]
6927    [So, Brokeback, Mountain, was, really, depress...
6928    [As, I, sit, here, watching, the, MTV, Movie, ...
6929    [Ok, brokeback, mountain, is, such, a, horribl...
6930    [Oh, and, Brokeback, Mountain, was, a, terribl...
Name: text, Length: 6931, dtype: object

In [15]:
TextBlob("ready was not a good movie").tags
# list of (word, POS) pairs
#nltk.help.upenn_tagset('JJ')

[('ready', 'NN'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('movie', 'NN')]

In [16]:
def split_into_lemmas(review):
    wordss = TextBlob(review.lower()).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in wordss]

df.text.head().apply(split_into_lemmas)

0                     [india, is, developing, country]
1      [the, da, vinci, code, book, is, just, awesome]
2    [this, wa, the, first, clive, cussler, i, 've,...
3             [i, liked, the, da, vinci, code, a, lot]
4             [i, liked, the, da, vinci, code, a, lot]
Name: text, dtype: object

In [17]:
def split_into(review):
    words = TextBlob(review.lower()).words
    # for each word, take its "base form" = lemma 
    return [word.lemmatize() for word in words]
df.text.head().apply(split_into)

#df.text.head().apply(lemman)

0                     [india, is, developing, country]
1      [the, da, vinci, code, book, is, just, awesome]
2    [this, wa, the, first, clive, cussler, i, 've,...
3             [i, liked, the, da, vinci, code, a, lot]
4             [i, liked, the, da, vinci, code, a, lot]
Name: text, dtype: object

In [18]:
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(df['text'])
print(len(bow_transformer.vocabulary_))

2114


In [30]:
#review1=df['text'][3]
#print(review1)

In [64]:
#bow1=bow_transformer.transform([review1])
#print(bow1)
#bow1.shape

In [20]:
print(bow_transformer.get_feature_names_out()[372])

code-other


In [21]:
review_bow = bow_transformer.transform(df['text'])
print( 'sparse matrix shape:', review_bow.shape)
print('number of non-zeros:', review_bow.nnz) #learn this
print( 'sparsity: %.2f%%' % (100.0 * review_bow.nnz))

sparse matrix shape: (6931, 2114)
number of non-zeros: 71297
sparsity: 7129700.00%


In [178]:
#operators=set(('no','not'))
#stopset=set(stopwords.words('english'))-operators
#vectorizer=TfidfVectorizer(use_idf=True,lowercase=True,strip_accents='ascii',stop_words=stopset)
#stopset

In [22]:
#rr=np.array([review])
tfidf_transformer =TfidfTransformer().fit(review_bow)
#print(tfidf1)

In [23]:
review_tfidf = tfidf_transformer.transform(review_bow)
review_tfidf.shape
#print(x)
#print(y)
#review_tfidf[3][0]

(6931, 2114)

In [24]:
text_train, text_test, liked_train, liked_test = train_test_split(df['text'], df['liked'], test_size=0.2)
#print(len(text_train), len(text_test), len(text_train) + len(text_test))


In [26]:
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),  # <== change here
])

# pipeline parameters to automatically explore and tune
param_svm = [
  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]

grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_svm,
    refit=True,  # fit using all data, on the best detected classifier
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',#optimizing
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
)

In [27]:
%time classifier = grid_svm.fit(text_train, liked_train) # find the best combination from param_svm
print(classifier.cv_results_)

CPU times: user 2.37 s, sys: 293 ms, total: 2.66 s
Wall time: 1min 45s
{'mean_fit_time': array([9.09090958, 2.93430772, 3.20370569, 3.1019588 , 5.59794369,
       5.96438861, 4.25880141, 5.16540484, 3.24390802, 5.42156463,
       4.79177933, 3.30473046]), 'std_fit_time': array([3.12433379, 0.16452944, 0.07765985, 0.11513414, 0.45016625,
       0.59442541, 0.0730408 , 0.11034164, 0.20950943, 1.75385986,
       1.70848039, 0.37657682]), 'mean_score_time': array([0.87315745, 0.74450302, 0.73311343, 0.74109883, 1.13803525,
       1.11629019, 1.06905699, 1.28852324, 0.78916841, 1.4451364 ,
       0.93111105, 0.74683928]), 'std_score_time': array([0.07218352, 0.02227658, 0.03062289, 0.0268775 , 0.01754715,
       0.0152506 , 0.05287688, 0.10290547, 0.02681891, 0.32831569,
       0.39309449, 0.15863583]), 'param_classifier__C': masked_array(data=[1, 10, 100, 1000, 1, 1, 10, 10, 100, 100, 1000, 1000],
             mask=[False, False, False, False, False, False, False, False,
                  

In [28]:
print(classification_report(liked_test, classifier.predict(text_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       594
           1       0.99      0.99      0.99       793

    accuracy                           0.99      1387
   macro avg       0.99      0.99      0.99      1387
weighted avg       0.99      0.99      0.99      1387



In [29]:
print(classifier.predict(["the vinci code is awesome"])[0])

1


In [30]:
print(classifier.predict(["the vinci code is bad"])[0])

0


In [31]:
def gaussKernel(x1, x2, sigma):
    ss=np.power(sigma,2)
    norm= (x1-x2).T.dot(x1-x2)
    return np.exp(-norm/(2*ss))
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
gaussKernel(x1,x2,sigma)

0.32465246735834974