In [2]:
import pandas as pd
import numpy as np
import sqlite3


In [3]:
df = pd.read_csv(".\data\Reviews.csv")

In [4]:
df.shape

(568454, 10)

In [5]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [6]:
df['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [7]:
# Filter to get only those records for which score is 2 or 3
# For these, get 'Text', 'Score' columns
df_filtered = df.loc[df['Score'].isin([2,3]), ['Text', 'Score']]

In [8]:
# Verify the label distribution
df_negative = df_filtered.loc[df['Score']==2]
df_positive = df_filtered.loc[df['Score']==3]
df_negative['Score'].value_counts(), df_positive['Score'].value_counts()

(2    29769
 Name: Score, dtype: int64,
 3    42640
 Name: Score, dtype: int64)

In [9]:
# Check for nans
df_positive.isna().sum(), df_negative.isna().sum()

(Text     0
 Score    0
 dtype: int64,
 Text     0
 Score    0
 dtype: int64)

In [12]:
# Prepare training data
df_features = pd.concat([df_positive[:5000], df_negative[:5000]])

# Pop returns a Series
df_labels = df_features.pop('Score')

df_features.shape,df_labels.shape

((10000, 1), (10000,))

In [13]:
df_labels.value_counts()

3    5000
2    5000
Name: Score, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm  import SVC

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [15]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_features, df_labels, random_state = 49, test_size = 0.2, stratify=df_labels)

In [16]:
X_train_raw.shape,X_test_raw.shape

((8000, 1), (2000, 1))

In [17]:
# Transform Text feature to Bag of Words
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_raw)

In [19]:
X_train_flatten = X_train_raw.values.flatten()

In [20]:
X_train_flatten.shape

(8000,)

In [21]:
X_train = bow.fit_transform(X_train_flatten)

In [22]:
X_train.shape

(8000, 17926)

In [23]:
X_test_flatten  = X_test_raw.values.flatten()

In [24]:
X_test = bow.transform(X_test_flatten)

In [25]:
X_test.shape

(2000, 17926)

In [26]:
model = SVC()

pipeline = Pipeline([
    ('standard_scaler', StandardScaler(with_mean=False)),
    ('svm', model )
])
pipeline

Pipeline(memory=None,
         steps=[('standard_scaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [27]:
from sklearn.model_selection import GridSearchCV

hyper_params = {
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': [50,5,10,0.5],
    'svm__C':[10,0.1,0.001]
}

for folds in range(3,6):
    grid = GridSearchCV(pipeline, cv= folds, scoring = 'accuracy', param_grid = hyper_params,n_jobs=-1)
    grid.fit(X_train,y_train)
    
    score = grid.score(X_test, y_test)
    
    print("Folds ", folds, " Score ", score)
    print("Best params ", grid.best_params_)

Folds  3  Score  0.6425
Best params  {'svm__C': 0.001, 'svm__gamma': 50, 'svm__kernel': 'linear'}
Folds  4  Score  0.6425
Best params  {'svm__C': 0.001, 'svm__gamma': 50, 'svm__kernel': 'linear'}
Folds  5  Score  0.6425
Best params  {'svm__C': 0.001, 'svm__gamma': 50, 'svm__kernel': 'linear'}
