# Finding The Best N-Gram for Vectorizing (without tuning)

## Importing necessary libraries

In [5]:
## Arrays
import numpy as np

## DataFrames
import pandas as pd

## Visualizations

from IPython.display import Image

## Modeling
from sklearn import model_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

## Reading and Preparing the Data

In [2]:
# Load the cleaned reviews.
df = pd.read_csv('nlp_reviews_cleaned.csv', sep=',')

# Derive the binary target: ratings of 2 or lower map to 0, all other ratings to 1.
df['rating_class'] = df['rating'].map(lambda rating: 0 if rating <= 2 else 1)
print(df['rating_class'].value_counts())

# Work on the first 5000 rows only, to keep the computation manageable.
df_train = df.iloc[:5000]

# Hold out 25% of the reduced set for testing; the fixed seed keeps the split repeatable.
X, y = df_train['clean_text'], df_train['rating_class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5
)

1    12080
0     1192
Name: rating_class, dtype: int64


## 1. Finding the Best N-Gram for CountVectorizer

In [3]:
# Build one bag-of-words representation per n-gram size (1 = uni-gram ... 4 = four-gram).

# One CountVectorizer per n-gram size; (n, n) restricts it to exactly n-word sequences.
count_vect1, count_vect2, count_vect3, count_vect4 = [
    CountVectorizer(ngram_range=(n, n)) for n in (1, 2, 3, 4)
]

# Fit each vectorizer on the training texts and convert the result to a dense array.
count_vect_train1, count_vect_train2, count_vect_train3, count_vect_train4 = [
    vect.fit_transform(X_train).toarray()
    for vect in (count_vect1, count_vect2, count_vect3, count_vect4)
]

# Encode the test texts with the already-fitted vectorizers (no refitting on test data).
count_vect_test1, count_vect_test2, count_vect_test3, count_vect_test4 = [
    vect.transform(X_test).toarray()
    for vect in (count_vect1, count_vect2, count_vect3, count_vect4)
]

In [5]:
# Score every CountVectorizer n-gram representation with the same classifier.
# Only one algorithm is used due to computing capacity;
# LogisticRegression was chosen as the best model at step 1.

logreg_CV = LogisticRegression()

print('\nCountVectorizer N-grams with Logistic regression\n')

labels = ['Uni-gram', 'Bi-gram', 'Tri-gram', 'Four-gram']

for cv_train, cv_test, label in zip(
        [count_vect_train1, count_vect_train2, count_vect_train3, count_vect_train4],
        [count_vect_test1, count_vect_test2, count_vect_test3, count_vect_test4],
        labels):

    logreg_CV.fit(cv_train, y_train)
    y_pred_lr_CV = logreg_CV.predict(cv_test)

    # Both metrics are reached through the imported `metrics` module: the bare
    # name `f1_score` is never imported, so calling it directly raises NameError
    # on a fresh kernel.
    print("Accuracy : {} | F-1 Score: {} '[{}]'".format(
        metrics.accuracy_score(y_test, y_pred_lr_CV),
        metrics.f1_score(y_test, y_pred_lr_CV, average='weighted'),
        label))


CountVectorizer N-grams with Logistic regression

Accuracy : 0.9 | F-1 Score: 0.8866628460110704 '[Uni-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8651926577603998 '[Bi-gram]'
Accuracy : 0.9056 | F-1 Score: 0.862273837535014 '[Tri-gram]'
Accuracy : 0.9064 | F-1 Score: 0.8626862606277287 '[Four-gram]'


### Result: Best N-gram for CountVectorizer is Uni-gram (by weighted F-1 score)

## 2. Finding the Best N-Gram for TfidfVectorizer

In [8]:
# Build one TF-IDF representation per n-gram size (1 = uni-gram ... 4 = four-gram).

# One TfidfVectorizer per n-gram size; (n, n) restricts it to exactly n-word sequences.
tdidf_vect1, tdidf_vect2, tdidf_vect3, tdidf_vect4 = [
    TfidfVectorizer(ngram_range=(n, n)) for n in (1, 2, 3, 4)
]

# Fit each vectorizer on the training texts and convert the result to a dense array.
tdidf_vect_train1, tdidf_vect_train2, tdidf_vect_train3, tdidf_vect_train4 = [
    vect.fit_transform(X_train).toarray()
    for vect in (tdidf_vect1, tdidf_vect2, tdidf_vect3, tdidf_vect4)
]

# Encode the test texts with the already-fitted vectorizers (no refitting on test data).
tdidf_vect_test1, tdidf_vect_test2, tdidf_vect_test3, tdidf_vect_test4 = [
    vect.transform(X_test).toarray()
    for vect in (tdidf_vect1, tdidf_vect2, tdidf_vect3, tdidf_vect4)
]

In [9]:
# Score every TF-IDF n-gram representation with the same classifier.

logreg_CV = LogisticRegression()

print('\nTd-idf N-grams with Logistic regression\n')

labels = ['Uni-gram', 'Bi-gram', 'Tri-gram', 'Four-gram']

for tfv_train, tfv_test, label in zip(
        [tdidf_vect_train1, tdidf_vect_train2, tdidf_vect_train3, tdidf_vect_train4],
        [tdidf_vect_test1, tdidf_vect_test2, tdidf_vect_test3, tdidf_vect_test4],
        labels):

    logreg_CV.fit(tfv_train, y_train)
    y_pred_lr_CV = logreg_CV.predict(tfv_test)

    # Both metrics are reached through the imported `metrics` module: the bare
    # name `f1_score` is never imported, so calling it directly raises NameError
    # on a fresh kernel.
    print("Accuracy : {} | F-1 Score: {} '[{}]'".format(
        metrics.accuracy_score(y_test, y_pred_lr_CV),
        metrics.f1_score(y_test, y_pred_lr_CV, average='weighted'),
        label))


Td-idf N-grams with Logistic regression

Accuracy : 0.9064 | F-1 Score: 0.8626862606277287 '[Uni-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8607382031905961 '[Bi-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8607382031905961 '[Tri-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8607382031905961 '[Four-gram]'


### Result: Best N-gram for TfidfVectorizer is Uni-gram

## 3. Finding the Best N-Gram for HashingVectorizer

In [3]:
# Build one hashed representation per n-gram size (1 = uni-gram ... 4 = four-gram),
# each limited to 2000 hashed feature columns.

# One HashingVectorizer per n-gram size; (n, n) restricts it to exactly n-word sequences.
hash_vect1, hash_vect2, hash_vect3, hash_vect4 = [
    HashingVectorizer(n_features=2000, ngram_range=(n, n)) for n in (1, 2, 3, 4)
]

# Encode the training texts and convert the result to a dense array.
hash_vect_train1, hash_vect_train2, hash_vect_train3, hash_vect_train4 = [
    vect.fit_transform(X_train).toarray()
    for vect in (hash_vect1, hash_vect2, hash_vect3, hash_vect4)
]

# Every test matrix must come from the SAME vectorizer as its training matrix.
# Pairing them positionally guarantees that the bi-gram test features are built
# by hash_vect2 rather than by the uni-gram vectorizer.
hash_vect_test1, hash_vect_test2, hash_vect_test3, hash_vect_test4 = [
    vect.transform(X_test).toarray()
    for vect in (hash_vect1, hash_vect2, hash_vect3, hash_vect4)
]

In [4]:
# Score every HashingVectorizer n-gram representation with the same classifier.

logreg_CV = LogisticRegression()

print('\nHashVectorizer N-grams with Logistic regression\n')

labels = ['Uni-gram', 'Bi-gram', 'Tri-gram', 'Four-gram']

for hv_train, hv_test, label in zip(
        [hash_vect_train1, hash_vect_train2, hash_vect_train3, hash_vect_train4],
        [hash_vect_test1, hash_vect_test2, hash_vect_test3, hash_vect_test4],
        labels):

    logreg_CV.fit(hv_train, y_train)
    y_pred_lr_CV = logreg_CV.predict(hv_test)

    # Both metrics are reached through the imported `metrics` module: the bare
    # name `f1_score` is never imported, so calling it directly raises NameError
    # on a fresh kernel.
    print("Accuracy : {} | F-1 Score: {} '[{}]'".format(
        metrics.accuracy_score(y_test, y_pred_lr_CV),
        metrics.f1_score(y_test, y_pred_lr_CV, average='weighted'),
        label))


HashVectorizer N-grams with Logistic regression

Accuracy : 0.9056 | F-1 Score: 0.862273837535014 '[Uni-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8607382031905961 '[Bi-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8607382031905961 '[Tri-gram]'
Accuracy : 0.9056 | F-1 Score: 0.8607382031905961 '[Four-gram]'


### Result: Best N-gram for HashingVectorizer is Uni-gram

# CONCLUSION

## The best n-gram for all Vectorizers:

## Uni-gram