# Importing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import *
rs=123

In [2]:
df = pd.read_csv("Preprocessed_website_class_englishOnly.csv")
df.head()

Unnamed: 0,Category,English_only_website_text
0,15,official site good hotel accommodation big sav...
1,15,hotel book like use vacation work hard year lo...
2,15,hotel book like previously deal predominantly ...
3,15,cheap search compare find cheap flight find co...
4,15,bot create free account create free account si...


# One of the websites had only one word and it is not english so the text now is null

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 2 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Category                   1408 non-null   int64 
 1   English_only_website_text  1407 non-null   object
dtypes: int64(1), object(1)
memory usage: 22.1+ KB


In [4]:
df.dropna(how='any', inplace=True) 

In [5]:
X=df['English_only_website_text']
y=df['Category']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state = rs)

In [7]:
def training_evaluating_pipeline(classifier, n_gram):
    char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=n_gram)
    word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=n_gram)
    tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
    pipeline = Pipeline([('tfidf', tfidf), ('clf', classifier)])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    print(accuracy_score(y_test, predictions))

# Define the models

In [8]:
vote=VotingClassifier(estimators=[('LR', LogisticRegression(random_state=rs)),('sgd', SGDClassifier()), ('LinearSVC', LinearSVC(random_state=rs))], voting='hard')
models=[LinearSVC(random_state=rs),SVC(random_state=rs), MultinomialNB(),LogisticRegression(random_state=rs),
        RandomForestClassifier(random_state=rs), XGBClassifier(random_state=rs), LGBMClassifier(random_state=rs), 
        DecisionTreeClassifier(random_state=rs), KNeighborsClassifier(),SGDClassifier(random_state=rs),vote]
names=['Linear SVM', 'SVM', 'Multi-NB', 'LR', 'RF', 'XGB', 'LGBM', 'DT', 'k-NN','sgd','Voting']

## Unigram TF-IDF

In [8]:
for i in range(len(models)):
    print(names[i], end=':')
    training_evaluating_pipeline(models[i],(1,1))

Linear SVM:0.9403409090909091
SVM:0.8693181818181818
Multi-NB:0.6818181818181818
LR:0.8892045454545454
RF:0.8494318181818182
XGB:0.8267045454545454
LGBM:0.8380681818181818
DT:0.6676136363636364
k-NN:0.8664772727272727
sgd:0.8693181818181818
Voting:0.9176136363636364


## Uni and bigram TF-IDF

In [9]:
for i in range(len(models)):
    print(names[i], end=':')
    training_evaluating_pipeline(models[i], n_gram=(1,2))

Linear SVM:0.9346590909090909
SVM:0.8494318181818182
Multi-NB:0.40625
LR:0.8863636363636364
RF:0.7897727272727273
XGB:0.8579545454545454
LGBM:0.8494318181818182
DT:0.5965909090909091
k-NN:0.8721590909090909
sgd:0.9204545454545454
Voting:0.90625


## Bigram TF-IDF

In [10]:
for i in range(len(models)):
    print(names[i], end=':')
    training_evaluating_pipeline(models[i], n_gram=(2,2))

Linear SVM:0.8636363636363636
SVM:0.7556818181818182
Multi-NB:0.15625
LR:0.8039772727272727
RF:0.6988636363636364
XGB:0.75
LGBM:0.7869318181818182
DT:0.45454545454545453
k-NN:0.7045454545454546
sgd:0.8380681818181818
Voting:0.8295454545454546


## Uni, bi, and trigram TF-IDF

In [11]:
for i in range(len(models)):
    print(names[i], end=':')
    training_evaluating_pipeline(models[i], n_gram=(1,3))

Linear SVM:0.9346590909090909
SVM:0.8494318181818182
Multi-NB:0.4090909090909091
LR:0.875
RF:0.7613636363636364
XGB:0.8267045454545454
LGBM:0.8522727272727273
DT:0.5397727272727273
k-NN:0.8607954545454546
sgd:0.9005681818181818
Voting:0.9147727272727273


## Bi and trigram TF-IDF

In [12]:
for i in range(len(models)):
    print(names[i], end=':')
    training_evaluating_pipeline(models[i], n_gram=(2,3))

Linear SVM:0.9090909090909091
SVM:0.8068181818181818
Multi-NB:0.2755681818181818
LR:0.8494318181818182
RF:0.7386363636363636
XGB:0.8153409090909091
LGBM:0.8323863636363636
DT:0.5454545454545454
k-NN:0.6789772727272727
sgd:0.9176136363636364
Voting:0.8892045454545454


## Trigram TF-IDF

In [13]:
for i in range(len(models)):
    print(names[i], end=':')
    training_evaluating_pipeline(models[i], n_gram=(3,3))

Linear SVM:0.9090909090909091
SVM:0.84375
Multi-NB:0.3096590909090909
LR:0.8721590909090909
RF:0.6818181818181818
XGB:0.8068181818181818
LGBM:0.8409090909090909
DT:0.5454545454545454
k-NN:0.5909090909090909
sgd:0.9005681818181818
Voting:0.90625


#### Based on these experiments I will optimize SVM, LR, Multi-NB, and k-NN with Unigram and Uni-Bigram

# Unigram

## Linear SVM

In [20]:
parameters = {'clf__C': [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30], 'clf__penalty': ['l1', 'l2'] }

char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([('tfidf', tfidf), ('clf', LinearSVC(random_state=rs))])
grid = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1).fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_)

140 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
140 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    so

0.9147259658580413
Pipeline(steps=[('tfidf',
                 FeatureUnion(transformer_list=[('char',
                                                 TfidfVectorizer(analyzer='char')),
                                                ('word', TfidfVectorizer())])),
                ('clf', LinearSVC(random_state=123))])


In [8]:
print('Linear SVM optimal score:')
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([('tfidf', tfidf), ('clf', LinearSVC(random_state=123))])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print('* Confusion matrix: \n', confusion_matrix(y_test, predictions))
print('* Accuracy: %0.2f' %(accuracy_score(y_test, predictions)*100))
print('* F1: %0.2f' %(f1_score(y_test, predictions, average='macro')*100))
print('* Precision: %0.2f' %(precision_score(y_test, predictions, average='macro')*100))
print('* Recall: %0.2f' %(recall_score(y_test, predictions, average='macro')*100))

Linear SVM optimal score:
* Confusion matrix: 
 [[ 4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 25  0  2  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  3 23  0  1  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0 29  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  3  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  1  0  0 29  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0 24  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0 16  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 15  1  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  1  3 15  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0 27  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 33  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 26]]
* Accuracy: 94.03
* F1: 93.56
* Precision: 94.49
* Recall: 93.13


# SGD

In [23]:
parameters = {"clf__penalty": ["l2", "elasticnet","l1", None ], 'clf__max_iter': [50, 80, 100, 500, 1000]}

char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([('tfidf', tfidf), ('clf', SGDClassifier(random_state=rs))])
grid = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1).fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_)

0.8891015274034142
Pipeline(steps=[('tfidf',
                 FeatureUnion(transformer_list=[('char',
                                                 TfidfVectorizer(analyzer='char')),
                                                ('word', TfidfVectorizer())])),
                ('clf',
                 SGDClassifier(max_iter=50, penalty=None, random_state=123))])


In [25]:
print('SGD optimal score:')
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([('tfidf', tfidf), ('clf', SGDClassifier(max_iter=50, penalty=None, random_state=123))])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print('* Confusion matrix: \n', confusion_matrix(y_test, predictions))
print('* Accuracy: %0.2f' %(accuracy_score(y_test, predictions)*100))
print('* F1: %0.2f' %(f1_score(y_test, predictions, average='macro')*100))
print('* Precision: %0.2f' %(precision_score(y_test, predictions, average='macro')*100))
print('* Recall: %0.2f' %(recall_score(y_test, predictions, average='macro')*100))

SGD optimal score:
* Confusion matrix: 
 [[ 4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 24  1  2  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  1 24  0  1  0  0  0  0  0  0  0  1  1  0  0]
 [ 0  0  0 28  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 20  0  0  0  0  0  0  0  0  0  1]
 [ 0  1  0  0  0  0  3  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  1  0 29  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0  0  0 23  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0 16  0  1  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0 15  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  1  3 15  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0 27  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0  0  0  0 32  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0 25]]
* Accuracy: 92.61
* F1: 92.38
* Precision: 93.25
* Recall: 91.93


## LR

In [26]:
parameters = {'clf__penalty':[None, 'l1', 'l2'], 'clf__solver' :['newton-cg', 'lbfgs', 'sag', 'saga'], 'clf__C':[ 10, 1.0, 2, 3, 4, 5, 6, 7, 8, 9]}

char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([('tfidf', tfidf), ('clf', LogisticRegression(random_state=rs))])
grid = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1).fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_)

300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\muner\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line

0.9118957771787961
Pipeline(steps=[('tfidf',
                 FeatureUnion(transformer_list=[('char',
                                                 TfidfVectorizer(analyzer='char')),
                                                ('word', TfidfVectorizer())])),
                ('clf',
                 LogisticRegression(C=10, random_state=123, solver='sag'))])




In [27]:
print('LR optimal score:')
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([('tfidf', tfidf), ('clf', LogisticRegression(C=10, random_state=123, solver='sag'))])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print('* Confusion matrix: \n', confusion_matrix(y_test, predictions))
print('* Accuracy: %0.2f' %(accuracy_score(y_test, predictions)*100))
print('* F1: %0.2f' %(f1_score(y_test, predictions, average='macro')*100))
print('* Precision: %0.2f' %(precision_score(y_test, predictions, average='macro')*100))
print('* Recall: %0.2f' %(recall_score(y_test, predictions, average='macro')*100))

LR optimal score:




* Confusion matrix: 
 [[ 4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 27  0  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3 23  0  1  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0 29  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0 20  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  3  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  1  0  0 29  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0  0  0 23  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0 16  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 16  0  0  0  0]
 [ 0  1  1  0  0  0  0  0  0  0  1  3 14  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0 27  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 33  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0 25]]
* Accuracy: 93.75
* F1: 93.38
* Precision: 94.76
* Recall: 92.86
