# IMPORTING LIBRARIES

In [26]:
import pandas as pd
import numpy as np
import scipy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.linear_model import SGDClassifier

# READ THE PREPROCESSED DATA

In [2]:
# LOAD THE PREPROCESSED QUESTION-PAIR TABLE (PATH IS RELATIVE TO THE WORKING DIRECTORY)
preprocessed_csv = 'preprocessed.csv'
df = pd.read_csv(preprocessed_csv)

# VISUALIZE FIRST ROW TO UNDERSTAND THE DATA FORMAT

In [3]:
# DISPLAY ONLY THE FIRST ROW (POSITIONAL SLICE, SAME ROW AS head(1))
df.iloc[:1]

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_non_ascii,question2_non_ascii
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...


# GET THE LABELS

In [4]:
# THE is_duplicate COLUMN IS THE CLASSIFICATION TARGET
labels = df.loc[:, 'is_duplicate']

# SELECT THE QUESTION 1 AND QUESTION 2 TEXT COLUMNS

In [6]:
# KEEP ONLY THE TWO QUESTION TEXT COLUMNS AS MODEL INPUT
corpus = df[['question1_non_ascii', 'question2_non_ascii']]

# CREATE TRAIN/TEST SETS

In [7]:
# HOLD OUT 30% OF THE QUESTION PAIRS FOR THE FINAL TEST; THE FIXED SEED KEEPS THE SPLIT REPEATABLE
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    labels,
    test_size=0.3,
    random_state=42,
)

# BUILD THE TRAIN CORPUS FROM THE TRAIN SET TO FIT THE BOW (N-GRAM) VOCABULARIES

In [8]:
# STACK BOTH QUESTION COLUMNS OF THE TRAIN SPLIT INTO ONE SERIES (QUESTION 1 ROWS FIRST, THEN QUESTION 2)
question_columns = ('question1_non_ascii', 'question2_non_ascii')
train_corpus = pd.concat([X_train[column] for column in question_columns])

# CREATE BOW (N-GRAM) FEATURES (UNI-, BI-, TRIGRAM) FOR EACH QUESTION

In [9]:
def build_bow_vectorizer(max_ngram):
    """Return an unfitted word-level CountVectorizer covering 1..max_ngram grams.

    Tokenisation is delegated to NLTK's ``word_tokenize``. ``token_pattern`` is
    set to ``None`` because scikit-learn ignores it whenever a custom tokenizer
    is supplied and otherwise warns about the unused parameter.
    """
    return CountVectorizer(
        analyzer='word',
        ngram_range=(1, max_ngram),
        tokenizer=word_tokenize,
        token_pattern=None,
    )


def bow_question_pairs(vectorizer, questions):
    """Encode both question columns with ``vectorizer`` and join them column-wise.

    Parameters
    ----------
    vectorizer : an already fitted CountVectorizer.
    questions : DataFrame holding the 'question1_non_ascii' and
        'question2_non_ascii' text columns.

    Returns
    -------
    Sparse CSR matrix with one row per question pair: the question 1 counts
    followed by the question 2 counts.
    """
    first_bow = vectorizer.transform(questions['question1_non_ascii'].values)
    second_bow = vectorizer.transform(questions['question2_non_ascii'].values)
    # REQUEST CSR EXPLICITLY: hstack DEFAULTS TO COO, WHICH CANNOT BE ROW-INDEXED BY THE CV SPLITTER
    return scipy.sparse.hstack((first_bow, second_bow), format='csr')


# CREATE THE BOW OBJECTS.
# NOTE: ngram_range=(1, n) KEEPS THE SHORTER N-GRAMS TOO, SO "BIGRAM" MEANS UNI+BIGRAMS
# AND "TRIGRAM" MEANS UNI+BI+TRIGRAMS.
vect = build_bow_vectorizer(1)
vect_2 = build_bow_vectorizer(2)
vect_3 = build_bow_vectorizer(3)

# LEARN EACH VOCABULARY FROM THE TRAIN CORPUS ONLY
for bow in (vect, vect_2, vect_3):
    bow.fit(train_corpus)

# UNIGRAM TRAINSET / TESTSET
Unigram_train = bow_question_pairs(vect, X_train)
Unigram_test = bow_question_pairs(vect, X_test)

# BIGRAM TRAINSET / TESTSET
Bigram_train = bow_question_pairs(vect_2, X_train)
Bigram_test = bow_question_pairs(vect_2, X_test)

# TRIGRAM TRAINSET / TESTSET
Trigram_train = bow_question_pairs(vect_3, X_train)
Trigram_test = bow_question_pairs(vect_3, X_test)




# SVM SECTIONS

In [28]:
# DISPLAY NAMES FOR THE TWO CLASSES IN THE CLASSIFICATION REPORTS
target_names = ['Not duplicate', 'duplicate']
# SINGLE-CANDIDATE GRIDS: ONE max_iter VALUE FOR THE SVM, ONE FOR LOGISTIC REGRESSION
parameters = dict(max_iter=[10000])
parameter_LR = dict(max_iter=[20])
# VALIDATION SCHEME: 3 STRATIFIED SHUFFLES, EACH HOLDING OUT 22% OF THE ROWS IT IS GIVEN.
# APPLIED TO THE 70% TRAIN SPLIT THIS IS ~15% OF THE FULL DATASET FOR VALIDATION AND ~55% FOR FITTING.
SSP = StratifiedShuffleSplit(test_size=0.22, n_splits=3, random_state=42)

## SVM FOR UNIGRAM

In [17]:
# CROSS-VALIDATE A LINEAR SVM ON THE UNIGRAM FEATURES, THEN KEEP THE REFIT BEST ESTIMATOR AS clf
unigram_svm_search = GridSearchCV(
    LinearSVC(random_state=42, tol=1e-3, max_iter=10000),
    parameters,
    cv=SSP,
)
unigram_svm_search.fit(Unigram_train, y_train)
clf = unigram_svm_search.best_estimator_
# MEAN VALIDATION SCORE OF THE (ONLY) CANDIDATE IN THE GRID
cv_results = unigram_svm_search.best_score_
print(cv_results)

0.7285897847735304


In [18]:
# EVALUATE THE UNIGRAM SVM ON THE HELD-OUT TEST SPLIT
unigram_svm_pred = clf.predict(Unigram_test)
print(classification_report(y_test, unigram_svm_pred, target_names=target_names))

               precision    recall  f1-score   support

Not duplicate       0.80      0.78      0.79     76609
    duplicate       0.64      0.67      0.65     44678

     accuracy                           0.74    121287
    macro avg       0.72      0.72      0.72    121287
 weighted avg       0.74      0.74      0.74    121287



## SVM FOR BIGRAM

In [19]:
# CROSS-VALIDATE A LINEAR SVM ON THE BIGRAM FEATURES, THEN KEEP THE REFIT BEST ESTIMATOR AS clf_2
bigram_svm_search = GridSearchCV(
    LinearSVC(random_state=42, tol=1e-3, max_iter=10000),
    parameters,
    cv=SSP,
)
bigram_svm_search.fit(Bigram_train, y_train)
clf_2 = bigram_svm_search.best_estimator_
# MEAN VALIDATION SCORE OF THE (ONLY) CANDIDATE IN THE GRID
cv_results = bigram_svm_search.best_score_
print(cv_results)

0.7609594174965199


In [20]:
# EVALUATE THE BIGRAM SVM ON THE HELD-OUT TEST SPLIT
bigram_svm_pred = clf_2.predict(Bigram_test)
print(classification_report(y_test, bigram_svm_pred, target_names=target_names))

               precision    recall  f1-score   support

Not duplicate       0.82      0.81      0.82     76609
    duplicate       0.68      0.70      0.69     44678

     accuracy                           0.77    121287
    macro avg       0.75      0.75      0.75    121287
 weighted avg       0.77      0.77      0.77    121287



## SVM FOR TRIGRAM

In [21]:
# CROSS-VALIDATE A LINEAR SVM ON THE TRIGRAM FEATURES, THEN KEEP THE REFIT BEST ESTIMATOR AS clf_3
trigram_svm_search = GridSearchCV(
    LinearSVC(random_state=42, tol=1e-3, max_iter=10000),
    parameters,
    cv=SSP,
)
trigram_svm_search.fit(Trigram_train, y_train)
clf_3 = trigram_svm_search.best_estimator_
# MEAN VALIDATION SCORE OF THE (ONLY) CANDIDATE IN THE GRID
cv_results = trigram_svm_search.best_score_
print(cv_results)

0.7828032979976443


In [22]:
# EVALUATE THE TRIGRAM SVM ON THE HELD-OUT TEST SPLIT
trigram_svm_pred = clf_3.predict(Trigram_test)
print(classification_report(y_test, trigram_svm_pred, target_names=target_names))

               precision    recall  f1-score   support

Not duplicate       0.83      0.85      0.84     76609
    duplicate       0.72      0.70      0.71     44678

     accuracy                           0.79    121287
    macro avg       0.78      0.77      0.77    121287
 weighted avg       0.79      0.79      0.79    121287



## EXPORT SVM MODELS

In [25]:
# WRITE THE THREE REFIT SVM MODELS TO THE WORKING DIRECTORY.
# NOTE(review): THE FITTED VECTORIZERS (vect, vect_2, vect_3) ARE NOT SAVED ANYWHERE IN THIS NOTEBOOK;
# THESE MODELS CAN ONLY SCORE FEATURES BUILT WITH THE SAME VOCABULARIES - CONSIDER EXPORTING THEM TOO.
joblib.dump(value=clf, filename='SVM_Unigram.joblib')
joblib.dump(value=clf_2, filename='SVM_Bigram.joblib')
# LAST EXPRESSION OF THE CELL, SO THE WRITTEN PATH IS ECHOED AS THE CELL OUTPUT
joblib.dump(value=clf_3, filename='SVM_Trigram.joblib')

['SVM_Trigram.joblib']

# LOGISTIC REGRESSION

## LR FOR UNIGRAM

In [29]:
# LOGISTIC REGRESSION FITTED BY SGD (LOG LOSS, L2 PENALTY) ON THE UNIGRAM FEATURES.
# random_state IS FIXED BECAUSE SGD SHUFFLES THE TRAINING ROWS; WITHOUT A SEED THE
# CV AND TEST SCORES CHANGE ON EVERY RUN (THE SVM CELLS AND THE SPLITTERS ALREADY USE SEED 42).
unigram_lr = SGDClassifier(penalty='l2',
                           loss='log_loss',
                           learning_rate='optimal',
                           max_iter=20,
                           alpha=0.00001,
                           tol=1e-3,
                           n_jobs=10,
                           random_state=42)

# ONE-CANDIDATE GRID: THE SEARCH ONLY PROVIDES THE CROSS-VALIDATED SCORE AND THE REFIT MODEL
unigram_lr_search = GridSearchCV(unigram_lr, parameter_LR, cv=SSP)
unigram_lr_search.fit(Unigram_train, y_train)
cv_results = unigram_lr_search.best_score_
clf_LR = unigram_lr_search.best_estimator_
print(cv_results)



0.7478102580576079




In [30]:
# EVALUATE THE UNIGRAM LOGISTIC REGRESSION ON THE HELD-OUT TEST SPLIT
unigram_lr_pred = clf_LR.predict(Unigram_test)
print(classification_report(y_test, unigram_lr_pred, target_names=target_names))

               precision    recall  f1-score   support

Not duplicate       0.79      0.82      0.80     76609
    duplicate       0.67      0.62      0.64     44678

     accuracy                           0.75    121287
    macro avg       0.73      0.72      0.72    121287
 weighted avg       0.74      0.75      0.74    121287



## LR FOR BIGRAM

In [31]:
# LOGISTIC REGRESSION FITTED BY SGD (LOG LOSS, L2 PENALTY) ON THE BIGRAM FEATURES.
# random_state IS FIXED BECAUSE SGD SHUFFLES THE TRAINING ROWS; WITHOUT A SEED THE
# CV AND TEST SCORES CHANGE ON EVERY RUN (THE SVM CELLS AND THE SPLITTERS ALREADY USE SEED 42).
bigram_lr = SGDClassifier(penalty='l2',
                          loss='log_loss',
                          learning_rate='optimal',
                          max_iter=20,
                          alpha=0.00001,
                          tol=1e-3,
                          n_jobs=10,
                          random_state=42)

# ONE-CANDIDATE GRID: THE SEARCH ONLY PROVIDES THE CROSS-VALIDATED SCORE AND THE REFIT MODEL
bigram_lr_search = GridSearchCV(bigram_lr, parameter_LR, cv=SSP)
bigram_lr_search.fit(Bigram_train, y_train)
cv_results = bigram_lr_search.best_score_
clf_LR_2 = bigram_lr_search.best_estimator_
print(cv_results)

0.7835689045936395


In [32]:
# EVALUATE THE BIGRAM LOGISTIC REGRESSION ON THE HELD-OUT TEST SPLIT
bigram_lr_pred = clf_LR_2.predict(Bigram_test)
print(classification_report(y_test, bigram_lr_pred, target_names=target_names))

               precision    recall  f1-score   support

Not duplicate       0.83      0.84      0.83     76609
    duplicate       0.72      0.71      0.71     44678

     accuracy                           0.79    121287
    macro avg       0.77      0.77      0.77    121287
 weighted avg       0.79      0.79      0.79    121287



## LR FOR TRIGRAM

In [33]:
# LOGISTIC REGRESSION FITTED BY SGD (LOG LOSS, L2 PENALTY) ON THE TRIGRAM FEATURES.
# random_state IS FIXED BECAUSE SGD SHUFFLES THE TRAINING ROWS; WITHOUT A SEED THE
# CV AND TEST SCORES CHANGE ON EVERY RUN (THE SVM CELLS AND THE SPLITTERS ALREADY USE SEED 42).
trigram_lr = SGDClassifier(penalty='l2',
                           loss='log_loss',
                           learning_rate='optimal',
                           max_iter=20,
                           alpha=0.00001,
                           tol=1e-3,
                           n_jobs=10,
                           random_state=42)

# ONE-CANDIDATE GRID: THE SEARCH ONLY PROVIDES THE CROSS-VALIDATED SCORE AND THE REFIT MODEL
trigram_lr_search = GridSearchCV(trigram_lr, parameter_LR, cv=SSP)
trigram_lr_search.fit(Trigram_train, y_train)
cv_results = trigram_lr_search.best_score_
clf_LR_3 = trigram_lr_search.best_estimator_
print(cv_results)

0.7978209658421672


In [34]:
# EVALUATE THE TRIGRAM LOGISTIC REGRESSION ON THE HELD-OUT TEST SPLIT
trigram_lr_pred = clf_LR_3.predict(Trigram_test)
print(classification_report(y_test, trigram_lr_pred, target_names=target_names))

               precision    recall  f1-score   support

Not duplicate       0.82      0.88      0.85     76609
    duplicate       0.77      0.67      0.72     44678

     accuracy                           0.80    121287
    macro avg       0.79      0.78      0.78    121287
 weighted avg       0.80      0.80      0.80    121287



## EXPORT LR MODELS

In [35]:
# WRITE THE THREE REFIT LOGISTIC REGRESSION MODELS TO THE WORKING DIRECTORY.
# NOTE(review): THE FITTED VECTORIZERS (vect, vect_2, vect_3) ARE NOT SAVED ANYWHERE IN THIS NOTEBOOK;
# THESE MODELS CAN ONLY SCORE FEATURES BUILT WITH THE SAME VOCABULARIES - CONSIDER EXPORTING THEM TOO.
joblib.dump(value=clf_LR, filename='LR_Unigram.joblib')
joblib.dump(value=clf_LR_2, filename='LR_Bigram.joblib')
# LAST EXPRESSION OF THE CELL, SO THE WRITTEN PATH IS ECHOED AS THE CELL OUTPUT
joblib.dump(value=clf_LR_3, filename='LR_Trigram.joblib')

['LR_Trigram.joblib']