### Function to read data from file

In [12]:
from DeezyMatch.utils import normalizeString
import pandas as pd
import numpy as np

csv_sep = "\t"

def prepare_data_from_file(dataset_path):
    with open(dataset_path, "r", encoding="utf8") as ds_fio:
        df_list = ds_fio.readlines()
        for i in range(len(df_list)):
            tmp_split_row = df_list[i].split(csv_sep)
            df_list[i] = tmp_split_row[:3]
    dataset_pd = pd.DataFrame(df_list, columns=["s1", "s2", "label"])
    dataset_pd["s1"] = dataset_pd["s1"].str.strip()
    dataset_pd["s2"] = dataset_pd["s2"].str.strip()
    dataset_pd["label"] = dataset_pd["label"].str.strip()

    dataset_pd["combine"] = dataset_pd["s1"] + " + " + dataset_pd["s2"]
    dataset_pd["combine"] = dataset_pd["combine"].apply(normalizeString)

    X_train, y_train = np.asarray(dataset_pd["combine"]), np.asarray(dataset_pd["label"])
    for i in range(len(y_train)):
        y_train[i] = 1 if y_train[i] == "TRUE" else 0
    return X_train, y_train


### Build Word2Vec model

In [13]:
dataset_path = "./dataset/dataset-unfiltered.txt"
X_train, y_train = prepare_data_from_file(dataset_path)

X_tokenized = [[w.strip() for w in combine.split(" ")] for combine in X_train]

In [14]:
import gensim
model = gensim.models.Word2Vec(sentences = X_tokenized, vector_size=120, min_count=1)

### Prepare data for machine learning methods

In [15]:
def generate_vector(combine):
    tokens = combine.split(" ")
    vector = [model.wv[tok.strip()] for tok in tokens]
    return vector

In [16]:
dataset_train_path = "./dataset/finetuned-train.txt"
dataset_test_path = "./dataset/finetuned-test.txt"

X_train, y_train = prepare_data_from_file(dataset_train_path)
X_test, y_test = prepare_data_from_file(dataset_test_path)

# Convert y from 1D to 2D
y_train = [[x] for x in y_train]
y_test = [[x] for x in y_test]

# Convert word to vector by using pretrained Word2Vec model
X_train_vec = [generate_vector(x) for x in X_train]
X_test_vec = [generate_vector(x) for x in X_test]

# Convert word vector from 2D to 1D
X_train_vec_1d = [np.mean(x, axis=0) for x in X_train_vec]
X_test_vec_1d = [np.mean(x, axis=0) for x in X_test_vec]

In [17]:
print(len(X_train_vec_1d), len(y_train))
print(len(X_test_vec_1d), len(y_test))

385023 385023
82504 82504


In [19]:
# Calculate metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import timeit

def calculate_metrics(label, predict):
    print('Accuracy score: ', accuracy_score(label, predict))
    print('Precision: ', precision_score(label, predict))
    print('Recall score: ', recall_score(label, predict))
    print('F1 score: ', f1_score(label, predict))

### SVM

In [20]:
from sklearn import svm

# Train SVM model
start_train = timeit.default_timer()
ml_svm = svm.SVC()
ml_svm.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# SVM predict
start_inference = timeit.default_timer()
pred_svm = ml_svm.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_svm)

  return f(*args, **kwargs)


Training time:  13954.243618999999
Inference time:  2614.9085180999973
Accuracy score:  0.8495466886453991
Precision:  0.8474996987588866
Recall score:  0.85249200038786
F1 score:  0.8499885192212407


### Random forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Train Random forest model
start_train = timeit.default_timer()
model_random_forest = RandomForestClassifier()
model_random_forest.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# SVM predict
start_inference = timeit.default_timer()
pred_randomforest = model_random_forest.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_randomforest)

  model_random_forest.fit(X_train_vec_1d, y_train)


Training time:  866.4750711000015
Inference time:  3.0066845000001194
Accuracy score:  0.8640187142441579
Precision:  0.8880533374680207
Recall score:  0.8330505187627266
F1 score:  0.8596730415639969


### Gradient Boosted Trees

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Train Gradient boosted trees model
start_train = timeit.default_timer()
model_gradient_boost = GradientBoostingClassifier()
model_gradient_boost.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# Gradient boosted trees predict
start_inference = timeit.default_timer()
pred_gradientboostedtrees = model_gradient_boost.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_gradientboostedtrees)    

  return f(*args, **kwargs)


Training time:  1978.349807999999
Inference time:  0.3221551000024192
Accuracy score:  0.7665688936293998
Precision:  0.8057045953684913
Recall score:  0.7025598758848056
F1 score:  0.7506053895860042


### Naive Bayes

In [25]:
from sklearn.naive_bayes import GaussianNB

# Train Naive Bayes model
start_train = timeit.default_timer()
model_naive_bayes = GaussianNB()
model_naive_bayes.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# Naive Bayes predict
start_inference = timeit.default_timer()
pred_naive_bayes = model_naive_bayes.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_naive_bayes)

  return f(*args, **kwargs)


Training time:  1.1154857000001357
Inference time:  0.40788859999884153
Accuracy score:  0.5854382817802773
Precision:  0.6142056317034444
Recall score:  0.4594928730728207
F1 score:  0.5257027165698279
