### Function to read data from file

In [1]:
from DeezyMatch.utils import normalizeString
import pandas as pd
import numpy as np

csv_sep = "\t"

def prepare_data_from_file(dataset_path):
    with open(dataset_path, "r", encoding="utf8") as ds_fio:
        df_list = ds_fio.readlines()
        for i in range(len(df_list)):
            tmp_split_row = df_list[i].split(csv_sep)
            df_list[i] = tmp_split_row[:3]
    dataset_pd = pd.DataFrame(df_list, columns=["s1", "s2", "label"])
    dataset_pd["s1"] = dataset_pd["s1"].str.strip()
    dataset_pd["s2"] = dataset_pd["s2"].str.strip()
    dataset_pd["label"] = dataset_pd["label"].str.strip()

    dataset_pd["combine"] = dataset_pd["s1"] + " + " + dataset_pd["s2"]
    dataset_pd["combine"] = dataset_pd["combine"].apply(normalizeString)

    X_train, y_train = np.asarray(dataset_pd["combine"]), np.asarray(dataset_pd["label"])
    for i in range(len(y_train)):
        y_train[i] = 1 if y_train[i] == "TRUE" else 0
    return X_train, y_train, np.asarray(dataset_pd["s1"])


### Build Word2Vec model

In [2]:
def simple_preprocessing(text):
    text = text.replace('"', '')
    text = text.replace('\'', '')
    return text

In [3]:
dataset_path = "./dataset/wikidata.txt"
X_train, y_train,_ = prepare_data_from_file(dataset_path)

X_tokenized = [[simple_preprocessing(w.strip()) for w in combine.split(" ")] for combine in X_train]

In [4]:
import gensim
model = gensim.models.Word2Vec(sentences = X_tokenized, vector_size=120, min_count=1)

### Prepare data for machine learning methods

In [5]:
def generate_vector(combine):
    tokens = combine.split(" ")
    vector = [model.wv[simple_preprocessing(tok.strip())] for tok in tokens]
    return vector

In [6]:
dataset_train_path = "./dataset/wikidata.txt"
dataset_test_path = "./dataset/wikidata-test.txt"

X_train, y_train,_ = prepare_data_from_file(dataset_train_path)
X_test, y_test, original_test = prepare_data_from_file(dataset_test_path)

# Convert y from 1D to 2D
y_train = [[x] for x in y_train]
y_test = [[x] for x in y_test]

# Convert word to vector by using pretrained Word2Vec model
X_train_vec = [generate_vector(x) for x in X_train]
X_test_vec = [generate_vector(x) for x in X_test]

# Convert word vector from 2D to 1D
X_train_vec_1d = [np.mean(x, axis=0) for x in X_train_vec]
X_test_vec_1d = [np.mean(x, axis=0) for x in X_test_vec]

In [11]:
print(len(X_train_vec_1d), len(y_train))
print(len(X_test_vec_1d), len(y_test))

93416 93416
14011 14011


In [12]:
# Calculate metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import timeit

def calculate_metrics(label, predict):
    print('Accuracy score: ', accuracy_score(label, predict))
    print('Precision: ', precision_score(label, predict))
    print('Recall score: ', recall_score(label, predict))
    print('F1 score: ', f1_score(label, predict))

### SVM

In [65]:
from sklearn import svm

# Train SVM model
start_train = timeit.default_timer()
ml_svm = svm.SVC()
ml_svm.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# SVM predict
start_inference = timeit.default_timer()
pred_svm = ml_svm.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_svm)

  return f(*args, **kwargs)


Training time:  10417.2885908
Inference time:  277.2389511000001
Accuracy score:  0.5482146675219157
Precision:  0.5696871190572937
Recall score:  0.39914590747330964
F1 score:  0.46940654557629535


In [72]:
proba = ml_svm.predict_proba(X_test_vec_1d)

print(len(original_test), len(proba))
for i in range(20):
    print(proba[len(proba)-20+i], original_test[len(original_test)-20+i])

14031 14031
[0.52177907 0.47822093] Bill Gates
[0.41589054 0.58410946] Michael Jackson
[0.43081211 0.56918789] Micheal Jackson
[0.41118969 0.58881031] MichaelJackson
[0.41118125 0.58881875] Michael-Jackson
[0.41656256 0.58343744] Michael Joseph Jackson
[0.41162469 0.58837531] Michael Joe Jackson
[0.40433031 0.59566969] Jackson, Michael
[0.40718056 0.59281944] Jackson, Michael Joseph
[0.41217264 0.58782736] 米高·積遜
[0.45403463 0.54596537] Майкл Джексон
[0.49066628 0.50933372] Μάϊκλ Τζάκσον
[0.41135646 0.58864354] マイケルジャクソン
[0.40783592 0.59216408] M. J.
[0.40998943 0.59001057] M. Jackson
[0.41084461 0.58915539] Michael J. Jackson
[0.41998105 0.58001895] M. J. Jackson
[0.41724333 0.58275667] Mr. Jackson
[0.40971263 0.59028737] MJ
[0.43602273 0.56397727] King of Pop


In [76]:
for i in range(20):
    if (proba[len(proba)-20+i][1] >= 0.5):
        print(proba[len(proba)-20+i][1], original_test[len(original_test)-20+i])

0.5841094562892275 Michael Jackson
0.5691878946344464 Micheal Jackson
0.5888103082930961 MichaelJackson
0.5888187466353652 Michael-Jackson
0.5834374350904414 Michael Joseph Jackson
0.588375308622705 Michael Joe Jackson
0.5956696870814001 Jackson, Michael
0.5928194358396892 Jackson, Michael Joseph
0.5878273561124363 米高·積遜
0.5459653714887908 Майкл Джексон
0.5093337240090655 Μάϊκλ Τζάκσον
0.5886435446553923 マイケルジャクソン
0.5921640755095946 M. J.
0.5900105655119416 M. Jackson
0.589155391414248 Michael J. Jackson
0.5800189540805423 M. J. Jackson
0.5827566717223838 Mr. Jackson
0.5902873678620918 MJ
0.5639772736818391 King of Pop


### Random forest

In [67]:
from sklearn.ensemble import RandomForestClassifier

# Train Random forest model
start_train = timeit.default_timer()
model_random_forest = RandomForestClassifier()
model_random_forest.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# SVM predict
start_inference = timeit.default_timer()
pred_randomforest = model_random_forest.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_randomforest)

  model_random_forest.fit(X_train_vec_1d, y_train)


Training time:  159.2949731000008
Inference time:  0.4921281999995699
Accuracy score:  0.9997861877271755
Precision:  1.0
Recall score:  0.9995729537366548
F1 score:  0.9997864312664626


In [69]:
proba = model_random_forest.predict_proba(X_test_vec_1d)

print(len(original_test), len(proba))

for i in range(20):
    print(original_test[len(original_test)-20+i], proba[len(proba)-20+i],)

14031 14031
Bill Gates [0.54 0.46]
Michael Jackson [0.26 0.74]
Micheal Jackson [0.48 0.52]
MichaelJackson [0.17 0.83]
Michael-Jackson [0.16 0.84]
Michael Joseph Jackson [0.2 0.8]
Michael Joe Jackson [0.23 0.77]
Jackson, Michael [0.47 0.53]
Jackson, Michael Joseph [0.37 0.63]
米高·積遜 [0.21 0.79]
Майкл Джексон [0.64 0.36]
Μάϊκλ Τζάκσον [0.36 0.64]
マイケルジャクソン [0.19 0.81]
M. J. [0.37 0.63]
M. Jackson [0.56 0.44]
Michael J. Jackson [0.23 0.77]
M. J. Jackson [0.24 0.76]
Mr. Jackson [0.28 0.72]
MJ [0.21 0.79]
King of Pop [0.12 0.88]


In [58]:
for i in range(20):
    if(proba[len(proba)-20+i][1] >= 0.7):
        print(original_test[len(original_test)-20+i], proba[len(proba)-20+i][1],)

MichaelJackson 0.79
Michael-Jackson 0.77
Michael Joseph Jackson 0.71
Michael Joe Jackson 0.72
米高·積遜 0.74
マイケルジャクソン 0.79
M. J. Jackson 0.78
Mr. Jackson 0.7
MJ 0.77
King of Pop 0.84


### Gradient Boosted Trees

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Train Gradient boosted trees model
start_train = timeit.default_timer()
model_gradient_boost = GradientBoostingClassifier()
model_gradient_boost.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# Gradient boosted trees predict
start_inference = timeit.default_timer()
pred_gradientboostedtrees = model_gradient_boost.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_gradientboostedtrees)    

  return f(*args, **kwargs)


Training time:  439.59463860000005
Inference time:  0.03771039999992354
Accuracy score:  0.5838500463259925
Precision:  0.6360715924736118
Recall score:  0.39459074733096083
F1 score:  0.48704208029517704


### Naive Bayes

In [48]:
from sklearn.naive_bayes import GaussianNB

# Train Naive Bayes model
start_train = timeit.default_timer()
model_naive_bayes = GaussianNB()
model_naive_bayes.fit(X_train_vec_1d, y_train)
stop_train = timeit.default_timer()
print("Training time: ", stop_train - start_train)

# Naive Bayes predict
start_inference = timeit.default_timer()
pred_naive_bayes = model_naive_bayes.predict(X_test_vec_1d)
stop_inference = timeit.default_timer()
print("Inference time: ", stop_inference - start_inference)

# Print metric
calculate_metrics(y_test, pred_naive_bayes)

  return f(*args, **kwargs)


Training time:  0.3025149000000056
Inference time:  0.07010020000006989
Accuracy score:  0.53994725963937
Precision:  0.5633614939973322
Recall score:  0.36071174377224197
F1 score:  0.43981602013364574
