# Demonstration of data loading and model training with BERT vectors

In [8]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import sklearn.metrics

In [9]:
# Relative location of the CSV files with the original train/test splits.
ORIGINAL_DATA_DIR = os.path.join(os.pardir, "handout", "data")
# Folder containing the BERT feature files (one JSON document per line).
BERT_FEATURE_DIR = "bert_output_data"

## Format training data

`X` will be a matrix with `N` rows for the `N` texts in the training data, and `M` columns for the `M` features generated by BERT.

`y` will be an array of `N` class labels for training.

In [10]:
# Load the training and test tables from the original data directory.
train_df, test_df = (
    pd.read_csv(os.path.join(ORIGINAL_DATA_DIR, csv_name))
    for csv_name in ("lang_id_train.csv", "lang_id_test.csv")
)

In [11]:
# Sanity check: (rows, columns) of the training table.
print(train_df.shape)

(6000, 2)


In [12]:
# Collect one [CLS] vector per line of the training feature file. The vectors
# are later paired by position with the labels in train_df, so every line must
# contribute exactly one vector.
bert_vectors = []
train_feature_path = os.path.join(BERT_FEATURE_DIR, "train.jsonlines")
# Pin the encoding rather than relying on the platform default.
with open(train_feature_path, "rt", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        bert_data = json.loads(line)
        for t in bert_data["features"]:
            # Only extract the [CLS] vector used for classification
            if t["token"] == "[CLS]":
                # Take the first exported layer; presumably this is the final
                # layer of the network -- TODO confirm against the settings
                # used when the features were extracted.
                bert_vectors.append(t["layers"][0]["values"])
                break
        else:
            # Silently skipping the record would leave the vectors out of step
            # with the labels, so fail here where the cause is obvious.
            raise ValueError(
                "No [CLS] token found on line %d of %s"
                % (line_number, train_feature_path)
            )

In [13]:
# Sanity check: number of [CLS] vectors collected from the training feature file.
print(len(bert_vectors))

6000


In [14]:
# Feature matrix: one row per collected [CLS] vector.
X = np.asarray(bert_vectors)
# Target array: the native-language label of each training row.
y = train_df.loc[:, "native_language"].values

## Train logistic regression model

In [15]:
# L2-penalised logistic regression (C=1.0) trained on the [CLS] features.
lr_model = LogisticRegression(C=1.0, penalty="l2")
# fit() stays the final expression so the fitted estimator is shown as cell output.
lr_model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Collect one [CLS] vector per line of the test feature file. The vectors are
# later paired by position with the labels in test_df, so every line must
# contribute exactly one vector.
test_vectors = []
test_feature_path = os.path.join(BERT_FEATURE_DIR, "test.jsonlines")
# Pin the encoding rather than relying on the platform default.
with open(test_feature_path, "rt", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        bert_data = json.loads(line)
        for t in bert_data["features"]:
            # Only extract the [CLS] vector used for classification
            if t["token"] == "[CLS]":
                # Take the first exported layer; presumably this is the final
                # layer of the network -- TODO confirm against the settings
                # used when the features were extracted.
                test_vectors.append(t["layers"][0]["values"])
                break
        else:
            # Silently skipping the record would leave the vectors out of step
            # with the labels, so fail here where the cause is obvious.
            raise ValueError(
                "No [CLS] token found on line %d of %s"
                % (line_number, test_feature_path)
            )

In [17]:
# Sanity check: number of [CLS] vectors collected from the test feature file.
print(len(test_vectors))

2000


In [18]:
# Evaluate the fitted model on the test split.
x_test = np.array(test_vectors)
y_test = test_df["native_language"].values
# Predict once and reuse the labels for the accuracy figure; calling
# lr_model.score(x_test, y_test) would run the same prediction a second time.
test_labels = lr_model.predict(x_test)
accuracy = sklearn.metrics.accuracy_score(y_test, test_labels)
print("Accuracy in percentage: " + str(accuracy*100))

Accuracy in percentage: 46.6


In [19]:
# Attach the model's predictions to the test table.
test_df['predicted_language'] = test_labels
# Flag each row 'Yes' when the prediction matches the true label, 'No' otherwise.
prediction_matches = test_df['native_language'] == test_df['predicted_language']
test_df['result'] = np.where(prediction_matches, 'Yes', 'No')

In [20]:
# One sub-frame per native language, each holding only that language's test rows.
(
    arabic_df,
    cantonese_df,
    japanese_df,
    korean_df,
    mandarin_df,
    polish_df,
    russian_df,
    spanish_df,
    thai_df,
    vietnamese_df,
) = (
    test_df[test_df.native_language == language]
    for language in (
        'Arabic',
        'Cantonese',
        'Japanese',
        'Korean',
        'Mandarin',
        'Polish',
        'Russian',
        'Spanish',
        'Thai',
        'Vietnamese',
    )
)

In [21]:
# print(arabic_df)


In [22]:
# Confusion matrix of true labels against predicted labels on the test split.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=test_df['native_language'],
    y_pred=test_df['predicted_language'],
)

In [23]:
# Heading and matrix on separate lines.
print("Confusion Matrix:", confusion_matrix, sep="\n")

Confusion Matrix:
[[ 99  11   9   7  12  14  10  16   9  13]
 [ 11  68  14  12  46  12  10   4   9  14]
 [  8  13  99  23  11  17  10   5   5   9]
 [  7  17  23  91  13   8  12   9  12   8]
 [ 12  36  19  14  63   9  10  14   6  17]
 [ 11  14   8   3   5 102  29  11   8   9]
 [ 10   8  14   5   6  23 114  14   0   6]
 [ 17   2  11   5  14  17  14 104   5  11]
 [ 12  10   7  15   9   2   1  13 120  11]
 [ 12  29  11  13  19  13  12  11   8  72]]


In [24]:
# Turn counts into per-row frequencies by dividing each row by its own total
# (the number of test rows whose true label is that class) instead of assuming
# a fixed 200 examples per class.
class_totals = confusion_matrix.sum(axis=1, keepdims=True)
# An all-zero row has a total of 0; dividing by 1 there keeps its entries at 0
# rather than producing NaN.
misclassification_rate = confusion_matrix / np.maximum(class_totals, 1)
# Off-diagonal entries are misclassification frequencies; the diagonal holds
# the fraction of each class that was classified correctly.
print("Frequencies of misclassifications between each pair of classes: \n", misclassification_rate)

Frequencies of misclassifications between each pair of classes: 
 [[0.495 0.055 0.045 0.035 0.06  0.07  0.05  0.08  0.045 0.065]
 [0.055 0.34  0.07  0.06  0.23  0.06  0.05  0.02  0.045 0.07 ]
 [0.04  0.065 0.495 0.115 0.055 0.085 0.05  0.025 0.025 0.045]
 [0.035 0.085 0.115 0.455 0.065 0.04  0.06  0.045 0.06  0.04 ]
 [0.06  0.18  0.095 0.07  0.315 0.045 0.05  0.07  0.03  0.085]
 [0.055 0.07  0.04  0.015 0.025 0.51  0.145 0.055 0.04  0.045]
 [0.05  0.04  0.07  0.025 0.03  0.115 0.57  0.07  0.    0.03 ]
 [0.085 0.01  0.055 0.025 0.07  0.085 0.07  0.52  0.025 0.055]
 [0.06  0.05  0.035 0.075 0.045 0.01  0.005 0.065 0.6   0.055]
 [0.06  0.145 0.055 0.065 0.095 0.065 0.06  0.055 0.04  0.36 ]]


In [25]:
# Per-class precision, recall, F-score and support for the test predictions.
precision, recall, fscore, support = sklearn.metrics.precision_recall_fscore_support(
    y_true=test_df['native_language'],
    y_pred=test_df['predicted_language'],
)

In [61]:
# Per-class report. The precision/recall/fscore arrays are ordered by the
# sorted set of labels found in the true and predicted columns, so the class
# names are derived the same way instead of hand-matching array indices to
# language names.
class_names = np.unique(
    np.concatenate([
        test_df['native_language'].values,
        test_df['predicted_language'].values,
    ])
)

for class_index, class_name in enumerate(class_names):
    # Test rows whose true label is this class.
    class_rows = test_df[test_df['native_language'] == class_name]
    if len(class_rows) > 0:
        miss_rate = 1-sklearn.metrics.accuracy_score(class_rows['native_language'], class_rows['predicted_language'])
    else:
        # The label only occurs among the predictions, so there are no true
        # rows to compute a rate from.
        miss_rate = float('nan')

    # Blank line between classes, but not before the first one.
    separator = "" if class_index == 0 else "\n"
    print('%s"%s" Class Metrics:' % (separator, class_name))
    print("\tMissClassification Rate:", miss_rate)
    print("\tPrecision:", precision[class_index])
    print("\tRecall:", recall[class_index])
    print("\tFscore:", fscore[class_index])

"Arabic" Class Metrics:
	MissClassification Rate: 0.505
	Precision: 0.49748743718592964
	Recall: 0.495
	Fscore: 0.4962406015037594

"Cantonese" Class Metrics:
	MissClassification Rate: 0.6599999999999999
	Precision: 0.3269230769230769
	Recall: 0.34
	Fscore: 0.3333333333333333

"Japanese" Class Metrics:
	MissClassification Rate: 0.505
	Precision: 0.4604651162790698
	Recall: 0.495
	Fscore: 0.47710843373493983

"Korean" Class Metrics:
	MissClassification Rate: 0.5449999999999999
	Precision: 0.48404255319148937
	Recall: 0.455
	Fscore: 0.46907216494845355

"Mandarin" Class Metrics:
	MissClassification Rate: 0.685
	Precision: 0.3181818181818182
	Recall: 0.315
	Fscore: 0.3165829145728643

"Polish" Class Metrics:
	MissClassification Rate: 0.49
	Precision: 0.4700460829493088
	Recall: 0.51
	Fscore: 0.48920863309352525

"Russian" Class Metrics:
	MissClassification Rate: 0.43000000000000005
	Precision: 0.5135135135135135
	Recall: 0.57
	Fscore: 0.5402843601895734

"Spanish" Class Metrics:
	MissClas