# Import packages and data

In [67]:
#!pip install fasttext
import pandas as pd
import fasttext
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import csv
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

In [68]:
train = pd.read_json('train_dataset.json',  orient="records", lines = True)
validation = pd.read_json('validation_dataset.json',  orient="records", lines = True)
test = pd.read_json('test_dataset.json',  orient="records", lines = True)

In [69]:
X_val = validation['text']

## Binary fasttext

In [70]:
#CONVERT TRAINING DATA INTO SUITABLE FORMAT FOR FASTTEXT
import copy

binary_train = copy.deepcopy(train)
binary_train['union_label_binary'] = binary_train['union_label_binary'].apply(lambda x: '__label__' + 'Not_Distorted' if x == 0 else '__label__' + 'Distorted')
binary_train = binary_train[['text', 'union_label_binary']]
binary_train['file_format'] = binary_train['union_label_binary'] + ' ' + binary_train['text']
binary_train_file = binary_train['file_format']
binary_train_file.to_csv('binary_train_fasttext.txt', sep = '\t', index = False, header = None, quoting=csv.QUOTE_NONE) #last parameter deletes unnecessary quote signs from text

#CONVERT VALIDATION DATA INTO SUITABLE FORMAT FOR FASTTEXT
binary_validation = copy.deepcopy(validation)
binary_validation['union_label_binary'] = binary_validation['union_label_binary'].apply(lambda x: '__label__' + 'Not_Distorted' if x == 0 else '__label__' + 'Distorted')
binary_validation = binary_validation[['text', 'union_label_binary']]
binary_validation['file_format'] = binary_validation['union_label_binary'] + ' ' + binary_validation['text']
binary_validation_file = binary_validation['file_format']
binary_validation_file.to_csv('binary_validation_fasttext.txt', sep = '\t', index = False, header = None, quoting=csv.QUOTE_NONE) #last parameter deletes unnecessary quote signs from text

#CONVERT VALIDATION DATA INTO SUITABLE FORMAT FOR FASTTEXT
binary_test = copy.deepcopy(test)
binary_test['union_label_binary'] = binary_test['union_label_binary'].apply(lambda x: '__label__' + 'Not_Distorted' if x == 0 else '__label__' + 'Distorted')
binary_test = binary_test[['text', 'union_label_binary']]
binary_test['file_format'] = binary_test['union_label_binary'] + ' ' + binary_test['text']
binary_test_file = binary_test['file_format']
binary_test_file.to_csv('binary_test_fasttext.txt', sep = '\t', index = False, header = None, quoting=csv.QUOTE_NONE) #last parameter deletes unnecessary quote signs from text

In [71]:
import statistics as stat
from statistics import mean

In [6]:
import time
#WITH 10-CYCLE FOR LOOP FOR GETTING AVG OF F1-SCORE (bc there's no oppotrunity to set random_seed)

#hyperparameter tuning
learning_rates = [0.1, 0.25, 0.5, 0.75, 1.0]
nr_epochs = [5, 10, 15, 20, 25, 50]

results = []
stds = []

start = time.time()

for learn in learning_rates:
    for epoch_ in nr_epochs:
        F_scores = []
        for i in range (0,10):
            fasttext_model = fasttext.train_supervised(input = 'binary_train_fasttext.txt',  lr = learn, epoch = epoch_)
            y_pred = binary_validation['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
            F1_score = f1_score(binary_validation['union_label_binary'], y_pred, average = 'weighted')
            F_scores.append(F1_score)
        average = mean(F_scores)
        std = stat.stdev(F_scores)
        results.append((average, std, learn, epoch_))
        stds.append((std, learn, epoch_))
        
end = time.time()
print(end - start)

65.34230518341064


In [7]:
maxResult = False
for result in results:
    if (maxResult == False):
        maxResult = result
    else:
        if result[0] >= maxResult[0]:
            maxResult = result
print(maxResult)

(0.6446841971476563, 0.013623878613788894, 0.75, 10)


In [8]:
""""for result in results:
    print(result)"""

'"for result in results:\n    print(result)'

In [8]:
#MODEL TRAINING AND PREDICTION WITH BEST PARAMETERS ON TEST SET USING 10X RUN AVERAGE
F_scores = []

for i in range (0,10):
    fasttext_model = fasttext.train_supervised(input = 'binary_train_fasttext.txt',  lr = 0.75, epoch = 10)
    y_pred = binary_test['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
    F_scores.append(f1_score(binary_test['union_label_binary'], y_pred, average = 'weighted'))
    
print('Max weighted F1_score is ' + str(max(F_scores)))
print('Mean weighted F1_score is ' + str(mean(F_scores)))

Max weighted F1_score is 0.6092105263157895
Mean weighted F1_score is 0.6092105263157895


In [9]:
#MODEL TRAINING AND PREDICTION WITH BEST PARAMETERS ON TEST SET (CAN CHANGE EVERYTIME YOU RUN)
fasttext_model = fasttext.train_supervised(input = 'binary_train_fasttext.txt',  lr = 0.75, epoch = 10)
y_pred = binary_test['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
print(classification_report(binary_test['union_label_binary'], y_pred))
print(confusion_matrix(binary_test['union_label_binary'], y_pred))

                        precision    recall  f1-score   support

    __label__Distorted       0.71      0.83      0.76        35
__label__Not_Distorted       0.33      0.20      0.25        15

              accuracy                           0.64        50
             macro avg       0.52      0.51      0.51        50
          weighted avg       0.60      0.64      0.61        50

[[29  6]
 [12  3]]


## With pre-processing Binary Fasttext

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ligren\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
#source: https://machinelearningknowledge.ai/11-techniques-of-text-preprocessing-using-nltk-in-python/
#PREPROCESSING TRAINING SET
pre_train = copy.deepcopy(train)

#Lowercasing text
pre_train['text'] = pre_train['text'].str.lower()

#Removing extra whitespaces
def remove_whitespace(text):
    return  " ".join(text.split())

pre_train['text'] = pre_train['text'].apply(remove_whitespace)


w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

#Lemmatize
pre_train['text'] = pre_train.text.apply(lemmatize_text)

#join by whitespace
pre_train['text'] = pre_train['text'].apply(lambda x: " ".join(x))



#PREPROCESSING VALIDATION SET
pre_validation = copy.deepcopy(validation)

#Lowercasing text
pre_validation['text'] = pre_validation['text'].str.lower()

#Removing extra whitespaces 
def remove_whitespace(text):
    return  " ".join(text.split())

pre_validation['text'] = pre_validation['text'].apply(remove_whitespace)

#Lemmatize
pre_validation['text'] = pre_validation.text.apply(lemmatize_text)

#join by whitespace
pre_validation['text'] = pre_validation['text'].apply(lambda x: " ".join(x))




#PREPROCESSING TEST SET
pre_test = copy.deepcopy(test)

#Lowercasing text
pre_test['text'] = pre_test['text'].str.lower()

#Removing extra whitespaces
def remove_whitespace(text):
    return  " ".join(text.split())

pre_test['text'] = pre_test['text'].apply(remove_whitespace)

#Lemmatize
pre_test['text'] = pre_test.text.apply(lemmatize_text)

#join by whitespace
pre_test['text'] = pre_test['text'].apply(lambda x: " ".join(x))

In [12]:
#CONVERT TRAINING DATA INTO SUITABLE FORMAT FOR FASTTEXT
pre_train['union_label_binary'] = pre_train['union_label_binary'].apply(lambda x: '__label__' + 'Not_Distorted' if x == 0 else '__label__' + 'Distorted')
pre_train = pre_train[['text', 'union_label_binary']]
pre_train['file_format'] = pre_train['union_label_binary'] + ' ' + pre_train['text']
pre_train_file = pre_train['file_format']
pre_train_file.to_csv('PRE_binary_train_fasttext.txt', sep = '\t', index = False, header = None, quoting=csv.QUOTE_NONE) #last parameter deletes unnecessary quote signs from text



#CONVERT VALIDATION DATA INTO SUITABLE FORMAT FOR FASTTEXT
pre_validation['union_label_binary'] = pre_validation['union_label_binary'].apply(lambda x: '__label__' + 'Not_Distorted' if x == 0 else '__label__' + 'Distorted')
pre_validation = pre_validation[['text', 'union_label_binary']]
pre_validation['file_format'] = pre_validation['union_label_binary'] + ' ' + pre_validation['text']
pre_validation_file = pre_validation['file_format']
pre_validation_file.to_csv('PRE_binary_validation_fasttext.txt', sep = '\t', index = False, header = None, quoting=csv.QUOTE_NONE) #last parameter deletes unnecessary quote signs from text



#CONVERT TEST DATA INTO SUITABLE FORMAT FOR FASTTEXT
pre_test['union_label_binary'] = pre_test['union_label_binary'].apply(lambda x: '__label__' + 'Not_Distorted' if x == 0 else '__label__' + 'Distorted')
pre_test = pre_test[['text', 'union_label_binary']]
pre_test['file_format'] = pre_test['union_label_binary'] + ' ' + pre_test['text']
pre_test_file = pre_test['file_format']
pre_test_file.to_csv('PRE_binary_test_fasttext.txt', sep = '\t', index = False, header = None, quoting=csv.QUOTE_NONE) #last parameter deletes unnecessary quote signs from text

In [13]:
#WITH 10-CYCLE FOR LOOP FOR GETTING AVG OF F1-SCORE (bc there's no oppotrunity to set random_seed)

#hyperparameter tuning
learning_rates = [0.1, 0.25, 0.5, 0.75, 1.0]
nr_epochs = [5, 10, 15, 20, 25, 50]

results = []

start = time.time()

for learn in learning_rates:
    for epoch_ in nr_epochs:
        F_scores = []
        for i in range (0,10):
            fasttext_model = fasttext.train_supervised(input = 'PRE_binary_train_fasttext.txt',  lr = learn, epoch = epoch_)
            y_pred = pre_validation['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
            F1_score = f1_score(pre_validation['union_label_binary'], y_pred, average = 'weighted')
            F_scores.append(F1_score)
        average = mean(F_scores)
        results.append((average, learn, epoch_))
        
end = time.time()
print(end - start)

64.45967102050781


In [14]:
maxResult = False
for result in results:
  if (maxResult == False):
    maxResult = result
  else:
    if result[0] >= maxResult[0]:
      maxResult = result
print(maxResult)

(0.6447640915312902, 0.5, 15)


In [15]:
#MODEL TRAINING AND PREDICTION WITH BEST PARAMETERS ON TEST SET USING 10X RUN AVERAGE
F_scores = []

for i in range (0,10):
    fasttext_model = fasttext.train_supervised(input = 'PRE_binary_train_fasttext.txt',  lr = 0.5, epoch = 15)
    y_pred = pre_test['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
    F_scores.append(f1_score(pre_test['union_label_binary'], y_pred, average = 'weighted'))
    
print('Max weighted F1_score is ' + str(max(F_scores)))
print('Mean weighted F1_score is ' + str(mean(F_scores)))

Max weighted F1_score is 0.6373333333333333
Mean weighted F1_score is 0.6085100156404504


In [16]:
#MODEL TRAINING AND PREDICTION WITH BEST PARAMETERS ON TEST SET (CAN CHANGE EVERYTIME YOU RUN)
fasttext_model = fasttext.train_supervised(input = 'PRE_binary_train_fasttext.txt',  lr = 0.5, epoch = 15)
y_pred = pre_test['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
print(classification_report(pre_test['union_label_binary'], y_pred))
print(confusion_matrix(pre_test['union_label_binary'], y_pred))

                        precision    recall  f1-score   support

    __label__Distorted       0.71      0.86      0.78        35
__label__Not_Distorted       0.38      0.20      0.26        15

              accuracy                           0.66        50
             macro avg       0.54      0.53      0.52        50
          weighted avg       0.61      0.66      0.62        50

[[30  5]
 [12  3]]


## WITH PRETRAINED WORD EMBEDDINGS

In [29]:
#!unzip wiki-news-300d-1M-subword.vec.zip -d wiki-news-300d-1M-subword.vec

### Binary pre-trained vectors

In [61]:
#Hyperparameter tuning 
#takes approx 17 minutes
#If I would use 10x for loop for getting 10x avg per each parameter combination, the training will take 3hours.
#So, 10x for loop will not be used here for the sake of time
learning_rates = [0.1, 0.25, 0.5, 0.75, 1.0]
nr_epochs = [5, 10, 15, 20, 25, 50]
#min_lengths = [1,2,3]
results = []

start = time.time()

for learn in learning_rates:
    for epoch_ in nr_epochs:
            fasttext_model = fasttext.train_supervised(input = 'binary_train_fasttext.txt', lr = learn, epoch = epoch_, dim = 300, pretrainedVectors='wiki-news-300d-1M-subword.vec')
            y_pred = binary_validation['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
            F1_score = f1_score(binary_validation['union_label_binary'], y_pred, average = 'weighted')
            F_scores.append(F1_score)
            average = mean(F_scores)
            results.append((average, learn, epoch_))

end = time.time()
print(end-start)

maxResult = False
for result in results:
  if (maxResult == False):
    maxResult = result
  else:
    if result[0] >= maxResult[0]:
      maxResult = result
print(maxResult)

1072.471356868744
(0.5879416823214292, 0.25, 10)


In [65]:
#MODEL TRAINING AND PREDICTION WITH BEST PARAMETERS ON TEST SET USING 10X RUN AVERAGE
#It takes around

F_scores = []

start = time.time()
for i in range (0,10):
    fasttext_model = fasttext.train_supervised(input = 'binary_train_fasttext.txt',  lr = 0.25, epoch = 10, dim = 300, pretrainedVectors='wiki-news-300d-1M-subword.vec')
    y_pred = binary_test['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
    F_scores.append(f1_score(binary_test['union_label_binary'], y_pred, average = 'weighted'))

end = time.time()
print(end-start)

print('Max weighted F1_score is ' + str(max(F_scores)))
print('Mean weighted F1_score is ' + str(mean(F_scores)))

335.96571040153503
Max weighted F1_score is 0.5800415800415799
Mean weighted F1_score is 0.5800415800415799


In [66]:
#with best parameters evaluation (may change with each run)
fasttext_model = fasttext.train_supervised(input = 'binary_train_fasttext.txt',  lr = 0.25, epoch = 10, dim = 300, pretrainedVectors='wiki-news-300d-1M-subword.vec')
y_pred = binary_test['text'].apply(lambda x: fasttext_model.predict(x)[0][0])
print(classification_report(binary_test['union_label_binary'], y_pred))
print(confusion_matrix(binary_test['union_label_binary'], y_pred))

                        precision    recall  f1-score   support

    __label__Distorted       0.69      0.77      0.73        35
__label__Not_Distorted       0.27      0.20      0.23        15

              accuracy                           0.60        50
             macro avg       0.48      0.49      0.48        50
          weighted avg       0.57      0.60      0.58        50

[[27  8]
 [12  3]]


In [None]:
#Multi-label classification was also tried but did not work as expected, so this is not included in this file.
#File can be provided to reviewer of thesis by request.