## LIBRARIES AND LOADING DATA

In [None]:
import pandas as pd
import numpy as np
import copy
import json
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import nltk
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import time

In [None]:
# Read the train / validation / test splits. Each file is parsed as
# line-delimited JSON records (lines=True), one DataFrame per split.
train, validation, test = (
    pd.read_json(path, orient="records", lines=True)
    for path in ('train_dataset.json', 'validation_dataset.json', 'test_dataset.json')
)

## BINARY CLASSIFICATION

In [None]:
#GRID SEARCH CV FOR PARAMETER TUNING
# Cross-validated grid search for the binary task, scored by weighted F1.
# Note: the search is fitted on the training split only; `validation` is not
# used anywhere in this cell.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(random_state=1)),
])

# Search space, grouped by the pipeline step each entry configures.
vectorizer_space = {
    'vect__max_features': [5000, 4000, 3000, 2000, 1000, 500, 5],
    'vect__min_df': [1, 3, 5, 10],
}
svc_space = {
    'svc__C': [0.1, 1, 10, 100, 1000],
    'svc__gamma': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
    # Only these two kernels are searched ('poly', 'sigmoid', 'precomputed' are left out).
    'svc__kernel': ['linear', 'rbf'],
}
parameters = {**vectorizer_space, **svc_space}

clf = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_weighted')

start = time.time()
clf.fit(train['text'], train['union_label_binary'])
end = time.time()

# Wall-clock seconds spent in the search, then the winning configuration.
elapsed = end - start
print(elapsed)
print('Best Score: %s' % clf.best_score_)
print('Best Hyperparameters: %s' % clf.best_params_)
# Optional check on the held-out validation split (currently disabled):
# classification_report(validation['union_label_binary'], clf.best_estimator_.predict(validation['text']))

1539.0721416473389
Best Score: 0.6700548066263969
Best Hyperparameters: {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'vect__max_features': 500, 'vect__min_df': 5}


In [None]:
#PREDICTIONS ON TEST SET
# Rebuild the binary pipeline with fixed hyper-parameters (the values printed
# by the grid search above), fit it on the training split and evaluate it on
# the test split.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_features=500, min_df=5)),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(C=10, kernel='rbf', gamma=0.1, random_state=1)),
])

start = time.time()
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
preds = pipeline.fit(train['text'], train['union_label_binary']).predict(test['text'])
end = time.time()

# Wall-clock seconds for fit + predict, followed by the per-class metrics.
print(end - start)
print(classification_report(test['union_label_binary'], preds))

0.22949862480163574
              precision    recall  f1-score   support

           0       0.50      0.40      0.44        15
           1       0.76      0.83      0.79        35

    accuracy                           0.70        50
   macro avg       0.63      0.61      0.62        50
weighted avg       0.68      0.70      0.69        50



## BINARY CLASSIFICATION WITH PRE-PROCESSING

In [None]:
from nltk.stem import WordNetLemmatizer
 
# WordNet-based lemmatizer used by the text pre-processing cells below.
lemmatizer = WordNetLemmatizer()
# Download the WordNet corpus via NLTK's downloader. This call is the
# cell's last expression, so its return value is what the notebook displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
#Source: https://machinelearningknowledge.ai/11-techniques-of-text-preprocessing-using-nltk-in-python/

#PREPROCESSING TRAINING SET

# Shared helpers: these names are reused by the validation / test
# pre-processing cells further down.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def remove_whitespace(text):
    """Collapse every run of whitespace in `text` into a single space and trim both ends."""
    words = text.split()
    return " ".join(words)


def lemmatize_text(text):
    """Tokenize `text` on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


# Work on a deep copy so the raw `train` frame is left untouched.
pre_train = copy.deepcopy(train)

# lower-case -> normalise whitespace -> lemmatize -> re-join tokens with single spaces
pre_train['text'] = (
    pre_train['text']
    .str.lower()
    .apply(remove_whitespace)
    .apply(lemmatize_text)
    .apply(lambda tokens: " ".join(tokens))
)

In [None]:
#PREPROCESSING VALIDATION SET
# Applies the same steps as the training split. `remove_whitespace` and
# `lemmatize_text` are the helpers defined in the training-set pre-processing
# cell; they are reused here instead of being redefined, so there is a single
# definition of each in the notebook.

# Deep copy so the raw `validation` frame is left untouched.
pre_validation = copy.deepcopy(validation)

# lower-case -> normalise whitespace -> lemmatize -> re-join tokens with single spaces
pre_validation['text'] = (
    pre_validation['text']
    .str.lower()
    .apply(remove_whitespace)
    .apply(lemmatize_text)
    .apply(lambda tokens: " ".join(tokens))
)

In [None]:
#PREPROCESSING TEST SET
# Applies the same steps as the training split. `remove_whitespace` and
# `lemmatize_text` are the helpers defined in the training-set pre-processing
# cell; they are reused here instead of being redefined, so there is a single
# definition of each in the notebook.

# Deep copy so the raw `test` frame is left untouched.
pre_test = copy.deepcopy(test)

# lower-case -> normalise whitespace -> lemmatize -> re-join tokens with single spaces
pre_test['text'] = (
    pre_test['text']
    .str.lower()
    .apply(remove_whitespace)
    .apply(lemmatize_text)
    .apply(lambda tokens: " ".join(tokens))
)

In [None]:
#GRID SEARCH CV FOR PARAMETER TUNING
# Cross-validated grid search for the binary task on the pre-processed text,
# scored by weighted F1. Note: the search is fitted on `pre_train` only;
# `pre_validation` is not used anywhere in this cell.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(random_state=1)),
])

# Search space, grouped by the pipeline step each entry configures.
vectorizer_space = {
    'vect__max_features': [5000, 4000, 3000, 2000, 1000, 500, 5],
    'vect__min_df': [1, 3, 5, 10],
}
svc_space = {
    'svc__C': [0.1, 1, 10, 100, 1000],
    'svc__gamma': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
    # Only these two kernels are searched ('poly', 'sigmoid', 'precomputed' are left out).
    'svc__kernel': ['linear', 'rbf'],
}
parameters = {**vectorizer_space, **svc_space}

clf = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_weighted')

start = time.time()
clf.fit(pre_train['text'], pre_train['union_label_binary'])
end = time.time()

# Wall-clock seconds spent in the search, then the winning configuration.
elapsed = end - start
print(elapsed)
print('Best Score: %s' % clf.best_score_)
print('Best Hyperparameters: %s' % clf.best_params_)
# Optional check on the held-out validation split (currently disabled):
# classification_report(pre_validation['union_label_binary'], clf.best_estimator_.predict(pre_validation['text']))

1525.175509929657
Best Score: 0.6779693469131438
Best Hyperparameters: {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'vect__max_features': 5000, 'vect__min_df': 10}


In [None]:
#PREDICTIONS ON TEST SET
# Rebuild the binary pipeline with fixed hyper-parameters (the values printed
# by the grid search above), fit it on the pre-processed training split and
# evaluate it on the pre-processed test split.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_features=5000, min_df=10)),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(C=10, kernel='rbf', gamma=0.1, random_state=1)),
])

start = time.time()
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
preds = pipeline.fit(pre_train['text'], pre_train['union_label_binary']).predict(pre_test['text'])
end = time.time()

# Wall-clock seconds for fit + predict, followed by the per-class metrics.
print(end - start)
print(classification_report(pre_test['union_label_binary'], preds))

0.21779561042785645
              precision    recall  f1-score   support

           0       0.50      0.40      0.44        15
           1       0.76      0.83      0.79        35

    accuracy                           0.70        50
   macro avg       0.63      0.61      0.62        50
weighted avg       0.68      0.70      0.69        50



## MULTI-LABEL CLASSIFICATION

In [None]:
#HYPERPARAMETER TUNING ON VALIDATION SET
# Exhaustive search over the TF-IDF and SVC settings below. Every
# configuration is fitted on `train` and scored with the weighted F1 on
# `validation`; one (score, max_features, min_df, C, gamma, kernel) tuple per
# configuration is appended to `results`.

results = []

vect__max_features = [5000,4000,3000,2000,1000,500,5]
vect__min_df = [1,3,5,10]
svc__C = [0.1, 1, 10, 100, 1000]
svc__gamma = [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
svc__kernel = ['linear', 'rbf']

# Positional indices of the label columns (the same seven columns that the
# test-set cell reports under `label_names`). They do not depend on any
# hyper-parameter, so the label frames are sliced once up front.
label_columns = [3,4,5,6,7,8,9]
y_train = train.iloc[:, label_columns]
y_val = validation.iloc[:, label_columns]

start = time.time()
for max_ in vect__max_features:
  for min_ in vect__min_df:
    # The TF-IDF features depend only on (max_, min_): fit and transform once
    # per pair instead of once per SVC configuration. The entries appended to
    # `results`, and their order, are unchanged.
    vectorizer = TfidfVectorizer(min_df=min_, max_features=max_)
    vectorizer.fit(train['text'])
    X_train_transf = vectorizer.transform(train['text'])
    X_val_transf = vectorizer.transform(validation['text'])
    for C_ in svc__C:
      for gamma_ in svc__gamma:
        for kernel_ in svc__kernel:
          # One independent SVC per label column.
          clf = MultiOutputClassifier(SVC(random_state=1, C=C_, kernel=kernel_, gamma = gamma_)).fit(X_train_transf, y_train)
          predictions = clf.predict(X_val_transf)
          F1_score = f1_score(y_val, predictions, average = 'weighted')
          results.append((F1_score, max_, min_, C_, gamma_, kernel_))
end = time.time()

# Wall-clock seconds for the whole search.
print(end-start)

1576.9810464382172


In [None]:
#FIND BEST F1-SCORE ALONG WITH BEST PARAMETERS
# Each entry of `results` is a tuple whose first element is the F1 score.
# `max` returns the first maximal element it encounters, so iterating in
# reverse keeps the original tie-breaking rule: among equal scores, the entry
# that was appended last wins. An empty `results` list yields None.
maxResult = max(reversed(results), key=lambda result: result[0]) if results else None
print(maxResult)

(0.26559211539448696, 1000, 10, 10, 0.0001, 'linear')


In [None]:
#MAKE PREDICTIONS ON TEST SET WITH THE BEST PARAMETERS
# Final multi-label model: TF-IDF features + one linear SVC per label, fitted
# on `train` and evaluated on `test`.

# Display names for the seven label columns, in column order.
label_names = [
    'Arbitrary Inference',
    'Black and White Thinking',
    'Catastrophizing',
    'Labeling',
    'Overgeneralization',
    'Personalization',
    'Selective Abstraction',
]
# Positional indices of the label columns in the data frames.
label_columns = [3,4,5,6,7,8,9]
y_train = train.iloc[:, label_columns]
y_test = test.iloc[:, label_columns]

# Learn the vocabulary / IDF weights on the training text, then project the test text.
vectorizer = TfidfVectorizer(max_features=1000, min_df=10)
X_train_transf = vectorizer.fit_transform(train['text'])
X_test_transf = vectorizer.transform(test['text'])

start = time.time()
clf = MultiOutputClassifier(SVC(C=10, kernel='linear')).fit(X_train_transf, y_train)
predictions = clf.predict(X_test_transf)
end = time.time()

# Wall-clock seconds for fit + predict.
print(end - start)
# NOTE(review): the AUC below is computed from hard 0/1 predictions rather
# than continuous decision scores - confirm this is the intended metric.
print('AUC score: {}'.format(roc_auc_score(y_test, predictions)))
print('\n')

print(classification_report(y_test, predictions, target_names=label_names))

0.4835793972015381
AUC score: 0.5663611134471449


                          precision    recall  f1-score   support

     Arbitrary Inference       0.29      0.40      0.33        10
Black and White Thinking       0.00      0.00      0.00         3
         Catastrophizing       0.27      0.17      0.21        18
                Labeling       0.25      0.17      0.20         6
      Overgeneralization       0.33      0.25      0.29         4
         Personalization       0.40      0.67      0.50         3
   Selective Abstraction       0.00      0.00      0.00         6

               micro avg       0.28      0.22      0.25        50
               macro avg       0.22      0.24      0.22        50
            weighted avg       0.24      0.22      0.22        50
             samples avg       0.16      0.15      0.14        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## MULTI-LABEL CLASSIFICATION WITH PRE-PROCESSING

In [None]:
#HYPERPARAMETER TUNING ON VALIDATION SET
# Exhaustive search over the TF-IDF and SVC settings below, on the
# pre-processed text. Every configuration is fitted on `pre_train` and scored
# with the weighted F1 on `pre_validation`; one
# (score, max_features, min_df, C, gamma, kernel) tuple per configuration is
# appended to `results`.

results = []

vect__max_features = [5000,4000,3000,2000,1000,500,5]
vect__min_df = [1,3,5,10]
svc__C = [0.1, 1, 10, 100, 1000]
svc__gamma = [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
svc__kernel = ['linear', 'rbf']

# Positional indices of the label columns (the same seven columns that the
# test-set cell reports under `label_names`). They do not depend on any
# hyper-parameter, so the label frames are sliced once up front.
label_columns = [3,4,5,6,7,8,9]
y_train = pre_train.iloc[:, label_columns]
y_val = pre_validation.iloc[:, label_columns]

start = time.time()
for max_ in vect__max_features:
  for min_ in vect__min_df:
    # The TF-IDF features depend only on (max_, min_): fit and transform once
    # per pair instead of once per SVC configuration. The entries appended to
    # `results`, and their order, are unchanged.
    vectorizer = TfidfVectorizer(min_df=min_, max_features=max_)
    vectorizer.fit(pre_train['text'])
    X_train_transf = vectorizer.transform(pre_train['text'])
    X_val_transf = vectorizer.transform(pre_validation['text'])
    for C_ in svc__C:
      for gamma_ in svc__gamma:
        for kernel_ in svc__kernel:
          # One independent SVC per label column.
          clf = MultiOutputClassifier(SVC(random_state=1, C=C_, kernel=kernel_, gamma = gamma_)).fit(X_train_transf, y_train)
          predictions = clf.predict(X_val_transf)
          F1_score = f1_score(y_val, predictions, average = 'weighted')
          results.append((F1_score, max_, min_, C_, gamma_, kernel_))
end = time.time()

# Wall-clock seconds for the whole search.
print(end-start)

1598.324913740158


In [None]:
#FIND BEST F1-SCORE ALONG WITH BEST PARAMETERS

# Each entry of `results` is a tuple whose first element is the F1 score.
# `max` returns the first maximal element it encounters, so iterating in
# reverse keeps the original tie-breaking rule: among equal scores, the entry
# that was appended last wins. An empty `results` list yields None.
maxResult = max(reversed(results), key=lambda result: result[0]) if results else None
print(maxResult)

(0.2744588744588745, 500, 3, 1000, 0.1, 'rbf')


In [None]:
#MAKE PREDICTIONS ON TEST SET WITH THE BEST PARAMETERS
# Final multi-label model on the pre-processed text: TF-IDF features + one
# RBF SVC per label, fitted on `pre_train` and evaluated on `pre_test`.

# Display names for the seven label columns, in column order.
label_names = [
    'Arbitrary Inference',
    'Black and White Thinking',
    'Catastrophizing',
    'Labeling',
    'Overgeneralization',
    'Personalization',
    'Selective Abstraction',
]
# Positional indices of the label columns in the data frames.
label_columns = [3,4,5,6,7,8,9]
# Labels are taken from `pre_train` (not the raw `train` frame) so features
# and targets come from the same frame, matching the tuning cell above.
y_train = pre_train.iloc[:, label_columns]
y_test = pre_test.iloc[:, label_columns]

# Learn the vocabulary / IDF weights on the training text, then project the test text.
vectorizer = TfidfVectorizer(max_features=500, min_df=3)
X_train_transf = vectorizer.fit_transform(pre_train['text'])
X_test_transf = vectorizer.transform(pre_test['text'])

start = time.time()
clf = MultiOutputClassifier(SVC(C=1000, kernel='rbf', gamma = 0.1)).fit(X_train_transf, y_train)
predictions = clf.predict(X_test_transf)
end = time.time()

# Wall-clock seconds for fit + predict.
print(end - start)
# NOTE(review): the AUC below is computed from hard 0/1 predictions rather
# than continuous decision scores - confirm this is the intended metric.
print('AUC score: {}'.format(roc_auc_score(y_test, predictions)))
print('\n')

print(classification_report(y_test, predictions, target_names=label_names))

0.5074355602264404
AUC score: 0.5381279842831647


                          precision    recall  f1-score   support

     Arbitrary Inference       0.33      0.40      0.36        10
Black and White Thinking       0.00      0.00      0.00         3
         Catastrophizing       0.33      0.28      0.30        18
                Labeling       0.33      0.17      0.22         6
      Overgeneralization       0.00      0.00      0.00         4
         Personalization       0.33      0.33      0.33         3
   Selective Abstraction       0.00      0.00      0.00         6

               micro avg       0.31      0.22      0.26        50
               macro avg       0.19      0.17      0.17        50
            weighted avg       0.25      0.22      0.23        50
             samples avg       0.19      0.16      0.16        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
