## LOADING LIBRARIES AND DATA

In [None]:
import pandas as pd
import numpy as np
import copy
import time
import nltk
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import f1_score, make_scorer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by this notebook.
# 'wordnet' is presumably the corpus behind WordNetLemmatizer; 'punkt' is presumably
# for word_tokenize, which is imported above -- TODO confirm both are still required.
# The value returned by the last call is what the cell displays as its output.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#All three splits are read with the same options: JSON Lines, one record per line.
json_lines_options = dict(orient="records", lines=True)

train = pd.read_json('train_dataset.json', **json_lines_options)
validation = pd.read_json('validation_dataset.json', **json_lines_options)
test = pd.read_json('test_dataset.json', **json_lines_options)

In [None]:
#Preview the first five rows of the training split (five is head's default).
train.head()

Unnamed: 0,text,union_label,union_label_binary,Arbitrary Inference,Black and White Thinking,Catastrophizing,Labeling,Overgeneralization,Personalization,Selective Abstraction
0,"I love school, getting to see friends everyday...",[Not Distorted],0,0,0,0,0,0,0,0
1,Hey guys. So I was diagnosed with Panic disor...,[Selective Abstraction],1,0,0,0,0,0,0,1
2,I'm not sure if this is the right place for th...,[Not Distorted],0,0,0,0,0,0,0,0
3,You are not the boss of me. Had I written thi...,[Not Distorted],0,0,0,0,0,0,0,0
4,This is literally all I think about. I frequen...,[Not Distorted],0,0,0,0,0,0,0,0


## BINARY LOGISTIC REGRESSION WITH GRID SEARCH

In [None]:
#GRID SEARCH CV FOR PARAMETER TUNING
#Pipeline: raw text -> token counts -> TF-IDF weighting -> logistic regression.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression(solver='liblinear', random_state=1)),
])

#Grid keys use the "<step name>__<parameter>" form so they reach the right pipeline step.
parameters = dict(
    logreg__C=[0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10, 30, 50],
    vect__max_features=[5000, 4000, 3000, 2000, 1000, 500, 5],
    vect__min_df=[1, 3, 5, 10],
    #Fractional min_df alternative, left disabled:
    #vect__min_df=[0.01, 0.1, 0.25, 0.5, 0.75, 1.0]
)

#Candidates are ranked by cross-validated weighted F1 on the training split.
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_weighted')

start = time.time()
clf.fit(train['text'], train['union_label_binary'])
end = time.time()

#Elapsed wall-clock seconds, then the winning score and parameter combination.
print(end-start)
print('Best Score: %s' % clf.best_score_)
print('Best Hyperparameters: %s' % clf.best_params_)

#Optional check on the validation split, left disabled:
#print("\n")
#preds = clf.best_estimator_.predict(validation['text'])
#print(classification_report(validation['union_label_binary'],preds))

87.9687922000885
Best Score: 0.675900166929008
Best Hyperparameters: {'logreg__C': 10, 'vect__max_features': 5000, 'vect__min_df': 10}


In [None]:
#PREDICTIONS ON TEST DATA
#Same pipeline as the grid search, rebuilt with the best hyperparameters it reported
#(C=10, max_features=5000, min_df=10).
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(min_df=10, max_features=5000)),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression(C=10, solver='liblinear', random_state=1)),
])

#Time the fit on the training split together with the prediction on the test split.
start = time.time()
preds = pipeline.fit(train['text'], train['union_label_binary']).predict(test['text'])
end = time.time()

print(end-start)
print(classification_report(test['union_label_binary'], preds))

0.09527397155761719
              precision    recall  f1-score   support

           0       0.46      0.40      0.43        15
           1       0.76      0.80      0.78        35

    accuracy                           0.68        50
   macro avg       0.61      0.60      0.60        50
weighted avg       0.67      0.68      0.67        50



## MULTI-LABEL LOGISTIC REGRESSION

In [None]:
#MANUAL HYPERPARAMETER TUNING ON VALIDATION SET
#(Grid search could also be used; here every model is fit on the whole training split
#and scored on the separate validation split instead of being cross-validated.)

results = []

logreg__C = [0.01,0.03,0.05,0.1,0.3,0.5,1,3,5,10,30,50]
vect_max_features = [5000,4000,3000,2000,1000,500,5]
vect__min_df = [1,3,5,10]

#Positional indices of the seven per-distortion label columns.
label_columns = [3,4,5,6,7,8,9]
y_train = train.iloc[:, label_columns]
y_val = validation.iloc[:, label_columns]

start = time.time()

#The TF-IDF features depend only on (max_features, min_df), never on C, so build them
#once per pair instead of refitting the vectorizer for every value of C.
tfidf_features = {}
for max_features in vect_max_features:
  for min_df in vect__min_df:
    vectorizer = TfidfVectorizer(min_df=min_df, max_features=max_features) #strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2'
    X_train_transf = vectorizer.fit_transform(train['text'])
    X_val_transf = vectorizer.transform(validation['text'])
    tfidf_features[(max_features, min_df)] = (X_train_transf, X_val_transf)

#Loop variables are named max_features/min_df rather than max/min so the builtins stay
#usable in later cells. The iteration order (C, then max_features, then min_df) is the
#order in which results are appended.
for C_ in logreg__C:
  for max_features in vect_max_features:
    for min_df in vect__min_df:
      X_train_transf, X_val_transf = tfidf_features[(max_features, min_df)]

      #One independent logistic regression per label column.
      clf = MultiOutputClassifier(LogisticRegression(random_state=1, C=C_, solver='liblinear')).fit(X_train_transf, y_train)
      predictions = clf.predict(X_val_transf)

      F1_score = f1_score(y_val, predictions, average = 'weighted')
      results.append((F1_score, C_, max_features, min_df))
end = time.time()

print(end-start)

61.89990472793579


In [None]:
#Find the best model and its hyperparameters according to the highest F1-score
#Each entry of results is a tuple (F1-score, C, max_features, min_df).

#None marks "nothing selected yet"; it is tested with `is None` so a real result can
#never be mistaken for the sentinel. If results is empty, None is printed.
maxResult = None
for result in results:
  #`>=` (not `>`) means that among equal scores the entry appended last wins.
  if maxResult is None or result[0] >= maxResult[0]:
    maxResult = result
print(maxResult)

(0.20243864793949687, 50, 500, 10)


In [None]:
#PREDICTIONS ON TEST DATA WITH THE BEST PARAMETERS
#Hyperparameters (min_df=10, max_features=500, C=50) are the best combination printed
#by the model-selection cell.
label_names = [
    'Arbitrary Inference',
    'Black and White Thinking',
    'Catastrophizing',
    'Labeling',
    'Overgeneralization',
    'Personalization',
    'Selective Abstraction',
]

#Positional indices of the seven label columns (assumed to be in label_names order).
label_columns = [3,4,5,6,7,8,9]
y_train = train.iloc[:, label_columns]
y_test = test.iloc[:, label_columns]

vectorizer = TfidfVectorizer(min_df=10, max_features = 500) #strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2'
X_train_transf = vectorizer.fit_transform(train['text'])
X_test_transf = vectorizer.transform(test['text'])

#Time only the model fit and the hard-label prediction.
start = time.time()
clf = MultiOutputClassifier(LogisticRegression(random_state=1, C=50, solver='liblinear')).fit(X_train_transf, y_train)
predictions = clf.predict(X_test_transf)
end = time.time()

#ROC AUC measures ranking quality, so it needs continuous scores. Thresholded 0/1
#predictions reduce each label's ROC curve to a single operating point.
#predict_proba returns one (n_samples, n_classes) array per label; column 1 is taken as
#the positive-class probability (assumes every label is coded 0/1 -- verify against the data).
label_probabilities = clf.predict_proba(X_test_transf)
scores = np.column_stack([proba[:, 1] for proba in label_probabilities])

print(end-start)
print('AUC score: {}'.format(roc_auc_score(y_test, scores)))
print('\n')

#Per-label precision/recall/F1 is still computed from the hard predictions.
print(classification_report(y_test, predictions, target_names=label_names))

0.03837275505065918
AUC score: 0.5068271046329558


                          precision    recall  f1-score   support

     Arbitrary Inference       0.22      0.20      0.21        10
Black and White Thinking       0.00      0.00      0.00         3
         Catastrophizing       0.30      0.17      0.21        18
                Labeling       0.50      0.17      0.25         6
      Overgeneralization       0.00      0.00      0.00         4
         Personalization       0.00      0.00      0.00         3
   Selective Abstraction       0.00      0.00      0.00         6

               micro avg       0.27      0.12      0.17        50
               macro avg       0.15      0.08      0.10        50
            weighted avg       0.21      0.12      0.15        50
             samples avg       0.10      0.09      0.09        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## BINARY LOGISTIC REGRESSION WITH PRE-PROCESSING

In [None]:
#Pre-processing Source: https://machinelearningknowledge.ai/11-techniques-of-text-preprocessing-using-nltk-in-python/

#Helpers shared with the validation and test pre-processing cells.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def remove_whitespace(text):
    """Collapse every run of whitespace in `text` into one space and trim both ends."""
    words = text.split()
    return " ".join(words)

def lemmatize_text(text):
    """Split `text` on whitespace and return the list of lemmatized tokens."""
    tokens = w_tokenizer.tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens]

#PREPROCESSING TRAINING SET
#Work on a copy so the raw `train` frame stays available to the earlier experiments.
pre_train = train.copy()

pre_train['text'] = (
    pre_train['text']
    .str.lower()                #Lowercasing text
    .apply(remove_whitespace)   #Removing extra whitespaces
    .apply(lemmatize_text)      #Lemmatize (each row becomes a list of tokens)
    .apply(" ".join)            #join by whitespace
)

In [None]:
#PREPROCESSING VALIDATION SET
#Applies the same steps as the training set. remove_whitespace and lemmatize_text come
#from the training pre-processing cell; remove_whitespace is no longer redefined here,
#so there is a single definition to maintain.
pre_validation = validation.copy()  #keep the raw validation frame untouched

pre_validation['text'] = (
    pre_validation['text']
    .str.lower()                #Lowercasing text
    .apply(remove_whitespace)   #Removing extra whitespaces
    .apply(lemmatize_text)      #Lemmatize (each row becomes a list of tokens)
    .apply(" ".join)            #join by whitespace
)

In [None]:
#PREPROCESSING TEST SET
#Applies the same steps as the training set. remove_whitespace and lemmatize_text come
#from the training pre-processing cell; remove_whitespace is no longer redefined here,
#so there is a single definition to maintain.
pre_test = test.copy()  #keep the raw test frame untouched

pre_test['text'] = (
    pre_test['text']
    .str.lower()                #Lowercasing text
    .apply(remove_whitespace)   #Removing extra whitespaces
    .apply(lemmatize_text)      #Lemmatize (each row becomes a list of tokens)
    .apply(" ".join)            #join by whitespace
)

In [None]:
#GRID SEARCH CV FOR PARAMETER TUNING
#Same pipeline and grid as the unprocessed experiment, fit on the pre-processed text.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression(solver='liblinear', random_state=1)),
])

#Grid keys use the "<step name>__<parameter>" form so they reach the right pipeline step.
parameters = dict(
    logreg__C=[0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10, 30, 50],
    vect__max_features=[5000, 4000, 3000, 2000, 1000, 500, 5],
    vect__min_df=[1, 3, 5, 10],
)

#Candidates are ranked by cross-validated weighted F1 on the pre-processed training split.
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_weighted')

start = time.time()
clf.fit(pre_train['text'], pre_train['union_label_binary'])
end = time.time()

#Elapsed wall-clock seconds, then the winning score and parameter combination.
print(end-start)
print('Best Score: %s' % clf.best_score_)
print('Best Hyperparameters: %s' % clf.best_params_)

#Optional check on the validation split, left disabled:
#print("\n")
#print(classification_report(pre_validation['union_label_binary'],clf.best_estimator_.predict(pre_validation['text'])))

104.17226982116699
Best Score: 0.6759083500550662
Best Hyperparameters: {'logreg__C': 30, 'vect__max_features': 5000, 'vect__min_df': 10}


In [None]:
#PREDICTIONS ON TEST DATA
#Same pipeline as the grid search, rebuilt with the best hyperparameters it reported
#(C=30, max_features=5000, min_df=10).
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(min_df=10, max_features=5000)),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression(C=30, solver='liblinear', random_state=1)),
])

#Time the fit on the pre-processed training split together with the test prediction.
start = time.time()
preds = pipeline.fit(pre_train['text'], pre_train['union_label_binary']).predict(pre_test['text'])
end = time.time()

print(end-start)
print(classification_report(pre_test['union_label_binary'], preds))

0.057353973388671875
              precision    recall  f1-score   support

           0       0.50      0.47      0.48        15
           1       0.78      0.80      0.79        35

    accuracy                           0.70        50
   macro avg       0.64      0.63      0.64        50
weighted avg       0.69      0.70      0.70        50



## MULTI-LABEL LOGISTIC REGRESSION WITH PRE-PROCESSING

In [None]:
#MANUAL HYPERPARAMETER TUNING ON VALIDATION SET
#Every model is fit on the whole pre-processed training split and scored on the
#pre-processed validation split.

results = []

logreg__C = [0.01,0.03,0.05,0.1,0.3,0.5,1,3,5,10,30,50]
vect_max_features = [5000,4000,3000,2000,1000,500,5]
vect__min_df = [1,3,5,10]

#Positional indices of the seven per-distortion label columns.
label_columns = [3,4,5,6,7,8,9]
y_train = pre_train.iloc[:, label_columns]
y_val = pre_validation.iloc[:, label_columns]

start = time.time()

#The TF-IDF features depend only on (max_features, min_df), never on C, so build them
#once per pair instead of refitting the vectorizer for every value of C.
tfidf_features = {}
for max_features in vect_max_features:
  for min_df in vect__min_df:
    vectorizer = TfidfVectorizer(min_df=min_df, max_features=max_features) #strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2'
    X_train_transf = vectorizer.fit_transform(pre_train['text'])
    X_val_transf = vectorizer.transform(pre_validation['text'])
    tfidf_features[(max_features, min_df)] = (X_train_transf, X_val_transf)

#Loop variables are named max_features/min_df rather than max/min so the builtins stay
#usable in later cells. The iteration order (C, then max_features, then min_df) is the
#order in which results are appended.
for C_ in logreg__C:
  for max_features in vect_max_features:
    for min_df in vect__min_df:
      X_train_transf, X_val_transf = tfidf_features[(max_features, min_df)]

      #One independent logistic regression per label column.
      clf = MultiOutputClassifier(LogisticRegression(random_state=1, C=C_, solver='liblinear')).fit(X_train_transf, y_train)
      predictions = clf.predict(X_val_transf)

      F1_score = f1_score(y_val, predictions, average = 'weighted')
      results.append((F1_score, C_, max_features, min_df))
end = time.time()

print(end-start)

36.36245369911194


In [None]:
#Find the best model and its hyperparameters according to the highest F1-score
#Each entry of results is a tuple (F1-score, C, max_features, min_df).

#None marks "nothing selected yet"; it is tested with `is None` so a real result can
#never be mistaken for the sentinel. If results is empty, None is printed.
maxResult = None
for result in results:
  #`>=` (not `>`) means that among equal scores the entry appended last wins.
  if maxResult is None or result[0] >= maxResult[0]:
    maxResult = result
print(maxResult)

(0.2066737090660536, 50, 500, 3)


In [None]:
#PREDICTIONS ON TEST SET
#Hyperparameters (min_df=3, max_features=500, C=50) are the best combination printed
#by the model-selection cell.
label_names = [
    'Arbitrary Inference',
    'Black and White Thinking',
    'Catastrophizing',
    'Labeling',
    'Overgeneralization',
    'Personalization',
    'Selective Abstraction',
]

#Positional indices of the seven label columns (assumed to be in label_names order).
label_columns = [3,4,5,6,7,8,9]
y_train = pre_train.iloc[:, label_columns]
y_test = pre_test.iloc[:, label_columns]

vectorizer = TfidfVectorizer(min_df=3, max_features = 500) #strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2'
X_train_transf = vectorizer.fit_transform(pre_train['text'])
X_test_transf = vectorizer.transform(pre_test['text'])

#Time only the model fit and the hard-label prediction.
start = time.time()
clf = MultiOutputClassifier(LogisticRegression(random_state=1, C=50, solver='liblinear')).fit(X_train_transf, y_train)
predictions = clf.predict(X_test_transf)
end = time.time()

#ROC AUC measures ranking quality, so it needs continuous scores. Thresholded 0/1
#predictions reduce each label's ROC curve to a single operating point.
#predict_proba returns one (n_samples, n_classes) array per label; column 1 is taken as
#the positive-class probability (assumes every label is coded 0/1 -- verify against the data).
label_probabilities = clf.predict_proba(X_test_transf)
scores = np.column_stack([proba[:, 1] for proba in label_probabilities])

print(end-start)
print('AUC score: {}'.format(roc_auc_score(y_test, scores)))
print('\n')

#Per-label precision/recall/F1 is still computed from the hard predictions.
print(classification_report(y_test, predictions, target_names=label_names))

0.06790876388549805
AUC score: 0.5150295700162721


                          precision    recall  f1-score   support

     Arbitrary Inference       0.25      0.20      0.22        10
Black and White Thinking       0.00      0.00      0.00         3
         Catastrophizing       0.42      0.28      0.33        18
                Labeling       0.50      0.17      0.25         6
      Overgeneralization       0.00      0.00      0.00         4
         Personalization       0.00      0.00      0.00         3
   Selective Abstraction       0.00      0.00      0.00         6

               micro avg       0.33      0.16      0.22        50
               macro avg       0.17      0.09      0.12        50
            weighted avg       0.26      0.16      0.19        50
             samples avg       0.16      0.12      0.13        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
