In [12]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedKFold
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from gensim.models import Word2Vec

# XGBoost

As a second traditional machine learning model, we wanted to see whether an ensemble method would perform even better than an SVM. We chose XGBoost because of its good performance on binary classification tasks.

# Reading the data

In the provided 'Preprocessing' notebook, the data was already preprocessed. While modelling the SVM, we found that keeping uppercase words in the corpus increased the SVM's ability to predict whether someone is extraverted. Therefore, we decided to use this corpus without lowercasing for this machine learning method as well.

In [13]:
# Load the preprocessed essays (casing preserved) with their binary trait
# labels; the frame itself is the cell's displayed output.
DATA_PATH = 'processed_data_english_no_lowercasing.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,Well right woke midday nap Its sort weird ever...,0,1,1,0,1
1,Well stream consciousness essay used thing lik...,0,0,1,0,0
2,open keyboard button push The thing finally wo...,0,1,0,1,1
3,cant believe Its really happening pulse racing...,1,0,1,1,0
4,Well good old stream consciousness assignment ...,1,0,1,0,1
...,...,...,...,...,...,...
2958,motivated day day basis need provide little fa...,1,0,0,1,1
2959,son biggest part life without reckless person ...,1,1,0,0,0
2960,kid grandkids keep motivated everyday inspire ...,1,0,1,1,0
2961,biggest drive earn money retire beach schedule...,0,0,0,0,0


As each text in the dataframe is one long string, the texts are tokenized and collected in a list for further processing.

In [14]:
# One NLTK token list per essay, in the same order as the rows of df.
tokenized_texts = [word_tokenize(essay) for essay in df['TEXT']]

# The untokenised essays as a plain Python list; this is what the splits
# and vectorisers below consume.
corpus = df['TEXT'].tolist()

# Modelling 

As with the SVM, three different feature representations were considered: TF-IDF, Part-of-Speech (PoS) tagging and Bag-of-Words (BoW). In the remainder of the notebook, each representation is used in turn, and they are compared at the end of the notebook.

## Using TF-IDF 

First, train-test splits are made per target variable. Afterwards, X_train and X_test are transformed using TF-IDF

In [15]:
def _split_by_trait(label_column):
    """Return an 80/20 train/test split of the essays against one trait column.

    The seed is fixed and the inputs have the same length for every trait,
    so each call partitions the essays in the same way.
    """
    return train_test_split(corpus, df[label_column].tolist(), test_size=0.2, random_state=42)


# Ext split
X_train_ext, X_test_ext, y_train_ext, y_test_ext = _split_by_trait('cEXT')

# Neu split
X_train_neu, X_test_neu, y_train_neu, y_test_neu = _split_by_trait('cNEU')

# Agr split
X_train_agr, X_test_agr, y_train_agr, y_test_agr = _split_by_trait('cAGR')

# Con split
X_train_con, X_test_con, y_train_con, y_test_con = _split_by_trait('cCON')

# Opn split
X_train_opn, X_test_opn, y_train_opn, y_test_opn = _split_by_trait('cOPN')

In [16]:
# A single vectoriser instance is reused: for each trait it is re-fitted on
# that trait's training texts and then immediately applied to the matching
# test texts, so the test split never influences the fitted vocabulary/IDF.
tfidf = TfidfVectorizer()

# for Ext
X_train_ext_t, X_test_ext_t = tfidf.fit_transform(X_train_ext), tfidf.transform(X_test_ext)

# for Neu
X_train_neu_t, X_test_neu_t = tfidf.fit_transform(X_train_neu), tfidf.transform(X_test_neu)

# for Agr
X_train_agr_t, X_test_agr_t = tfidf.fit_transform(X_train_agr), tfidf.transform(X_test_agr)

# for Con
X_train_con_t, X_test_con_t = tfidf.fit_transform(X_train_con), tfidf.transform(X_test_con)

# for Opn
X_train_opn_t, X_test_opn_t = tfidf.fit_transform(X_train_opn), tfidf.transform(X_test_opn)

### Hyperparameter tuning

To obtain the best XGBoost model, a randomized search is performed over the 'learning rate', 'gamma', 'max depth' and 'min child weight' parameters to find the combination that yields the highest F1 score. This is done per target variable.

In [25]:
# Ext
# Randomised search over four XGBoost hyperparameters on the TF-IDF features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

search.fit(X_train_ext_t,y_train_ext)
opt_params_ext_t = search.best_params_
opt_params_ext_t

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.530) f1: (test=0.637) precision: (test=0.520) recall: (test=0.824) total time=   4.9s
[CV 2/5] END gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.525) f1: (test=0.635) precision: (test=0.517) recall: (test=0.824) total time=   5.2s
[CV 3/5] END gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.551) f1: (test=0.642) precision: (test=0.537) recall: (test=0.799) total time=   4.9s
[CV 4/5] END gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.519) f1: (test=0.617) precision: (test=0.515) recall: (test=0.770) total time=   5.0s
[CV 5/5] END gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.551) f1: (test=0.656) precision: (test=0.534) recall: (test=0.849) total time=   5.1s
[CV 1/5] END gamma=1, learning_rate=0.2, max

[CV 2/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.532) f1: (test=0.638) precision: (test=0.521) recall: (test=0.824) total time=   5.3s
[CV 3/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.538) f1: (test=0.629) precision: (test=0.528) recall: (test=0.778) total time=   4.7s
[CV 4/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.540) f1: (test=0.638) precision: (test=0.529) recall: (test=0.803) total time=   5.4s
[CV 5/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=10; accuracy: (test=0.568) f1: (test=0.666) precision: (test=0.545) recall: (test=0.854) total time=   6.0s


{'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 1}

In [26]:
# Tuned Ext/TF-IDF parameters, hard-coded so the search need not be re-run.
opt_params_ext_t = dict(min_child_weight=1, max_depth=9, learning_rate=0.01, gamma=1)

In [22]:
# Neu
# Randomised search over four XGBoost hyperparameters on the TF-IDF features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

search.fit(X_train_neu_t,y_train_neu)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.483) f1: (test=0.630) precision: (test=0.485) recall: (test=0.901) total time=  12.8s
[CV 2/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.500) f1: (test=0.650) precision: (test=0.494) recall: (test=0.948) total time=  14.3s
[CV 3/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.494) f1: (test=0.656) precision: (test=0.491) recall: (test=0.987) total time=  14.9s
[CV 4/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.494) f1: (test=0.654) precision: (test=0.492) recall: (test=0.974) total time=  15.2s
[CV 5/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.489) f1: (test=0.654) precision: (test=0.490) recall: (test=0.983) total time=  19.2s
[CV 1/5] END gamma=0.5, learn

[CV 2/5] END gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.513) f1: (test=0.616) precision: (test=0.501) recall: (test=0.797) total time=   5.4s
[CV 3/5] END gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.563) f1: (test=0.593) precision: (test=0.545) recall: (test=0.651) total time=   5.6s
[CV 4/5] END gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.542) f1: (test=0.508) precision: (test=0.538) recall: (test=0.481) total time=   5.3s
[CV 5/5] END gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.498) f1: (test=0.556) precision: (test=0.492) recall: (test=0.639) total time=   4.8s


{'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.5}

In [27]:
# Tuned Neu/TF-IDF parameters, hard-coded so the search need not be re-run.
opt_params_neu_t = dict(min_child_weight=10, max_depth=6, learning_rate=0.01, gamma=0.5)

In [23]:
# Agr
# Randomised search over four XGBoost hyperparameters on the TF-IDF features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

search.fit(X_train_agr_t,y_train_agr)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.555) f1: (test=0.600) precision: (test=0.570) recall: (test=0.632) total time=   3.7s
[CV 2/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.540) f1: (test=0.627) precision: (test=0.546) recall: (test=0.735) total time=   3.8s
[CV 3/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.519) f1: (test=0.593) precision: (test=0.534) recall: (test=0.667) total time=   3.6s
[CV 4/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.527) f1: (test=0.571) precision: (test=0.546) recall: (test=0.598) total time=   5.2s
[CV 5/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.536) f1: (test=0.609) precision: (test=0.546) recall: (test=0.687) total time=   5.2s
[CV 1/5] END gamma=0.5, learning_rate=0

[CV 2/5] END gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=5; accuracy: (test=0.525) f1: (test=0.682) precision: (test=0.526) recall: (test=0.968) total time=   6.1s
[CV 3/5] END gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=5; accuracy: (test=0.525) f1: (test=0.689) precision: (test=0.525) recall: (test=1.000) total time=   6.8s
[CV 4/5] END gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=5; accuracy: (test=0.521) f1: (test=0.685) precision: (test=0.523) recall: (test=0.992) total time=   6.3s
[CV 5/5] END gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=5; accuracy: (test=0.530) f1: (test=0.690) precision: (test=0.528) recall: (test=0.996) total time=   6.8s


{'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.5}

In [28]:
# Tuned Agr/TF-IDF parameters, hard-coded so the search need not be re-run.
opt_params_agr_t = dict(min_child_weight=5, max_depth=3, learning_rate=0.01, gamma=0.5)

In [29]:
# Con
# Randomised search over four XGBoost hyperparameters on the TF-IDF features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

search.fit(X_train_con_t,y_train_con)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.506) f1: (test=0.526) precision: (test=0.516) recall: (test=0.537) total time=  15.8s
[CV 2/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.508) f1: (test=0.499) precision: (test=0.520) recall: (test=0.479) total time=  15.3s
[CV 3/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.544) f1: (test=0.544) precision: (test=0.554) recall: (test=0.535) total time=  13.8s
[CV 4/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.546) f1: (test=0.536) precision: (test=0.559) recall: (test=0.515) total time=  14.2s
[CV 5/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.536) f1: (test=0.555) precision: (test=0.542) recall: (test=0.568) total time=  14.4s
[CV 1/5] END gamma=1, learning_rate=0.0

[CV 2/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=1; accuracy: (test=0.511) f1: (test=0.525) precision: (test=0.520) recall: (test=0.529) total time=  17.2s
[CV 3/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=1; accuracy: (test=0.525) f1: (test=0.534) precision: (test=0.533) recall: (test=0.535) total time=  20.3s
[CV 4/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=1; accuracy: (test=0.502) f1: (test=0.530) precision: (test=0.510) recall: (test=0.552) total time=  31.5s
[CV 5/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=1; accuracy: (test=0.565) f1: (test=0.576) precision: (test=0.571) recall: (test=0.581) total time=  22.4s


{'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.5}

In [30]:
# Tuned Con/TF-IDF parameters, hard-coded so the search need not be re-run.
opt_params_con_t = dict(min_child_weight=10, max_depth=6, learning_rate=0.01, gamma=0.5)

In [31]:
# Opn
# Randomised search over four XGBoost hyperparameters on the TF-IDF features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

search.fit(X_train_opn_t,y_train_opn)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=1, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.527) f1: (test=0.636) precision: (test=0.509) recall: (test=0.848) total time= 1.0min
[CV 2/5] END gamma=1, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.525) f1: (test=0.651) precision: (test=0.506) recall: (test=0.913) total time=  55.5s
[CV 3/5] END gamma=1, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.551) f1: (test=0.650) precision: (test=0.522) recall: (test=0.861) total time= 1.2min
[CV 4/5] END gamma=1, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.521) f1: (test=0.645) precision: (test=0.504) recall: (test=0.896) total time= 1.0min
[CV 5/5] END gamma=1, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.534) f1: (test=0.607) precision: (test=0.514) recall: (test=0.743) total time= 1.2min
[CV 1/5] END gamma=0.5, learning_rate=0.1, m

[CV 2/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=5; accuracy: (test=0.521) f1: (test=0.647) precision: (test=0.504) recall: (test=0.904) total time=  39.5s
[CV 3/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=5; accuracy: (test=0.513) f1: (test=0.633) precision: (test=0.499) recall: (test=0.865) total time=  53.5s
[CV 4/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=5; accuracy: (test=0.492) f1: (test=0.630) precision: (test=0.487) recall: (test=0.891) total time=  37.4s
[CV 5/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=5; accuracy: (test=0.508) f1: (test=0.600) precision: (test=0.496) recall: (test=0.761) total time=  43.8s


{'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 1}

In [32]:
# Tuned Opn/TF-IDF parameters, hard-coded so the search need not be re-run.
opt_params_opn_t = dict(min_child_weight=1, max_depth=9, learning_rate=0.01, gamma=1)

### Actual training and testing

With the optimal parameters found, the XGBoost model is trained on the training set and then evaluated on the held-out test set.

In [33]:
def XGB_model(X_train, X_test, y_train, y_test, opt_params):
    """Train an XGBoost classifier with tuned parameters and score it on a test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices handed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Labels belonging to the rows of ``X_train`` / ``X_test``.
    opt_params : dict
        Must provide 'min_child_weight', 'max_depth', 'learning_rate' and
        'gamma'; a missing key raises ``KeyError``.

    Returns
    -------
    list
        ``[f1, accuracy, precision, recall]``. Note that this order differs
        from the order in which the four values are printed.
    """
    # NOTE(review): the recorded runs show recall close to 1 with accuracy
    # around 0.5, i.e. almost everything is predicted positive; it may be
    # worth checking whether the 'binary:hinge' objective is the cause.
    tuned = {key: opt_params[key]
             for key in ('min_child_weight', 'max_depth', 'learning_rate', 'gamma')}
    clf = xgb.XGBClassifier(objective='binary:hinge', **tuned)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1_val = f1_score(y_test, y_pred)

    # Report each metric rounded to two decimals.
    report = (
        ("Accuracy: {:.2f}", acc),
        ("Precision: {:.2f}", prec),
        ("Recall: {:.2f}", rec),
        ("F1 Score: {:.2f}", f1_val),
    )
    for template, value in report:
        print(template.format(value))

    return [f1_val, acc, prec, rec]


In [34]:
# Ext: train on the TF-IDF features with the tuned parameters, score on the test split
metrics_ext_t = XGB_model(
    X_train=X_train_ext_t,
    X_test=X_test_ext_t,
    y_train=y_train_ext,
    y_test=y_test_ext,
    opt_params=opt_params_ext_t,
)

Accuracy: 0.53
Precision: 0.52
Recall: 0.90
F1 Score: 0.66


In [35]:
# Neu: train on the TF-IDF features with the tuned parameters, score on the test split
metrics_neu_t = XGB_model(
    X_train=X_train_neu_t,
    X_test=X_test_neu_t,
    y_train=y_train_neu,
    y_test=y_test_neu,
    opt_params=opt_params_neu_t,
)

Accuracy: 0.53
Precision: 0.54
Recall: 0.98
F1 Score: 0.70


In [36]:
# Agr: train on the TF-IDF features with the tuned parameters, score on the test split
metrics_agr_t = XGB_model(
    X_train=X_train_agr_t,
    X_test=X_test_agr_t,
    y_train=y_train_agr,
    y_test=y_test_agr,
    opt_params=opt_params_agr_t,
)

Accuracy: 0.49
Precision: 0.49
Recall: 1.00
F1 Score: 0.66


In [37]:
# Con: train on the TF-IDF features with the tuned parameters, score on the test split
metrics_con_t = XGB_model(
    X_train=X_train_con_t,
    X_test=X_test_con_t,
    y_train=y_train_con,
    y_test=y_test_con,
    opt_params=opt_params_con_t,
)

Accuracy: 0.53
Precision: 0.53
Recall: 0.97
F1 Score: 0.68


In [38]:
# Opn: train on the TF-IDF features with the tuned parameters, score on the test split
metrics_opn_t = XGB_model(
    X_train=X_train_opn_t,
    X_test=X_test_opn_t,
    y_train=y_train_opn,
    y_test=y_test_opn,
    opt_params=opt_params_opn_t,
)

Accuracy: 0.53
Precision: 0.50
Recall: 0.80
F1 Score: 0.62


## Using Part-of-Speech (PoS) tagging

As the second method, PoS tagging was used. To do so, the input texts are first tagged.

In [39]:
# Fetch the NLTK resource used by nltk.pos_tag below. When the package is
# already installed, nltk.download() just reports it as up to date (see the
# cell output); its return value is displayed as the cell result.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\maxma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [40]:
# Work on a copy so the original frame keeps its columns unchanged.
pos_df = df.copy()

def pos_tagging(text):
    """Return the list of NLTK part-of-speech tags for ``text``.

    The text is tokenised, tagged, and only the tags are kept (the words
    themselves are dropped), in token order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return [pos for _, pos in tagged_tokens]

# One list of tags per essay.
pos_df['pos_tags'] = pos_df['TEXT'].apply(pos_tagging)

In [41]:
# Ext
# Each row of 'pos_tags' is already a list of tags, so the tokenizer simply
# hands the list back and lowercasing is switched off; the result is a
# tag-count matrix over all essays.
vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda tags: tags)
X = vectorizer.fit_transform(pos_df['pos_tags'])
y_ext = pos_df['cEXT']

(X_train_ext_p, X_test_ext_p,
 y_train_ext_p, y_test_ext_p) = train_test_split(X, y_ext, test_size=0.2, random_state=42)



In [42]:
# Neu: same tag-count matrix X, split against the neuroticism labels
y_neu = pos_df['cNEU']

(X_train_neu_p, X_test_neu_p,
 y_train_neu_p, y_test_neu_p) = train_test_split(X, y_neu, test_size=0.2, random_state=42)

In [43]:
# Agr: same tag-count matrix X, split against the agreeableness labels
y_agr = pos_df['cAGR']

(X_train_agr_p, X_test_agr_p,
 y_train_agr_p, y_test_agr_p) = train_test_split(X, y_agr, test_size=0.2, random_state=42)

In [44]:
# Con: same tag-count matrix X, split against the conscientiousness labels
y_con = pos_df['cCON']

(X_train_con_p, X_test_con_p,
 y_train_con_p, y_test_con_p) = train_test_split(X, y_con, test_size=0.2, random_state=42)

In [45]:
# Opn: same tag-count matrix X, split against the openness labels
y_opn = pos_df['cOPN']

(X_train_opn_p, X_test_opn_p,
 y_train_opn_p, y_test_opn_p) = train_test_split(X, y_opn, test_size=0.2, random_state=42)

### Hyperparameter tuning

To obtain the best XGBoost model, a randomized search is performed over the 'learning rate', 'gamma', 'max depth' and 'min child weight' parameters to find the combination that yields the highest F1 score. This is done per target variable.

In [46]:
# Ext
# Randomised search over four XGBoost hyperparameters on the PoS-tag counts.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

# Fit against the labels produced by the same split as X_train_ext_p, rather
# than the TF-IDF split's labels, which only match because both splits share
# a seed and row order.
search.fit(X_train_ext_p, y_train_ext_p)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.498) f1: (test=0.651) precision: (test=0.500) recall: (test=0.933) total time=   0.0s
[CV 2/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.517) f1: (test=0.671) precision: (test=0.510) recall: (test=0.983) total time=   0.0s
[CV 3/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.508) f1: (test=0.668) precision: (test=0.506) recall: (test=0.979) total time=   0.0s
[CV 4/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.506) f1: (test=0.662) precision: (test=0.506) recall: (test=0.958) total time=   0.0s
[CV 5/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.500) f1: (test=0.650) precision: (test=0.502) recall: (test=0.921) total time=   0.0s
[CV 1/5] END gamma=1, learning_rat

[CV 2/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.542) f1: (test=0.669) precision: (test=0.525) recall: (test=0.920) total time=   0.4s
[CV 3/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.530) f1: (test=0.654) precision: (test=0.520) recall: (test=0.883) total time=   0.5s
[CV 4/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.508) f1: (test=0.639) precision: (test=0.507) recall: (test=0.862) total time=   0.4s
[CV 5/5] END gamma=2.0, learning_rate=0.01, max_depth=9, min_child_weight=1; accuracy: (test=0.489) f1: (test=0.619) precision: (test=0.496) recall: (test=0.824) total time=   0.4s


{'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 2.0}

In [51]:
# Tuned Ext/PoS parameters, hard-coded so the search need not be re-run.
opt_params_ext_p = dict(min_child_weight=5, max_depth=3, learning_rate=0.01, gamma=2.0)

In [47]:
# Neu
# Randomised search over four XGBoost hyperparameters on the PoS-tag counts.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

# Fit against the labels produced by the same split as X_train_neu_p, rather
# than the TF-IDF split's labels, which only match because both splits share
# a seed and row order.
search.fit(X_train_neu_p, y_train_neu_p)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.506) f1: (test=0.482) precision: (test=0.495) recall: (test=0.470) total time=   0.0s
[CV 2/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.496) f1: (test=0.477) precision: (test=0.484) recall: (test=0.470) total time=   0.0s
[CV 3/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.487) f1: (test=0.441) precision: (test=0.473) recall: (test=0.414) total time=   0.0s
[CV 4/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.532) f1: (test=0.439) precision: (test=0.534) recall: (test=0.373) total time=   0.0s
[CV 5/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.494) f1: (test=0.467) precision: (test=0.484) recall: (test=0.451) total time=   0.0s
[CV 1/5] END gamma=0.5, learning_rate=0

[CV 2/5] END gamma=2.0, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.485) f1: (test=0.648) precision: (test=0.487) recall: (test=0.970) total time=   0.0s
[CV 3/5] END gamma=2.0, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.487) f1: (test=0.653) precision: (test=0.488) recall: (test=0.987) total time=   0.0s
[CV 4/5] END gamma=2.0, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.492) f1: (test=0.652) precision: (test=0.491) recall: (test=0.970) total time=   0.0s
[CV 5/5] END gamma=2.0, learning_rate=0.01, max_depth=6, min_child_weight=5; accuracy: (test=0.489) f1: (test=0.655) precision: (test=0.490) recall: (test=0.987) total time=   0.0s


{'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.5}

In [52]:
# Tuned Neu/PoS parameters, hard-coded so the search need not be re-run.
opt_params_neu_p = dict(min_child_weight=1, max_depth=3, learning_rate=0.01, gamma=0.5)

In [48]:
# Agr
# Randomised search over four XGBoost hyperparameters on the PoS-tag counts.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

# Fit against the labels produced by the same split as X_train_agr_p, rather
# than the TF-IDF split's labels, which only match because both splits share
# a seed and row order.
search.fit(X_train_agr_p, y_train_agr_p)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.555) f1: (test=0.581) precision: (test=0.577) recall: (test=0.584) total time=   0.1s
[CV 2/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.502) f1: (test=0.514) precision: (test=0.527) recall: (test=0.502) total time=   0.0s
[CV 3/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.496) f1: (test=0.546) precision: (test=0.518) recall: (test=0.578) total time=   0.0s
[CV 4/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.536) f1: (test=0.567) precision: (test=0.556) recall: (test=0.578) total time=   0.0s
[CV 5/5] END gamma=2.0, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.504) f1: (test=0.547) precision: (test=0.526) recall: (test=0.570) total time=   0.1s
[CV 1/5] END gamma=0.5, learning_rate=0

[CV 2/5] END gamma=0.5, learning_rate=0.1, max_depth=9, min_child_weight=10; accuracy: (test=0.519) f1: (test=0.551) precision: (test=0.541) recall: (test=0.562) total time=   0.2s
[CV 3/5] END gamma=0.5, learning_rate=0.1, max_depth=9, min_child_weight=10; accuracy: (test=0.551) f1: (test=0.591) precision: (test=0.566) recall: (test=0.618) total time=   0.2s
[CV 4/5] END gamma=0.5, learning_rate=0.1, max_depth=9, min_child_weight=10; accuracy: (test=0.521) f1: (test=0.538) precision: (test=0.545) recall: (test=0.530) total time=   0.1s
[CV 5/5] END gamma=0.5, learning_rate=0.1, max_depth=9, min_child_weight=10; accuracy: (test=0.561) f1: (test=0.595) precision: (test=0.577) recall: (test=0.614) total time=   0.2s


{'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1}

In [53]:
# Tuned Agr/PoS parameters, hard-coded so the search need not be re-run.
opt_params_agr_p = dict(min_child_weight=5, max_depth=3, learning_rate=0.01, gamma=1)

In [49]:
# Con
# Randomised search over four XGBoost hyperparameters on the PoS-tag counts.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

# Fit against the labels produced by the same split as X_train_con_p, rather
# than the TF-IDF split's labels, which only match because both splits share
# a seed and row order.
search.fit(X_train_con_p, y_train_con_p)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.519) f1: (test=0.537) precision: (test=0.528) recall: (test=0.545) total time=   0.0s
[CV 2/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.515) f1: (test=0.509) precision: (test=0.527) recall: (test=0.492) total time=   0.0s
[CV 3/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.519) f1: (test=0.525) precision: (test=0.527) recall: (test=0.523) total time=   0.0s
[CV 4/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.515) f1: (test=0.547) precision: (test=0.521) recall: (test=0.577) total time=   0.0s
[CV 5/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.530) f1: (test=0.548) precision: (test=0.536) recall: (test=0.560) total time=   0.1s
[CV 1/5] END gamma=1, learning_rate=0.2

[CV 2/5] END gamma=2.0, learning_rate=0.01, max_depth=3, min_child_weight=10; accuracy: (test=0.502) f1: (test=0.663) precision: (test=0.507) recall: (test=0.959) total time=   0.0s
[CV 3/5] END gamma=2.0, learning_rate=0.01, max_depth=3, min_child_weight=10; accuracy: (test=0.504) f1: (test=0.661) precision: (test=0.507) recall: (test=0.950) total time=   0.0s
[CV 4/5] END gamma=2.0, learning_rate=0.01, max_depth=3, min_child_weight=10; accuracy: (test=0.504) f1: (test=0.669) precision: (test=0.506) recall: (test=0.983) total time=   0.1s
[CV 5/5] END gamma=2.0, learning_rate=0.01, max_depth=3, min_child_weight=10; accuracy: (test=0.506) f1: (test=0.669) precision: (test=0.507) recall: (test=0.983) total time=   0.0s


{'min_child_weight': 10, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 1}

In [54]:
# Tuned Con/PoS parameters, hard-coded so the search need not be re-run.
opt_params_con_p = dict(min_child_weight=10, max_depth=9, learning_rate=0.01, gamma=1)

In [50]:
# Opn
# Randomised search over four XGBoost hyperparameters on the PoS-tag counts.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values; 10 combinations are sampled (see "10 candidates" in the log).
params_dict = {'learning_rate': [0.01, 0.1, 0.2],
               'gamma': [0.5, 1, 2.0],
               'max_depth': [3, 6, 9],
               'min_child_weight': [1, 5, 10],
              }

search = RandomizedSearchCV(xgb_model,
         param_distributions=params_dict,
         scoring = ['recall', 'precision', 'accuracy', 'f1'],
         refit = 'f1',        # all four metrics are logged, F1 selects the winner
         cv= 5,
         random_state=42,     # fixed seed so the same candidates are sampled on every run
         verbose=3)

# Fit against the labels produced by the same split as X_train_opn_p, rather
# than the TF-IDF split's labels, which only match because both splits share
# a seed and row order.
search.fit(X_train_opn_p, y_train_opn_p)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.557) f1: (test=0.545) precision: (test=0.545) recall: (test=0.545) total time=   0.1s
[CV 2/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.532) f1: (test=0.515) precision: (test=0.518) recall: (test=0.513) total time=   0.1s
[CV 3/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.542) f1: (test=0.525) precision: (test=0.529) recall: (test=0.522) total time=   0.1s
[CV 4/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.506) f1: (test=0.498) precision: (test=0.492) recall: (test=0.504) total time=   0.2s
[CV 5/5] END gamma=1, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.525) f1: (test=0.518) precision: (test=0.511) recall: (test=0.526) total time=   0.2s
[CV 1/5] END gamma=2.0, learning_rate=0.01, max_d

[CV 3/5] END gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=10; accuracy: (test=0.542) f1: (test=0.551) precision: (test=0.526) recall: (test=0.578) total time=   0.0s
[CV 4/5] END gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=10; accuracy: (test=0.536) f1: (test=0.567) precision: (test=0.518) recall: (test=0.626) total time=   0.0s
[CV 5/5] END gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=10; accuracy: (test=0.589) f1: (test=0.621) precision: (test=0.561) recall: (test=0.696) total time=   0.0s


{'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 2.0}

In [55]:
# Tuned Opn/PoS parameters, hard-coded so the search need not be re-run.
opt_params_opn_p = dict(min_child_weight=5, max_depth=6, learning_rate=0.01, gamma=2.0)

### Actual training and testing

With the optimal parameters found, the XGBoost model is trained on the training set and then evaluated on the held-out test set.

In [56]:
# Ext — PoS features: run XGB_model with the tuned parameters;
# the returned metrics are kept for the comparison table further down.
metrics_ext_p = XGB_model(
    X_train_ext_p, X_test_ext_p,
    y_train_ext, y_test_ext,
    opt_params_ext_p,
)

Accuracy: 0.50
Precision: 0.50
Recall: 0.97
F1 Score: 0.66


In [57]:
# Neu — PoS features: run XGB_model with the tuned parameters;
# the returned metrics are kept for the comparison table further down.
metrics_neu_p = XGB_model(
    X_train_neu_p, X_test_neu_p,
    y_train_neu, y_test_neu,
    opt_params_neu_p,
)

Accuracy: 0.54
Precision: 0.54
Recall: 1.00
F1 Score: 0.70


In [58]:
# Agr — PoS features: run XGB_model with the tuned parameters;
# the returned metrics are kept for the comparison table further down.
metrics_agr_p = XGB_model(
    X_train_agr_p, X_test_agr_p,
    y_train_agr, y_test_agr,
    opt_params_agr_p,
)

Accuracy: 0.49
Precision: 0.49
Recall: 0.97
F1 Score: 0.65


In [59]:
# Con — PoS features: run XGB_model with the tuned parameters;
# the returned metrics are kept for the comparison table further down.
metrics_con_p = XGB_model(
    X_train_con_p, X_test_con_p,
    y_train_con, y_test_con,
    opt_params_con_p,
)

Accuracy: 0.51
Precision: 0.52
Recall: 0.94
F1 Score: 0.67


In [60]:
# Opn — PoS features: run XGB_model with the tuned parameters;
# the returned metrics are kept for the comparison table further down.
metrics_opn_p = XGB_model(
    X_train_opn_p, X_test_opn_p,
    y_train_opn, y_test_opn,
    opt_params_opn_p,
)

Accuracy: 0.50
Precision: 0.48
Recall: 0.97
F1 Score: 0.65


## Using Bag-of-Words (BoW)

First, the texts are converted into a Bag-of-Words representation, and the train-test splits are determined.

In [61]:
# Build the Bag-of-Words (token count) matrix for the whole corpus.
# NOTE(review): the vocabulary is fitted before the train/test split, so it also
# contains words that only occur in the test rows — confirm this is acceptable.
vectorizer = CountVectorizer()
X_b = vectorizer.fit_transform(df.loc[:, 'TEXT'])

In [62]:
def _bow_split(labels):
    """Split the BoW matrix and `labels` 80/20 using the fixed seed shared by all traits."""
    return train_test_split(X_b, labels, test_size=0.2, random_state=42)


# One split per target variable; every call uses the same test size and seed.
# Ext
X_train_ext_b, X_test_ext_b, y_train_ext_b, y_test_ext_b = _bow_split(y_ext)

# Neu
X_train_neu_b, X_test_neu_b, y_train_neu_b, y_test_neu_b = _bow_split(y_neu)

# Agr
X_train_agr_b, X_test_agr_b, y_train_agr_b, y_test_agr_b = _bow_split(y_agr)

# Con
X_train_con_b, X_test_con_b, y_train_con_b, y_test_con_b = _bow_split(y_con)

# Opn
X_train_opn_b, X_test_opn_b, y_train_opn_b, y_test_opn_b = _bow_split(y_opn)

### Hyperparameter tuning

To obtain the best XGBoost model, a randomized search is run over the `learning_rate`, `gamma`, `max_depth` and `min_child_weight` parameters to find the combination that yields the highest F1 score. This is done separately for each target variable.

In [63]:
# Ext — randomized hyperparameter search on the BoW features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values: 3 options for each of 4 parameters (81 combinations).
# RandomizedSearchCV samples 10 of them by default.
params_dict = {
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.5, 1, 2.0],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
}

search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params_dict,
    scoring=['recall', 'precision', 'accuracy', 'f1'],
    refit='f1',        # the best candidate is selected on mean F1
    cv=5,
    random_state=42,   # fixed seed: the same candidates are sampled on every re-run
    verbose=3,
)

# Fit against the labels produced by the BoW split itself, so features and
# targets are aligned by construction rather than via an earlier split.
search.fit(X_train_ext_b, y_train_ext_b)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=2.0, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.555) f1: (test=0.577) precision: (test=0.552) recall: (test=0.605) total time=   2.5s
[CV 2/5] END gamma=2.0, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.527) f1: (test=0.574) precision: (test=0.524) recall: (test=0.634) total time=   2.5s
[CV 3/5] END gamma=2.0, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.551) f1: (test=0.609) precision: (test=0.542) recall: (test=0.695) total time=   2.5s
[CV 4/5] END gamma=2.0, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.572) f1: (test=0.625) precision: (test=0.560) recall: (test=0.707) total time=   3.8s
[CV 5/5] END gamma=2.0, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.559) f1: (test=0.584) precision: (test=0.557) recall: (test=0.615) total time=   5.1s
[CV 1/5] END gamma=0.5, learning_rate=0

[CV 2/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=1; accuracy: (test=0.517) f1: (test=0.508) precision: (test=0.520) recall: (test=0.496) total time=   4.6s
[CV 3/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=1; accuracy: (test=0.544) f1: (test=0.552) precision: (test=0.547) recall: (test=0.556) total time=   4.8s
[CV 4/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=1; accuracy: (test=0.580) f1: (test=0.595) precision: (test=0.579) recall: (test=0.611) total time=   4.2s
[CV 5/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=1; accuracy: (test=0.530) f1: (test=0.536) precision: (test=0.533) recall: (test=0.540) total time=   4.4s


{'min_child_weight': 10, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.5}

In [68]:
# Ext / BoW: best hyperparameters, copied from the randomized-search output above.
opt_params_ext_b = dict(min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5)

In [64]:
# Neu — randomized hyperparameter search on the BoW features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values: 3 options for each of 4 parameters (81 combinations).
# RandomizedSearchCV samples 10 of them by default.
params_dict = {
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.5, 1, 2.0],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
}

search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params_dict,
    scoring=['recall', 'precision', 'accuracy', 'f1'],
    refit='f1',        # the best candidate is selected on mean F1
    cv=5,
    random_state=42,   # fixed seed: the same candidates are sampled on every re-run
    verbose=3,
)

# Fit against the labels produced by the BoW split itself, so features and
# targets are aligned by construction rather than via an earlier split.
search.fit(X_train_neu_b, y_train_neu_b)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=0.5, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.578) f1: (test=0.580) precision: (test=0.566) recall: (test=0.595) total time=   3.4s
[CV 2/5] END gamma=0.5, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.563) f1: (test=0.580) precision: (test=0.548) recall: (test=0.616) total time=   4.7s
[CV 3/5] END gamma=0.5, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.536) f1: (test=0.520) precision: (test=0.527) recall: (test=0.513) total time=   4.3s
[CV 4/5] END gamma=0.5, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.555) f1: (test=0.524) precision: (test=0.552) recall: (test=0.498) total time=   4.4s
[CV 5/5] END gamma=0.5, learning_rate=0.1, max_depth=6, min_child_weight=5; accuracy: (test=0.532) f1: (test=0.522) precision: (test=0.524) recall: (test=0.519) total time=   3.7s
[CV 1/5] END gamma=0.5, learning_rate=0

[CV 2/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.508) f1: (test=0.512) precision: (test=0.498) recall: (test=0.526) total time=   5.6s
[CV 3/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.565) f1: (test=0.554) precision: (test=0.557) recall: (test=0.552) total time=   6.1s
[CV 4/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.536) f1: (test=0.509) precision: (test=0.530) recall: (test=0.489) total time=   5.9s
[CV 5/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=5; accuracy: (test=0.513) f1: (test=0.499) precision: (test=0.504) recall: (test=0.494) total time=   5.6s


{'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 2.0}

In [69]:
# Neu / BoW: best hyperparameters, copied from the randomized-search output above.
opt_params_neu_b = dict(min_child_weight=1, max_depth=3, learning_rate=0.01, gamma=2.0)

In [65]:
# Agr — randomized hyperparameter search on the BoW features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values: 3 options for each of 4 parameters (81 combinations).
# RandomizedSearchCV samples 10 of them by default.
params_dict = {
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.5, 1, 2.0],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
}

search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params_dict,
    scoring=['recall', 'precision', 'accuracy', 'f1'],
    refit='f1',        # the best candidate is selected on mean F1
    cv=5,
    random_state=42,   # fixed seed: the same candidates are sampled on every re-run
    verbose=3,
)

# Fit against the labels produced by the BoW split itself, so features and
# targets are aligned by construction rather than via an earlier split.
search.fit(X_train_agr_b, y_train_agr_b)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.538) f1: (test=0.645) precision: (test=0.542) recall: (test=0.796) total time=   0.9s
[CV 2/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.536) f1: (test=0.632) precision: (test=0.542) recall: (test=0.759) total time=   1.4s
[CV 3/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.521) f1: (test=0.633) precision: (test=0.530) recall: (test=0.787) total time=   1.4s
[CV 4/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.555) f1: (test=0.659) precision: (test=0.551) recall: (test=0.819) total time=   1.5s
[CV 5/5] END gamma=2.0, learning_rate=0.2, max_depth=3, min_child_weight=1; accuracy: (test=0.523) f1: (test=0.620) precision: (test=0.533) recall: (test=0.739) total time=   1.4s
[CV 1/5] END gamma=0.5, learning_rate=0

[CV 2/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.534) f1: (test=0.687) precision: (test=0.531) recall: (test=0.972) total time=   4.9s
[CV 3/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.540) f1: (test=0.687) precision: (test=0.535) recall: (test=0.960) total time=   4.5s
[CV 4/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.536) f1: (test=0.686) precision: (test=0.532) recall: (test=0.964) total time=   3.8s
[CV 5/5] END gamma=0.5, learning_rate=0.01, max_depth=6, min_child_weight=10; accuracy: (test=0.538) f1: (test=0.691) precision: (test=0.533) recall: (test=0.984) total time=   4.7s


{'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.5}

In [70]:
# Agr / BoW: best hyperparameters, copied from the randomized-search output above.
opt_params_agr_b = dict(min_child_weight=10, max_depth=6, learning_rate=0.01, gamma=0.5)

In [66]:
# Con — randomized hyperparameter search on the BoW features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values: 3 options for each of 4 parameters (81 combinations).
# RandomizedSearchCV samples 10 of them by default.
params_dict = {
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.5, 1, 2.0],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
}

search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params_dict,
    scoring=['recall', 'precision', 'accuracy', 'f1'],
    refit='f1',        # the best candidate is selected on mean F1
    cv=5,
    random_state=42,   # fixed seed: the same candidates are sampled on every re-run
    verbose=3,
)

# Fit against the labels produced by the BoW split itself, so features and
# targets are aligned by construction rather than via an earlier split.
search.fit(X_train_con_b, y_train_con_b)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=10; accuracy: (test=0.525) f1: (test=0.534) precision: (test=0.535) recall: (test=0.533) total time=   2.9s
[CV 2/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=10; accuracy: (test=0.498) f1: (test=0.520) precision: (test=0.508) recall: (test=0.533) total time=   3.9s
[CV 3/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=10; accuracy: (test=0.546) f1: (test=0.545) precision: (test=0.556) recall: (test=0.535) total time=   3.6s
[CV 4/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=10; accuracy: (test=0.536) f1: (test=0.555) precision: (test=0.542) recall: (test=0.568) total time=   3.3s
[CV 5/5] END gamma=1, learning_rate=0.2, max_depth=6, min_child_weight=10; accuracy: (test=0.544) f1: (test=0.546) precision: (test=0.553) recall: (test=0.539) total time=   3.6s
[CV 1/5] END gamma=2.0, learning_rate=0.01, 

[CV 2/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.521) f1: (test=0.633) precision: (test=0.520) recall: (test=0.810) total time=   1.1s
[CV 3/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.521) f1: (test=0.648) precision: (test=0.517) recall: (test=0.867) total time=   1.1s
[CV 4/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.544) f1: (test=0.649) precision: (test=0.533) recall: (test=0.830) total time=   1.0s
[CV 5/5] END gamma=2.0, learning_rate=0.1, max_depth=3, min_child_weight=5; accuracy: (test=0.540) f1: (test=0.669) precision: (test=0.528) recall: (test=0.913) total time=   1.1s


{'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 2.0}

In [71]:
# Con / BoW: best hyperparameters, copied from the randomized-search output above.
opt_params_con_b = dict(min_child_weight=5, max_depth=9, learning_rate=0.01, gamma=2.0)

In [67]:
# Opn — randomized hyperparameter search on the BoW features.
xgb_model = xgb.XGBClassifier(objective='binary:hinge')

# Candidate values: 3 options for each of 4 parameters (81 combinations).
# RandomizedSearchCV samples 10 of them by default.
params_dict = {
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.5, 1, 2.0],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
}

search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params_dict,
    scoring=['recall', 'precision', 'accuracy', 'f1'],
    refit='f1',        # the best candidate is selected on mean F1
    cv=5,
    random_state=42,   # fixed seed: the same candidates are sampled on every re-run
    verbose=3,
)

# Fit against the labels produced by the BoW split itself, so features and
# targets are aligned by construction rather than via an earlier split.
search.fit(X_train_opn_b, y_train_opn_b)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=10; accuracy: (test=0.546) f1: (test=0.523) precision: (test=0.536) recall: (test=0.511) total time=   3.6s
[CV 2/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=10; accuracy: (test=0.572) f1: (test=0.563) precision: (test=0.557) recall: (test=0.570) total time=   5.4s
[CV 3/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=10; accuracy: (test=0.538) f1: (test=0.510) precision: (test=0.525) recall: (test=0.496) total time=   5.9s
[CV 4/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=10; accuracy: (test=0.559) f1: (test=0.554) precision: (test=0.544) recall: (test=0.565) total time=   5.1s
[CV 5/5] END gamma=0.5, learning_rate=0.2, max_depth=9, min_child_weight=10; accuracy: (test=0.565) f1: (test=0.556) precision: (test=0.551) recall: (test=0.561) total time=   5.1s
[CV 1/5] END gamma=0.5, learning_r

[CV 2/5] END gamma=1, learning_rate=0.1, max_depth=9, min_child_weight=5; accuracy: (test=0.586) f1: (test=0.561) precision: (test=0.579) recall: (test=0.543) total time=   7.9s
[CV 3/5] END gamma=1, learning_rate=0.1, max_depth=9, min_child_weight=5; accuracy: (test=0.568) f1: (test=0.563) precision: (test=0.552) recall: (test=0.574) total time=   7.0s
[CV 4/5] END gamma=1, learning_rate=0.1, max_depth=9, min_child_weight=5; accuracy: (test=0.565) f1: (test=0.576) precision: (test=0.547) recall: (test=0.609) total time=   7.6s
[CV 5/5] END gamma=1, learning_rate=0.1, max_depth=9, min_child_weight=5; accuracy: (test=0.542) f1: (test=0.519) precision: (test=0.529) recall: (test=0.509) total time=   7.1s


{'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 2.0}

In [72]:
# Opn / BoW: best hyperparameters, copied from the randomized-search output above.
opt_params_opn_b = dict(min_child_weight=5, max_depth=3, learning_rate=0.01, gamma=2.0)

### Training and testing

With the optimal parameters found, the XGBoost model is trained on the training set and then evaluated on the held-out test set.

In [73]:
# Ext — BoW features: run XGB_model with the tuned parameters.
# The labels come from the same train_test_split call that produced X_*_ext_b,
# so rows and targets are guaranteed to match without relying on an earlier split.
metrics_ext_b = XGB_model(
    X_train_ext_b, X_test_ext_b,
    y_train_ext_b, y_test_ext_b,
    opt_params_ext_b,
)

Accuracy: 0.52
Precision: 0.51
Recall: 0.95
F1 Score: 0.67


In [74]:
# Neu — BoW features: run XGB_model with the tuned parameters.
# The labels come from the same train_test_split call that produced X_*_neu_b,
# so rows and targets are guaranteed to match without relying on an earlier split.
metrics_neu_b = XGB_model(
    X_train_neu_b, X_test_neu_b,
    y_train_neu_b, y_test_neu_b,
    opt_params_neu_b,
)

Accuracy: 0.53
Precision: 0.54
Recall: 0.98
F1 Score: 0.69


In [75]:
# Agr — BoW features: run XGB_model with the tuned parameters.
# The labels come from the same train_test_split call that produced X_*_agr_b,
# so rows and targets are guaranteed to match without relying on an earlier split.
metrics_agr_b = XGB_model(
    X_train_agr_b, X_test_agr_b,
    y_train_agr_b, y_test_agr_b,
    opt_params_agr_b,
)

Accuracy: 0.51
Precision: 0.50
Recall: 0.98
F1 Score: 0.66


In [76]:
# Con — BoW features: run XGB_model with the tuned parameters.
# The labels come from the same train_test_split call that produced X_*_con_b,
# so rows and targets are guaranteed to match without relying on an earlier split.
metrics_con_b = XGB_model(
    X_train_con_b, X_test_con_b,
    y_train_con_b, y_test_con_b,
    opt_params_con_b,
)

Accuracy: 0.51
Precision: 0.52
Recall: 0.92
F1 Score: 0.66


In [77]:
# Opn — BoW features: run XGB_model with the tuned parameters.
# The labels come from the same train_test_split call that produced X_*_opn_b,
# so rows and targets are guaranteed to match without relying on an earlier split.
metrics_opn_b = XGB_model(
    X_train_opn_b, X_test_opn_b,
    y_train_opn_b, y_test_opn_b,
    opt_params_opn_b,
)

Accuracy: 0.47
Precision: 0.47
Recall: 1.00
F1 Score: 0.64


# Comparison of the different techniques per target variable

With all three feature representations evaluated, their metrics are collected in one table per target variable so that the methods can be compared directly.

In [78]:
# Ext — one row per feature representation, one column per metric.
# Row labels are passed straight to the constructor instead of being set
# through a transpose / rename / transpose round-trip.
metrics_ext = pd.DataFrame(
    [metrics_ext_t, metrics_ext_p, metrics_ext_b],
    columns=['f1', 'accuracy', 'precision', 'recall'],
    index=['TF-IDF', 'PoS', 'BoW'],
)
metrics_ext

Unnamed: 0,f1,accuracy,precision,recall
TF-IDF,0.655215,0.526138,0.516441,0.895973
PoS,0.662085,0.50253,0.502609,0.969799
BoW,0.668235,0.524452,0.514493,0.95302


In [79]:
# Neu — one row per feature representation, one column per metric.
# Row labels are passed straight to the constructor instead of being set
# through a transpose / rename / transpose round-trip.
metrics_neu = pd.DataFrame(
    [metrics_neu_t, metrics_neu_p, metrics_neu_b],
    columns=['f1', 'accuracy', 'precision', 'recall'],
    index=['TF-IDF', 'PoS', 'BoW'],
)
metrics_neu

Unnamed: 0,f1,accuracy,precision,recall
TF-IDF,0.695364,0.53457,0.537543,0.984375
PoS,0.700986,0.539629,0.539629,1.0
BoW,0.694598,0.532884,0.536627,0.984375


In [80]:
# Agr — one row per feature representation, one column per metric.
# Row labels are passed straight to the constructor instead of being set
# through a transpose / rename / transpose round-trip.
metrics_agr = pd.DataFrame(
    [metrics_agr_t, metrics_agr_p, metrics_agr_b],
    columns=['f1', 'accuracy', 'precision', 'recall'],
    index=['TF-IDF', 'PoS', 'BoW'],
)
metrics_agr

Unnamed: 0,f1,accuracy,precision,recall
TF-IDF,0.658371,0.490725,0.490725,1.0
PoS,0.651323,0.489039,0.489619,0.972509
BoW,0.660487,0.505902,0.498252,0.979381


In [81]:
# Con — one row per feature representation, one column per metric.
# Row labels are passed straight to the constructor instead of being set
# through a transpose / rename / transpose round-trip.
metrics_con = pd.DataFrame(
    [metrics_con_t, metrics_con_p, metrics_con_b],
    columns=['f1', 'accuracy', 'precision', 'recall'],
    index=['TF-IDF', 'PoS', 'BoW'],
)
metrics_con

Unnamed: 0,f1,accuracy,precision,recall
TF-IDF,0.683202,0.526138,0.526042,0.974277
PoS,0.668187,0.509275,0.517668,0.942122
BoW,0.661272,0.505902,0.516245,0.919614


In [82]:
# Opn — one row per feature representation, one column per metric.
# Row labels are passed straight to the constructor instead of being set
# through a transpose / rename / transpose round-trip.
metrics_opn = pd.DataFrame(
    [metrics_opn_t, metrics_opn_p, metrics_opn_b],
    columns=['f1', 'accuracy', 'precision', 'recall'],
    index=['TF-IDF', 'PoS', 'BoW'],
)
metrics_opn

Unnamed: 0,f1,accuracy,precision,recall
TF-IDF,0.616874,0.532884,0.503386,0.796429
PoS,0.645314,0.495784,0.483126,0.971429
BoW,0.639908,0.470489,0.471284,0.996429


# Conclusions and Implications

The table below shows, for each target variable, the method that yields the best F1 score.

| Target variable | Best Method | F1 Score |
| --- | --- | --- |
| Ext | BoW | 0.668235 |
| Neu | PoS | 0.700986 |
| Agr | BoW | 0.660487 |
| Con | TF-IDF | 0.683202 |
| Opn | PoS | 0.645314 |

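Comparing this to the results of the SVM, XGBoost matches or improves the prediction performance for all target variables. Neuroticism and Openness in particular are predicted much better than with the SVM, whereas the other three target variables show F1 scores similar to those of the SVM. As with the SVM, the best feeding technique differs per target variable. Note, however, that the best models combine a recall of 0.95 or higher with an accuracy of only about 0.50–0.54, which means they predict the positive class for almost every sample; the F1 scores should therefore be interpreted with caution.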
