In [23]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC

# Load the essays table from a path relative to the notebook's working directory.
# Per the preview below it holds the raw essay TEXT, five trait label columns
# (cEXT, cNEU, cAGR, cCON, cOPN; 0/1 in the previewed rows) and precomputed
# token / n-gram / embedding columns.
essays=pd.read_csv("../data/essays_expanded.csv")

# Bare last expression so the notebook renders the first rows as a table.
essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,words,sentences,bigrams,trigrams,average_word_embedding
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"['well', 'right', 'now', 'i', 'just', 'woke', ...","['well, right now i just woke up from a mid-da...","[('well', 'right'), ('right', 'now'), ('now', ...","[('well', 'right', 'now'), ('right', 'now', 'i...",[ 1.46904569e-02 1.52049020e-01 -2.17639774e-...
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"['well', 'here', 'we', 'go', 'with', 'the', 's...","['well, here we go with the stream of consciou...","[('well', 'here'), ('here', 'we'), ('we', 'go'...","[('well', 'here', 'we'), ('here', 'we', 'go'),...",[ 1.93020366e-02 2.00337350e-01 -2.47012377e-...
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"['an', 'open', 'keyboard', 'and', 'buttons', '...","['an open keyboard and buttons to push.', 'the...","[('an', 'open'), ('open', 'keyboard'), ('keybo...","[('an', 'open', 'keyboard'), ('open', 'keyboar...",[ 1.21683925e-02 1.49960428e-01 -2.17856288e-...
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"['i', 'cant', 'believe', 'it', 'its', 'really'...","[""i can't believe it!"", ""it's really happening...","[('i', 'cant'), ('cant', 'believe'), ('believe...","[('i', 'cant', 'believe'), ('cant', 'believe',...",[-1.21900747e-02 1.94802403e-01 -2.04183444e-...
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"['well', 'here', 'i', 'go', 'with', 'the', 'go...","['well, here i go with the good old stream of ...","[('well', 'here'), ('here', 'i'), ('i', 'go'),...","[('well', 'here', 'i'), ('here', 'i', 'go'), (...",[-6.53621508e-03 1.72239631e-01 -2.12745324e-...


In [24]:
def cross_validation(data, personality, n_splits, C, penalty):
    """Mean macro-F1 of a TF-IDF + logistic-regression classifier under K-fold CV.

    Parameters
    ----------
    data
        Table indexed positionally via ``.iloc`` that has a ``"TEXT"`` column
        and a label column named by ``personality`` (the ``essays`` DataFrame
        in this notebook).
    personality : str
        Name of the label column to predict.
    n_splits : int
        Number of folds handed to ``KFold`` (otherwise default settings).
    C
        Forwarded to ``LogisticRegression(C=...)``.
    penalty
        Forwarded to ``LogisticRegression(penalty=...)``; the solver is fixed
        to ``"liblinear"``.

    Returns
    -------
    float
        Arithmetic mean of the per-fold macro-averaged F1 scores.
    """
    tfidf = TfidfVectorizer()
    folds = KFold(n_splits=n_splits)
    fold_scores = []

    for train_idx, test_idx in folds.split(data):
        train_part = data.iloc[train_idx]
        test_part = data.iloc[test_idx]

        # The vectorizer is re-fitted on each training fold, so the held-out
        # fold is only ever transformed with statistics learned from training data.
        train_features = tfidf.fit_transform(train_part["TEXT"])
        test_features = tfidf.transform(test_part["TEXT"])

        model = LR(penalty=penalty, C=C, solver="liblinear")
        model.fit(train_features, train_part[personality])
        predicted = model.predict(test_features)

        fold_scores.append(
            f1_score(y_true=test_part[personality], y_pred=predicted, average="macro")
        )

    return sum(fold_scores) / len(fold_scores)

In [25]:
# Hyper-parameter grid for the logistic-regression search.
C = [0.01, 0.1, 0.5, 1, 5, 10]
penalty = ["l2", "l1"]
params = [C, penalty]

# Every (C, penalty) pair, i.e. the Cartesian product of the two lists,
# with C varying slowest.
permutations = [
    (c_candidate, penalty_candidate)
    for c_candidate in C
    for penalty_candidate in penalty
]

In [26]:
# Grid search for logistic regression: for each (C, penalty) pair, average the
# 5-fold macro-F1 over the five trait labels and print it.
#
# The loop variables deliberately do NOT reuse the names `C` / `penalty`:
# those names hold the grid lists, and rebinding them here would leave scalars
# behind and make later cells depend on execution order.
#
# NOTE: this cell needs the logistic-regression `cross_validation`
# (..., C, penalty). A later cell rebinds that name to an SVM variant with a
# different argument order, so re-running this cell after that one would pass
# these values into the wrong parameters.
for c_value, penalty_name in permutations:
    f1=[]
    for personality in ["cAGR", "cOPN", "cNEU", "cEXT", "cCON"]:
        f1.append(cross_validation(essays, personality, 5, c_value, penalty_name))
    print(f"Average f1 is {sum(f1)/len(f1)} for {c_value} and {penalty_name}.")

    

Average f1 is 0.38201898719366845 for 0.01 and l2.
Average f1 is 0.32681541149033116 for 0.01 and l1.
Average f1 is 0.5167732919274329 for 0.1 and l2.
Average f1 is 0.3377022602059344 for 0.1 and l1.
Average f1 is 0.5657440750146764 for 0.5 and l2.
Average f1 is 0.5161634329313625 for 0.5 and l1.
Average f1 is 0.5696406546619747 for 1 and l2.
Average f1 is 0.5632382391138311 for 1 and l1.
Average f1 is 0.5680396765071253 for 5 and l2.
Average f1 is 0.5569972475445601 for 5 and l1.
Average f1 is 0.5613123890106448 for 10 and l2.
Average f1 is 0.5468321856176658 for 10 and l1.


In [27]:
def cross_validation(data, personality, n_splits, kernel, C):
    """Mean macro-F1 of a TF-IDF + SVM classifier under K-fold cross-validation.

    NOTE: this definition reuses the name ``cross_validation`` and therefore
    replaces the logistic-regression function of the same name defined earlier
    in the notebook. The argument order also differs (``kernel`` comes before
    ``C`` here), so cells written for the earlier version must not be re-run
    after this one.

    Requires ``SVC`` (``sklearn.svm``) to be imported in the notebook's import cell.

    Parameters
    ----------
    data
        Table indexed positionally via ``.iloc`` that has a ``"TEXT"`` column
        and a label column named by ``personality`` (the ``essays`` DataFrame
        in this notebook).
    personality : str
        Name of the label column to predict.
    n_splits : int
        Number of folds handed to ``KFold`` (otherwise default settings).
    kernel
        Forwarded to ``SVC(kernel=...)``.
    C
        Forwarded to ``SVC(C=...)``.

    Returns
    -------
    float
        Arithmetic mean of the per-fold macro-averaged F1 scores.
    """
    kf=KFold(n_splits=n_splits)

    svm_f1=[]

    vectorizer=TfidfVectorizer()

    for train_index, test_index in kf.split(data):

        data_train=data.iloc[train_index]
        data_test=data.iloc[test_index]

        # Fit the vectorizer on the training fold only; the held-out fold is
        # transformed with the vocabulary/weights learned from training data.
        X_train=vectorizer.fit_transform(data_train["TEXT"])
        X_test=vectorizer.transform(data_test["TEXT"])
        y_train=data_train[personality]
        y_test=data_test[personality]

        svm=SVC(kernel=kernel, C=C)
        svm.fit(X_train, y_train)

        y_pred=svm.predict(X_test)

        svm_f1.append(f1_score(y_pred=y_pred, y_true=y_test, average="macro"))

    return sum(svm_f1)/len(svm_f1)

In [30]:
# Hyper-parameter grid for the SVM search.
C = [0.01, 0.1, 0.5, 1, 5, 10]
kernel = ["rbf", "linear", "poly"]
params = [C, kernel]

# Every (C, kernel) pair, i.e. the Cartesian product of the two lists,
# with C varying slowest.
permutations = [
    (c_candidate, kernel_candidate)
    for c_candidate in C
    for kernel_candidate in kernel
]

In [31]:
# Grid search for the SVM: for each (C, kernel) pair, average the 5-fold
# macro-F1 over the five trait labels and print it.
#
# The loop variables deliberately do NOT reuse the names `C` / `kernel`:
# those names hold the grid lists, and rebinding them here would leave scalars
# behind and make later cells depend on execution order.
#
# NOTE: the SVM `cross_validation` takes (..., kernel, C), i.e. kernel first.
for c_value, kernel_name in permutations:
    f1=[]
    for personality in ["cAGR", "cOPN", "cNEU", "cEXT", "cCON"]:
        f1.append(cross_validation(essays, personality, 5, kernel_name, c_value))
    print(f"Average f1 is {sum(f1)/len(f1)} for {c_value} and {kernel_name}.")

Average f1 is 0.3365113076750258 for 0.01 and rbf.
Average f1 is 0.3365113076750258 for 0.01 and linear.
Average f1 is 0.3365113076750258 for 0.01 and poly.
Average f1 is 0.33727844246220356 for 0.1 and rbf.
Average f1 is 0.3821465562314155 for 0.1 and linear.
Average f1 is 0.3365626223628701 for 0.1 and poly.
Average f1 is 0.5268722589778478 for 0.5 and rbf.
Average f1 is 0.559103911766743 for 0.5 and linear.
Average f1 is 0.49484336993753797 for 0.5 and poly.
Average f1 is 0.5718948735946261 for 1 and rbf.
Average f1 is 0.5716918179863577 for 1 and linear.
Average f1 is 0.5713890395911774 for 1 and poly.
Average f1 is 0.5671157207903124 for 5 and rbf.
Average f1 is 0.5511649453631134 for 5 and linear.
Average f1 is 0.5706999776686417 for 5 and poly.
Average f1 is 0.5671157207903124 for 10 and rbf.
Average f1 is 0.5511114290776096 for 10 and linear.
Average f1 is 0.5706999776686417 for 10 and poly.
