In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

essays=pd.read_csv("../data/essays_expanded.csv")

essays.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jurin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,words,sentences,bigrams,trigrams,average_word_embedding
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"['well', 'right', 'now', 'i', 'just', 'woke', ...","['well, right now i just woke up from a mid-da...","[('well', 'right'), ('right', 'now'), ('now', ...","[('well', 'right', 'now'), ('right', 'now', 'i...",[ 1.46904569e-02 1.52049020e-01 -2.17639774e-...
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"['well', 'here', 'we', 'go', 'with', 'the', 's...","['well, here we go with the stream of consciou...","[('well', 'here'), ('here', 'we'), ('we', 'go'...","[('well', 'here', 'we'), ('here', 'we', 'go'),...",[ 1.93020366e-02 2.00337350e-01 -2.47012377e-...
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"['an', 'open', 'keyboard', 'and', 'buttons', '...","['an open keyboard and buttons to push.', 'the...","[('an', 'open'), ('open', 'keyboard'), ('keybo...","[('an', 'open', 'keyboard'), ('open', 'keyboar...",[ 1.21683925e-02 1.49960428e-01 -2.17856288e-...
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"['i', 'cant', 'believe', 'it', 'its', 'really'...","[""i can't believe it!"", ""it's really happening...","[('i', 'cant'), ('cant', 'believe'), ('believe...","[('i', 'cant', 'believe'), ('cant', 'believe',...",[-1.21900747e-02 1.94802403e-01 -2.04183444e-...
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"['well', 'here', 'i', 'go', 'with', 'the', 'go...","['well, here i go with the good old stream of ...","[('well', 'here'), ('here', 'i'), ('i', 'go'),...","[('well', 'here', 'i'), ('here', 'i', 'go'), (...",[-6.53621508e-03 1.72239631e-01 -2.12745324e-...


In [2]:
import string
import spacy

nlp=spacy.load("en_core_web_sm")

def lemmatizer(text):
    """
    Receives a string as an input and lemmatizes it.
    The lemmas are returned in a list.
    """
    # YOUR CODE HERE
    #nlp = spacy.load("en_core_web_sm")
    doc=nlp(text)
    lemmas=[]
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

def punc_lower_lemma(text):
    """
    Removes punctuation and converts text to lowercase.
    """

    text=" ".join(lemmatizer(text))

    no_punc=text.translate(str.maketrans("","",string.punctuation))

    return no_punc.lower()

In [3]:
essays["lemmas"]=essays.apply(lambda x: punc_lower_lemma(x["TEXT"]), axis=1)

In [4]:
import string

def text_cleaning(text):
    """
    Removes punctuation and stopwords, and converts text to lowercase.
    """

    no_punc=text.translate(str.maketrans("","",string.punctuation))
    
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(no_punc)

    filtered_tokens=[]

    for token in word_tokens:

        if token.lower() not in stop_words:

            filtered_tokens.append(token)
    
    no_punc=" ".join(filtered_tokens)
    
    return no_punc.lower()

In [5]:
essays["no stop words"]=essays.apply(lambda x: text_cleaning(x["TEXT"]), axis=1)

In [6]:
def punc_lower(text):
    """
    Removes punctuation.
    """

    no_punc=text.translate(str.maketrans("","",string.punctuation))
    
    return no_punc

In [7]:
essays["no punc"]=essays.apply(lambda x: punc_lower(x["TEXT"]), axis=1)

In [8]:
text=["TEXT", "lemmas", "no stop words", "no punc"]
C=[0.1,1,10]
kernel=['rbf', 'linear', 'poly']

In [9]:
from itertools import product

def grid_search(text, C, kernel, data_train, data_test, vectorizer):

    params=[text, C, kernel]

    permutations=list(product(*params))

    best_f1=0
    best_params={}

    for text, C, kernel in permutations:

        X_train=vectorizer.fit_transform(data_train[text])
        X_test=vectorizer.transform(data_test[text])

        f1=[]

        for personality in ["cEXT", "cOPN", "cAGR", "cNEU", "cCON"]:

            y_train=data_train[[personality]]
            y_test=data_test[[personality]]

            model=SVC(C=C, kernel=kernel)

            model.fit(X_train, y_train)

            y_pred=model.predict(X_test)

            f1.append(f1_score(y_pred=y_pred, y_true=y_test, average="macro"))

        print(f"Params: text={text}, C={C}, kernel={kernel},   F1:{f1}   avg_f1={sum(f1)/len(f1)}")
        f1=sum(f1)/len(f1)

        if f1>best_f1:
            print("CURRENT BEST!!")

            best_f1=f1

            best_params={"text":text, "C":C, "kernel":kernel}


    return best_params

In [10]:
essays_train, essays_test = train_test_split(
    essays[["TEXT", "lemmas", "no stop words", "no punc", "cEXT", "cOPN", "cAGR", "cCON", "cNEU"]], test_size=0.2
)

In [52]:
vectorizer=CountVectorizer()

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

  return f(**kwargs)


Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.4580117503455286, 0.34308510638297873, 0.46903609800031437, 0.4638491440629229]   avg_f1=0.4164534118427817
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.5563483562659636, 0.5744523750922963, 0.5251951039185081, 0.5161885009793755, 0.540438750712058]   avg_f1=0.5425246173936402
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.3805728174109979, 0.44815679902508376, 0.36516517847251456, 0.4185974127030444, 0.3432903803274174]   avg_f1=0.39115651758781167
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5919970334412081, 0.5809562056657664, 0.5144374655647383, 0.5563483562659637, 0.5664534120734908]   avg_f1=0.5620384946022335
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5503745711685221, 0.5662257090336135, 0.5128205128205129, 0.5264753676848709, 0.544517614854094]   avg_f1=0.5400827551123226
Params: text=TEXT, C=1, kernel=poly,   F1:[0.49304039893040397, 0.46482738697096637, 0.4

In [53]:
print(best)

{'text': 'TEXT', 'C': 1, 'kernel': 'rbf'}


In [54]:
vectorizer=TfidfVectorizer()

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.34828496042216356, 0.4806614982118427, 0.34308510638297873, 0.3337793352447701, 0.33757428822759555]   avg_f1=0.3686770376978702
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5992405994915028, 0.6172818757557746, 0.5257296466973886, 0.5744523750922964, 0.5711805555555556]   avg_f1=0.5775770105185035
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.6139253996447602, 0.6194331983805669, 0.5476015426485573, 0.5745570866141732, 0.550046369899311]   avg_f1=0.5811127194374738
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=poly,   F1:[0.6032623847866343, 0.6120

In [55]:
print(best)

{'text': 'TEXT', 'C': 1, 'kernel': 'poly'}


In [56]:
vectorizer=CountVectorizer(ngram_range=(1,3))

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.34573780860276665, 0.34308510638297873, 0.4717853200883002, 0.3978057700121902]   avg_f1=0.38133979310167987
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.5615627073656271, 0.6153593967707565, 0.5537936571657969, 0.5564602422975217, 0.5524096509029786]   avg_f1=0.5679171309005361
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.35886328776820775, 0.4119047619047619, 0.3514923663908922, 0.3957627587873175, 0.3399905252119851]   avg_f1=0.3716027400126328
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5637828175141607, 0.5970850905990892, 0.4888937720888499, 0.5465951234908861, 0.5728754940711462]   avg_f1=0.5538464595528264
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5615627073656271, 0.6153593967707565, 0.5537936571657969, 0.5564602422975217, 0.5524096509029786]   avg_f1=0.5679171309005361
Params: text=TEXT, C=1, kernel=poly,   F1:[0.45417045203427703, 0.476544540229885, 0.4306183897190614,

In [57]:
print(best)

{'text': 'TEXT', 'C': 10, 'kernel': 'rbf'}


In [58]:
vectorizer=TfidfVectorizer(ngram_range=(1,3))

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.4456535922127404, 0.6147552144581296, 0.34308510638297873, 0.5824175824175823, 0.4202077246052987]   avg_f1=0.481223844015346
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5711420446041011, 0.6293366828754658, 0.47494649946856937, 0.577243714177176, 0.5112018881626725]   avg_f1=0.5527741658575971
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=poly,   F1:[0.34828496042216356, 0.438691828870665, 

In [59]:
print(best)

{'text': 'no punc', 'C': 10, 'kernel': 'linear'}


In [60]:
vectorizer=CountVectorizer(ngram_range=(1,3), max_features=4000)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.4515668140482304, 0.34308510638297873, 0.4765947125879214, 0.46278990169446]   avg_f1=0.4164642990271508
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.4968641114982579, 0.5748360655737705, 0.5153508771929824, 0.525374458021285, 0.5242739532920537]   avg_f1=0.52733989311567
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.3805728174109979, 0.4412822616268064, 0.36965780097771145, 0.40924150640083123, 0.34553025045738434]   avg_f1=0.38925692737474626
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5919970334412081, 0.5931024026029907, 0.5181243359055201, 0.5520050125313284, 0.5848824578303375]   avg_f1=0.568022248462277
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.4968641114982579, 0.5666879805879363, 0.5153508771929824, 0.525374458021285, 0.5283226856044618]   avg_f1=0.5265200225809847
Params: text=TEXT, C=1, kernel=poly,   F1:[0.47449830372031515, 0.4603550295857988, 0.4293279

In [61]:
print(best)

{'text': 'TEXT', 'C': 1, 'kernel': 'rbf'}


In [62]:
vectorizer=TfidfVectorizer(ngram_range=(1,3), max_features=4000)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.34828496042216356, 0.4627042893035759, 0.34308510638297873, 0.353863217813494, 0.32971506105834464]   avg_f1=0.36753052699611133
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.34828496042216356, 0.33423180592991913, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3368242563238986
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5894736842105264, 0.603232362968791, 0.5187922865013774, 0.5969661562429533, 0.5663934527562463]   avg_f1=0.574971588535979
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5615360139333143, 0.6194331983805669, 0.5234146850229454, 0.5888649734538671, 0.5693005689824017]   avg_f1=0.5725098879546191
Params: text=TEXT, C=1, kernel=poly,   F1:[0.5983086680761099, 0.6092717131604724, 0.

In [63]:
print(best)

{'text': 'TEXT', 'C': 10, 'kernel': 'poly'}


In [64]:
vectorizer=TfidfVectorizer(max_features=4000)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.3378160418482999, 0.34308510638297873, 0.328804347826087, 0.32971506105834464]   avg_f1=0.3375411035075747
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.34828496042216356, 0.48673997564725185, 0.34308510638297873, 0.3548918316507146, 0.3341312974465148]   avg_f1=0.37342663430992473
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.34828496042216356, 0.34573780860276665, 0.34308510638297873, 0.3332042698502326, 0.32971506105834464]   avg_f1=0.34000544126329724
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.6110835725026764, 0.6092332788222323, 0.544201848785872, 0.5620351672166675, 0.563056669752134]   avg_f1=0.5779221074159164
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.6079561592568459, 0.6173947964774389, 0.5441846093102901, 0.5624015748031496, 0.5663934527562463]   avg_f1=0.5796661185207942
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=poly,   F1:[0.6241940512564693, 0.621

In [65]:
print(best)

{'text': 'TEXT', 'C': 1, 'kernel': 'poly'}


In [66]:
vectorizer=CountVectorizer(max_features=4000)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.46351554015617724, 0.34308510638297873, 0.4681865955210608, 0.46223627598361927]   avg_f1=0.41706169569319995
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.5546929988845304, 0.5777777777777777, 0.5070285453806256, 0.5235867446393763, 0.5404688825416859]   avg_f1=0.5407109898447992
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.3850035254715318, 0.45019874250198744, 0.36965780097771145, 0.4185974127030444, 0.34132291946644744]   avg_f1=0.3929560802241445
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5925533168375161, 0.5809699428360685, 0.5155478352646277, 0.5522781576995963, 0.5665458481623014]   avg_f1=0.561579020160022
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5412393162393162, 0.5698348804731783, 0.5070285453806256, 0.5190110553754761, 0.5383934426229507]   avg_f1=0.5351014480183094
Params: text=TEXT, C=1, kernel=poly,   F1:[0.5111237986487772, 0.46181549709181924, 0

In [67]:
print(best)

{'text': 'TEXT', 'C': 1, 'kernel': 'rbf'}


In [68]:
vectorizer=CountVectorizer(max_features=400)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.34828496042216356, 0.48098060344827587, 0.34308510638297873, 0.4756393069646082, 0.4645777909153611]   avg_f1=0.4225135536266775
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.5199460771543294, 0.5524096509029786, 0.5448058146117325, 0.5138831942007118, 0.5262459016393444]   avg_f1=0.5314581277018193
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.3948834688346883, 0.4592252994011976, 0.3785480814129152, 0.42314749941614116, 0.38273619788346297]   avg_f1=0.407708109389681
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.5679993305299273, 0.5708220637652651, 0.5293703099510604, 0.5300295291399483, 0.55406381302521]   avg_f1=0.5504570092822823
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5218010027633909, 0.5626383355467022, 0.5479323308270677, 0.5195321637426901, 0.524195961260231]   avg_f1=0.5352199588280164
Params: text=TEXT, C=1, kernel=poly,   F1:[0.5008233532934132, 0.4661518938527696, 0.434370

KeyboardInterrupt: 

In [None]:
print(best)

In [None]:
vectorizer=TfidfVectorizer(max_features=400)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

In [None]:
print(best)

In [11]:
vectorizer=TfidfVectorizer(ngram_range=(1,2))#, max_features=4000)

best=grid_search(text, C, kernel, essays_train, essays_test, vectorizer)

  return f(**kwargs)


Params: text=TEXT, C=0.1, kernel=rbf,   F1:[0.3232876712328767, 0.3378016085790885, 0.34828496042216356, 0.3138888888888889, 0.34828496042216356]   avg_f1=0.33430961790903624
CURRENT BEST!!
Params: text=TEXT, C=0.1, kernel=linear,   F1:[0.3232876712328767, 0.3378016085790885, 0.34828496042216356, 0.3138888888888889, 0.34828496042216356]   avg_f1=0.33430961790903624
Params: text=TEXT, C=0.1, kernel=poly,   F1:[0.3232876712328767, 0.3378016085790885, 0.34828496042216356, 0.3138888888888889, 0.34828496042216356]   avg_f1=0.33430961790903624
Params: text=TEXT, C=1, kernel=rbf,   F1:[0.49490855760200236, 0.6072295081967213, 0.4343702502786503, 0.5404839469748192, 0.5052218874126831]   avg_f1=0.5164428300929752
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=linear,   F1:[0.5506842456608811, 0.615378311041175, 0.5032604708370196, 0.5707196131970956, 0.5477417604774176]   avg_f1=0.5575568802427179
CURRENT BEST!!
Params: text=TEXT, C=1, kernel=poly,   F1:[0.3232876712328767, 0.5924751375260646, 

KeyboardInterrupt: 