## Data loading

In [50]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# The first CSV column has no header (a plain read shows it as "Unnamed: 0") and
# repeats the row number in the preview, so load it as the index rather than as
# a data column.
essays = pd.read_csv("../data/essays_expanded.csv", index_col=0)

essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,words,sentences,bigrams,trigrams,average_word_embedding
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"['well', 'right', 'now', 'i', 'just', 'woke', ...","['well, right now i just woke up from a mid-da...","[('well', 'right'), ('right', 'now'), ('now', ...","[('well', 'right', 'now'), ('right', 'now', 'i...",[ 1.46904569e-02 1.52049020e-01 -2.17639774e-...
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"['well', 'here', 'we', 'go', 'with', 'the', 's...","['well, here we go with the stream of consciou...","[('well', 'here'), ('here', 'we'), ('we', 'go'...","[('well', 'here', 'we'), ('here', 'we', 'go'),...",[ 1.93020366e-02 2.00337350e-01 -2.47012377e-...
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"['an', 'open', 'keyboard', 'and', 'buttons', '...","['an open keyboard and buttons to push.', 'the...","[('an', 'open'), ('open', 'keyboard'), ('keybo...","[('an', 'open', 'keyboard'), ('open', 'keyboar...",[ 1.21683925e-02 1.49960428e-01 -2.17856288e-...
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"['i', 'cant', 'believe', 'it', 'its', 'really'...","[""i can't believe it!"", ""it's really happening...","[('i', 'cant'), ('cant', 'believe'), ('believe...","[('i', 'cant', 'believe'), ('cant', 'believe',...",[-1.21900747e-02 1.94802403e-01 -2.04183444e-...
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"['well', 'here', 'i', 'go', 'with', 'the', 'go...","['well, here i go with the good old stream of ...","[('well', 'here'), ('here', 'i'), ('i', 'go'),...","[('well', 'here', 'i'), ('here', 'i', 'go'), (...",[-6.53621508e-03 1.72239631e-01 -2.12745324e-...


## Baseline classifier: random

In [51]:
def baseline_random_classifier(df_test, personality, random_state=None):
    """
    Score a coin-flip classifier against one binary label column.

    Parameters
    ----------
    df_test : pd.DataFrame
        Evaluation split; must contain the `personality` column.
    personality : str
        Name of the label column to score against (e.g. "cEXT").
    random_state : int or None, optional
        Seed for the random predictions. With the default `None` the
        predictions are drawn from NumPy's global RNG, exactly as before, so
        the score changes between runs unless that RNG was seeded elsewhere.

    Prints the classification report and returns the macro-averaged F1 score.
    """
    # 1-D label vector, which is the shape the metric functions document for y_true.
    y_test = df_test[personality]

    if random_state is None:
        # Legacy path: global RNG (still honours a prior np.random.seed call).
        y_pred = np.random.randint(2, size=len(y_test))
    else:
        # Private generator so a seeded baseline does not disturb the global RNG.
        y_pred = np.random.RandomState(random_state).randint(2, size=len(y_test))

    print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="macro")

## LR and SVM training and test functions

In [52]:
def train_lr(df_train, vectorizer, personality, lr_kwargs={"max_iter": 1000, "solver": "liblinear"}):
    """
    Fit a logistic regression on n-gram features of the training essays.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training split; must contain a `TEXT` column and the `personality`
        column.
    vectorizer : vectorizer object
        Must expose `set_params` and `fit_transform`. It is refit in place on
        `df_train.TEXT`, so afterwards its vocabulary belongs to this model.
    personality : str
        Name of the binary label column to predict (e.g. "cEXT").
    lr_kwargs : dict
        Keyword arguments forwarded to the logistic regression constructor.
        The default dict is only unpacked, never mutated.

    Returns the fitted model.
    """
    # An integer max_df is an absolute document count; using the number of
    # training rows means no n-gram is discarded for being too frequent.
    vectorizer.set_params(max_df=df_train.shape[0])
    X = vectorizer.fit_transform(df_train.TEXT)

    model = LR(**lr_kwargs)
    # Pass the labels as a 1-D Series: a single-column DataFrame makes
    # scikit-learn warn that a column vector was passed where 1-D was expected.
    model.fit(X, df_train[personality])
    return model


def train_svm(df_train, vectorizer, personality):
    """
    Fit a linear-kernel SVM on n-gram features of the training essays.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training split; must contain a `TEXT` column and the `personality`
        column.
    vectorizer : vectorizer object
        Must expose `set_params` and `fit_transform`. It is refit in place on
        `df_train.TEXT`, so afterwards its vocabulary belongs to this model.
    personality : str
        Name of the binary label column to predict (e.g. "cEXT").

    Returns the fitted model.
    """
    # An integer max_df is an absolute document count; using the number of
    # training rows means no n-gram is discarded for being too frequent.
    vectorizer.set_params(max_df=df_train.shape[0])
    X = vectorizer.fit_transform(df_train.TEXT)

    model = SVC(kernel="linear")
    # Pass the labels as a 1-D Series: a single-column DataFrame makes
    # scikit-learn warn that a column vector was passed where 1-D was expected.
    model.fit(X, df_train[personality])
    return model


def test_performance(model, df_test, vectorizer, personality):

    X_test, y_test = df_test.TEXT, df_test[[personality]]
    X_vec = vectorizer.transform(X_test)
    y_pred = model.predict(X_vec)
    print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="macro")


def influential_ngrams(model, vectorizer, is_lr=True):
    """
    Print the n-grams with the largest and smallest coefficients of a linear model.

    Parameters
    ----------
    model : fitted linear model
        Must expose `coef_` with one row per decision function; only the first
        row is inspected. Dense arrays and sparse matrices (anything with a
        `toarray` method) are both accepted.
    vectorizer : fitted vectorizer
        The vectorizer the model was trained with; supplies the feature names.
    is_lr : bool
        Selects the printed header only: "Logistic regression" when true,
        "SVM" otherwise.

    The 10 highest-weighted n-grams are listed for class 1 and the 10
    lowest-weighted for class 0, each in ascending coefficient order.
    """
    # Fetch the vocabulary once. Newer scikit-learn only offers
    # get_feature_names_out; older releases only get_feature_names.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Use the model that was passed in (never a notebook-level global) and
    # densify sparse coefficients so argsort works.
    coef = model.coef_
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    order = np.argsort(np.asarray(coef))[0]

    print("Logistic regression\n" if is_lr else "SVM\n")

    print("The most influential n-grams for classification 1 are:")
    for index in order[-10:]:
        print(feature_names[index])

    print("The most influential n-grams for classification 0 are:")
    for index in order[:10]:
        print(feature_names[index])


## Experiment 1: Raw text n-grams
The n-grams will be extracted out of the raw essay text and given to the models as features.


In [53]:
# One vectorizer instance is shared by every model below: word-level 1- to
# 3-grams with the original casing preserved. Each training call refits it.
count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 3), lowercase=False)

# Keep only the raw text and the five label columns, and hold out 20% of the
# rows for evaluation; the fixed random_state keeps the split identical across runs.
essays_train, essays_test = train_test_split(
    essays[["TEXT", "cEXT", "cOPN", "cAGR", "cCON", "cNEU"]],
    random_state=42,
    test_size=0.2,
)

#### Baseline

In [54]:
# Random-guess baseline for the cEXT label on the held-out essays.
# The predictions are random, so this score changes between runs unless
# NumPy's global RNG has been seeded.
f1=baseline_random_classifier(essays_test, "cEXT")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.50      0.52      0.51       227
           1       0.57      0.55      0.56       267

    accuracy                           0.54       494
   macro avg       0.54      0.54      0.54       494
weighted avg       0.54      0.54      0.54       494

f1 = 0.535


In [55]:
# Random-guess baseline for the cOPN label on the held-out essays.
# The predictions are random, so this score changes between runs unless
# NumPy's global RNG has been seeded.
f1=baseline_random_classifier(essays_test, "cOPN")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.53      0.62      0.57       237
           1       0.58      0.49      0.53       257

    accuracy                           0.55       494
   macro avg       0.56      0.56      0.55       494
weighted avg       0.56      0.55      0.55       494

f1 = 0.552


In [56]:
# Random-guess baseline for the cAGR label on the held-out essays.
# The predictions are random, so this score changes between runs unless
# NumPy's global RNG has been seeded.
f1=baseline_random_classifier(essays_test, "cAGR")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.46      0.53      0.50       220
           1       0.57      0.51      0.54       274

    accuracy                           0.52       494
   macro avg       0.52      0.52      0.52       494
weighted avg       0.53      0.52      0.52       494

f1 = 0.517


In [57]:
# Random-guess baseline for the cCON label on the held-out essays.
# The predictions are random, so this score changes between runs unless
# NumPy's global RNG has been seeded.
f1=baseline_random_classifier(essays_test, "cCON")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.43      0.46      0.44       227
           1       0.51      0.49      0.50       267

    accuracy                           0.47       494
   macro avg       0.47      0.47      0.47       494
weighted avg       0.48      0.47      0.47       494

f1 = 0.472


In [58]:
# Random-guess baseline for the cNEU label on the held-out essays.
# The predictions are random, so this score changes between runs unless
# NumPy's global RNG has been seeded.
f1=baseline_random_classifier(essays_test, "cNEU")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.47      0.42      0.44       260
           1       0.42      0.47      0.45       234

    accuracy                           0.45       494
   macro avg       0.45      0.45      0.45       494
weighted avg       0.45      0.45      0.45       494

f1 = 0.445


#### Extraversion

In [59]:
# Fit logistic regression on the cEXT label, then print the classification
# report and macro-F1 on essays_test.
# train_lr refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
lr = train_lr(essays_train, count_vectorizer, "cEXT")
f1 = test_performance(lr, essays_test, count_vectorizer, "cEXT")
print(f"f1 = {f1:.3f}")

  return f(**kwargs)


              precision    recall  f1-score   support

           0       0.52      0.53      0.53       227
           1       0.60      0.58      0.59       267

    accuracy                           0.56       494
   macro avg       0.56      0.56      0.56       494
weighted avg       0.56      0.56      0.56       494

f1 = 0.559


In [60]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cEXT
# logistic regression. Run directly after the training cell above: the feature
# names come from count_vectorizer as it was last fitted.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
some of
many
if
college
four
love
myself
first
boyfriend
fun
The most influential n-grams for classification 0 are:
should
real
there
could
don
few
So
eyes
even though
three


In [61]:
# Fit a linear SVM on the cEXT label, then print the classification report and
# macro-F1 on essays_test.
# train_svm refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
svm = train_svm(essays_train, count_vectorizer, "cEXT")
f1 = test_performance(svm, essays_test, count_vectorizer, "cEXT")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.51      0.54      0.53       227
           1       0.59      0.57      0.58       267

    accuracy                           0.55       494
   macro avg       0.55      0.55      0.55       494
weighted avg       0.56      0.55      0.56       494

f1 = 0.553


In [62]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cEXT
# linear SVM. Run directly after the training cell above: the feature names
# come from count_vectorizer as it was last fitted.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
if
some of
many
college
four
first
myself
love
boyfriend
fun
The most influential n-grams for classification 0 are:
real
should
So
could
few
there
three
don
eyes
even though


#### Openness

In [63]:
# Fit logistic regression on the cOPN label, then print the classification
# report and macro-F1 on essays_test.
# train_lr refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
lr = train_lr(essays_train, count_vectorizer, "cOPN")
f1 = test_performance(lr, essays_test, count_vectorizer, "cOPN")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.58      0.62      0.60       237
           1       0.63      0.59      0.61       257

    accuracy                           0.61       494
   macro avg       0.61      0.61      0.61       494
weighted avg       0.61      0.61      0.61       494

f1 = 0.605


In [64]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cOPN
# logistic regression. Run directly after the training cell above: the feature
# names come from count_vectorizer as it was last fitted.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
myself
now that
my mom
crazy
re
music
is so
to my
great
maybe
The most influential n-grams for classification 0 are:
college
problem
to do
classes
time to
thinks
Well
assignment
home
tomorrow


In [65]:
# Fit a linear SVM on the cOPN label, then print the classification report and
# macro-F1 on essays_test.
# train_svm refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
svm = train_svm(essays_train, count_vectorizer, "cOPN")
f1 = test_performance(svm, essays_test, count_vectorizer, "cOPN")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.57      0.59      0.58       237
           1       0.61      0.59      0.60       257

    accuracy                           0.59       494
   macro avg       0.59      0.59      0.59       494
weighted avg       0.59      0.59      0.59       494

f1 = 0.593


In [66]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cOPN
# linear SVM. Run directly after the training cell above: the feature names
# come from count_vectorizer as it was last fitted.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
music
re
great
love him
crazy
now that
my mom
is so
to my
maybe
The most influential n-grams for classification 0 are:
college
problem
to do
thinks
time to
classes
Well
tomorrow
confused
assignment


#### Agreeableness

In [67]:
# Fit logistic regression on the cAGR label, then print the classification
# report and macro-F1 on essays_test.
# train_lr refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
lr = train_lr(essays_train, count_vectorizer, "cAGR")
f1 = test_performance(lr, essays_test, count_vectorizer, "cAGR")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.45      0.44      0.44       220
           1       0.56      0.57      0.56       274

    accuracy                           0.51       494
   macro avg       0.50      0.50      0.50       494
weighted avg       0.51      0.51      0.51       494

f1 = 0.503


In [68]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cAGR
# logistic regression. Run directly after the training cell above: the feature
# names come from count_vectorizer as it was last fitted.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
Well
worried
ok
least
have to
my mind
right
and have
family
and the
The most influential n-grams for classification 0 are:
and it
And
think that
stupid
girlfriend
read
more
same
is no
assignment


In [69]:
# Fit a linear SVM on the cAGR label, then print the classification report and
# macro-F1 on essays_test.
# train_svm refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
svm = train_svm(essays_train, count_vectorizer, "cAGR")
f1 = test_performance(svm, essays_test, count_vectorizer, "cAGR")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.45      0.44      0.44       220
           1       0.56      0.57      0.56       274

    accuracy                           0.51       494
   macro avg       0.50      0.50      0.50       494
weighted avg       0.51      0.51      0.51       494

f1 = 0.503


In [70]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cAGR
# linear SVM. Run directly after the training cell above: the feature names
# come from count_vectorizer as it was last fitted.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
sometimes
worried
right
least
have to
my mind
family
ok
and have
and the
The most influential n-grams for classification 0 are:
and it
And
stupid
think that
sitting
is no
girlfriend
store
write about
more


#### Conscientiousness

In [71]:
# Fit logistic regression on the cCON label, then print the classification
# report and macro-F1 on essays_test.
# train_lr refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
lr = train_lr(essays_train, count_vectorizer, "cCON")
f1 = test_performance(lr, essays_test, count_vectorizer, "cCON")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.52      0.55      0.53       227
           1       0.60      0.58      0.59       267

    accuracy                           0.56       494
   macro avg       0.56      0.56      0.56       494
weighted avg       0.56      0.56      0.56       494

f1 = 0.561


In [72]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cCON
# logistic regression. Run directly after the training cell above: the feature
# names come from count_vectorizer as it was last fitted.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
tonight
everyone
do not
For
party
since
we are
of my
couldn
sometimes
The most influential n-grams for classification 0 are:
hate
my friend
is it
you know
because my
not to
want
they re
am in
Austin


In [73]:
# Fit a linear SVM on the cCON label, then print the classification report and
# macro-F1 on essays_test.
# train_svm refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
svm = train_svm(essays_train, count_vectorizer, "cCON")
f1 = test_performance(svm, essays_test, count_vectorizer, "cCON")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.52      0.56      0.54       227
           1       0.60      0.57      0.58       267

    accuracy                           0.56       494
   macro avg       0.56      0.56      0.56       494
weighted avg       0.56      0.56      0.56       494

f1 = 0.562


In [74]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cCON
# linear SVM. Run directly after the training cell above: the feature names
# come from count_vectorizer as it was last fitted.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
week
If
do not
since
party
of my
For
we are
couldn
sometimes
The most influential n-grams for classification 0 are:
hate
my friend
is it
because my
am in
not to
Austin
you know
Is
they re


#### Neuroticism

In [75]:
# Fit logistic regression on the cNEU label, then print the classification
# report and macro-F1 on essays_test.
# train_lr refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
lr = train_lr(essays_train, count_vectorizer, "cNEU")
f1 = test_performance(lr, essays_test, count_vectorizer, "cNEU")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.63      0.57      0.60       260
           1       0.57      0.63      0.60       234

    accuracy                           0.60       494
   macro avg       0.60      0.60      0.60       494
weighted avg       0.60      0.60      0.60       494

f1 = 0.599


In [76]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cNEU
# logistic regression. Run directly after the training cell above: the feature
# names come from count_vectorizer as it was last fitted.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
to visit
everything
being
someone
have to
life
money
sex
week
stressed
The most influential n-grams for classification 0 are:
me and
many
in the
for my
it has
already
its
semester
would
usually


In [77]:
# Fit a linear SVM on the cNEU label, then print the classification report and
# macro-F1 on essays_test.
# train_svm refits count_vectorizer in place, so from here on the vectorizer's
# vocabulary is the one learned in this cell.
svm = train_svm(essays_train, count_vectorizer, "cNEU")
f1 = test_performance(svm, essays_test, count_vectorizer, "cNEU")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.62      0.56      0.59       260
           1       0.56      0.62      0.59       234

    accuracy                           0.59       494
   macro avg       0.59      0.59      0.59       494
weighted avg       0.59      0.59      0.59       494

f1 = 0.589


In [78]:
# List the n-grams with the 10 highest and 10 lowest coefficients of the cNEU
# linear SVM. Run directly after the training cell above: the feature names
# come from count_vectorizer as it was last fitted.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
worry
quite
someone
have to
to visit
life
money
week
sex
stressed
The most influential n-grams for classification 0 are:
me and
many
for my
it has
in the
already
semester
thing
its
usually
