## Data loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# Load the expanded essays dataset; the path is relative to the notebook's working directory.
essays=pd.read_csv("../data/essays_expanded.csv")

# Preview the first rows to check which columns were read.
essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,words,sentences,bigrams,trigrams,average_word_embedding
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"['well', 'right', 'now', 'i', 'just', 'woke', ...","['well, right now i just woke up from a mid-da...","[('well', 'right'), ('right', 'now'), ('now', ...","[('well', 'right', 'now'), ('right', 'now', 'i...",[ 1.46904569e-02 1.52049020e-01 -2.17639774e-...
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"['well', 'here', 'we', 'go', 'with', 'the', 's...","['well, here we go with the stream of consciou...","[('well', 'here'), ('here', 'we'), ('we', 'go'...","[('well', 'here', 'we'), ('here', 'we', 'go'),...",[ 1.93020366e-02 2.00337350e-01 -2.47012377e-...
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"['an', 'open', 'keyboard', 'and', 'buttons', '...","['an open keyboard and buttons to push.', 'the...","[('an', 'open'), ('open', 'keyboard'), ('keybo...","[('an', 'open', 'keyboard'), ('open', 'keyboar...",[ 1.21683925e-02 1.49960428e-01 -2.17856288e-...
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"['i', 'cant', 'believe', 'it', 'its', 'really'...","[""i can't believe it!"", ""it's really happening...","[('i', 'cant'), ('cant', 'believe'), ('believe...","[('i', 'cant', 'believe'), ('cant', 'believe',...",[-1.21900747e-02 1.94802403e-01 -2.04183444e-...
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"['well', 'here', 'i', 'go', 'with', 'the', 'go...","['well, here i go with the good old stream of ...","[('well', 'here'), ('here', 'i'), ('i', 'go'),...","[('well', 'here', 'i'), ('here', 'i', 'go'), (...",[-6.53621508e-03 1.72239631e-01 -2.12745324e-...


## Baseline classifier: random

In [2]:
def baseline_random_classifier(df_test, personality, random_state=None):
    """
    Scores a coin-flip baseline for one personality trait.

    Draws a uniformly random 0/1 prediction for every row of `df_test`,
    prints the classification report against the `personality` column and
    returns the macro-averaged F1 score.

    Parameters
    ----------
    df_test : pd.DataFrame
        Test split; must contain the column named by `personality`.
    personality : str
        Name of the label column (e.g. "cEXT").
    random_state : int, optional
        Seed for the random predictions. When None (the default) the global
        NumPy random state is used, so the score changes between runs unless
        that state was seeded elsewhere.

    Returns
    -------
    float
        Macro-averaged F1 of the random predictions.
    """
    # Use a 1-D label Series rather than a one-column DataFrame.
    y_test = df_test[personality]

    # A dedicated RandomState keeps a seeded call from touching global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    y_pred = rng.randint(2, size=len(y_test))

    print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="macro")

## LR and SVM training and test functions

In [3]:
def train_lr(df_train, vectorizer, personality, lr_kwargs={"solver": "liblinear"}):
    """
    Fits a logistic regression that predicts one personality trait from essay text.

    The `TEXT` column of `df_train` is vectorized with `vectorizer` and the
    resulting features are used to train the model on the label column named
    by `personality`.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training split; must contain `TEXT` and the `personality` column.
    vectorizer : sklearn text vectorizer
        Fitted IN PLACE on the training texts. The same, now fitted, instance
        must be used afterwards to transform the test texts.
    personality : str
        Name of the label column (e.g. "cEXT").
    lr_kwargs : dict
        Keyword arguments forwarded to `LogisticRegression`. The default dict
        is only read, never mutated.

    Returns
    -------
    The fitted logistic regression model.
    """
    # An integer max_df is an absolute document count, so setting it to the
    # number of training documents means no term is dropped for being too frequent.
    vectorizer.set_params(max_df=df_train.shape[0])
    X=vectorizer.fit_transform(df_train.TEXT)
    model=LR(**lr_kwargs)
    # Pass the labels as a 1-D Series: a one-column DataFrame makes
    # scikit-learn emit a column-vector conversion warning on every fit.
    model.fit(X, df_train[personality])
    return model


def train_svm(df_train, vectorizer, personality):
    """
    Fits a linear-kernel SVM that predicts one personality trait from essay text.

    The `TEXT` column of `df_train` is vectorized with `vectorizer` and the
    resulting features are used to train an `SVC(kernel="linear")` on the
    label column named by `personality`.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training split; must contain `TEXT` and the `personality` column.
    vectorizer : sklearn text vectorizer
        Fitted IN PLACE on the training texts. The same, now fitted, instance
        must be used afterwards to transform the test texts.
    personality : str
        Name of the label column (e.g. "cEXT").

    Returns
    -------
    The fitted SVM model.
    """
    # An integer max_df is an absolute document count, so setting it to the
    # number of training documents means no term is dropped for being too frequent.
    vectorizer.set_params(max_df=df_train.shape[0])
    X=vectorizer.fit_transform(df_train.TEXT)
    model=SVC(kernel="linear")
    # Pass the labels as a 1-D Series: a one-column DataFrame makes
    # scikit-learn emit a column-vector conversion warning on every fit.
    model.fit(X, df_train[personality])
    return model


def test_performance(model, df_test, vectorizer, personality):
    """
    Evaluates a fitted `model` on the test split for one personality trait.

    The `TEXT` column of `df_test` is transformed with `vectorizer` (which is
    only transformed with here, not fitted), the model's predictions are
    compared with the `personality` column, the classification report is
    printed and the macro-averaged F1 score is returned.
    """
    texts = df_test.TEXT
    gold = df_test[[personality]]

    predicted = model.predict(vectorizer.transform(texts))

    report = classification_report(y_pred=predicted, y_true=gold)
    print(report)

    macro_f1 = f1_score(y_pred=predicted, y_true=gold, average="macro")
    return macro_f1


def influential_ngrams(model, vectorizer, is_lr=True):
    """
    Receives a fitted linear model (LR or SVM) and the vectorizer that produced
    its features. Prints the most influential n-grams.

    The ten features with the largest coefficients are listed for
    classification 1 and the ten with the smallest coefficients for
    classification 0, both in ascending coefficient order.

    Parameters
    ----------
    model : fitted estimator exposing `coef_`
        `coef_` may be a dense array or a sparse matrix; its first row is used.
    vectorizer : fitted text vectorizer
        Must be the vectorizer whose vocabulary matches `model.coef_`.
    is_lr : bool
        Only selects the printed header ("Logistic regression" or "SVM").
    """
    # Resolve the feature names once, instead of once per printed row.
    # get_feature_names_out() is the current scikit-learn API; fall back to
    # the legacy get_feature_names() on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    # Use the model that was passed in, not a notebook-level global.
    # Sparse coefficient matrices are densified before sorting.
    coef = model.coef_
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    order = np.argsort(np.atleast_2d(np.asarray(coef)))[0]

    print("Logistic regression\n" if is_lr else "SVM\n")

    print("The most influential n-grams for classification 1 are:")
    for index in order[-10:]:
        print(feature_names[index])

    # The original left two blank source lines before this header only in the
    # LR branch; the printed output was never affected by that.
    print("The most influential n-grams for classification 0 are:")
    for index in order[:10]:
        print(feature_names[index])


## Experiment 1: Raw text n-grams


The n-grams (word unigrams with TF-IDF weighting, i.e. the `TfidfVectorizer` defaults) are extracted from the raw essay text and given to the models as features.


In [4]:
# Hold out 20% of the essays for testing; the fixed random_state keeps the split reproducible.
# Only the raw text and the five trait label columns are kept.
essays_train, essays_test = train_test_split(
    essays[["TEXT", "cEXT", "cOPN", "cAGR", "cCON", "cNEU"]], test_size=0.2, random_state=42
)

# NOTE: despite its name, `count_vectorizer` is a TF-IDF vectorizer with default settings.
# This single instance is passed to every train_* call, each of which refits it in place.
count_vectorizer = TfidfVectorizer()

#### Extraversion

In [5]:
# Extraversion, logistic regression: train on the cEXT label, then score the held-out essays.
# train_lr refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
lr = train_lr(essays_train, count_vectorizer, "cEXT")
f1 = test_performance(lr, essays_test, count_vectorizer, "cEXT")
print(f"f1 = {f1:.3f}")

  return f(**kwargs)


              precision    recall  f1-score   support

           0       0.50      0.48      0.49       227
           1       0.57      0.60      0.59       267

    accuracy                           0.54       494
   macro avg       0.54      0.54      0.54       494
weighted avg       0.54      0.54      0.54       494

f1 = 0.539


In [6]:
# Show the ten highest- and ten lowest-weighted features of the cEXT logistic regression.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
is
its
sorority
and
love
boyfriend
fun
all
am
so
The most influential n-grams for classification 0 are:
don
there
in
should
want
something
eyes
perhaps
very
mother


In [7]:
# Extraversion, linear SVM: train on the cEXT label, then score the held-out essays.
# train_svm refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
svm = train_svm(essays_train, count_vectorizer, "cEXT")
f1 = test_performance(svm, essays_test, count_vectorizer, "cEXT")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.51      0.47      0.49       227
           1       0.58      0.62      0.60       267

    accuracy                           0.55       494
   macro avg       0.55      0.54      0.54       494
weighted avg       0.55      0.55      0.55       494

f1 = 0.544


In [8]:
# Show the ten highest- and ten lowest-weighted features of the cEXT SVM.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
ready
mean
if
love
its
all
am
boyfriend
fun
so
The most influential n-grams for classification 0 are:
don
should
in
something
there
want
eyes
very
perhaps
real


#### Openness

In [9]:
# Openness, logistic regression: train on the cOPN label, then score the held-out essays.
# train_lr refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
lr = train_lr(essays_train, count_vectorizer, "cOPN")
f1 = test_performance(lr, essays_test, count_vectorizer, "cOPN")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.58      0.59      0.58       237
           1       0.62      0.60      0.61       257

    accuracy                           0.60       494
   macro avg       0.60      0.60      0.60       494
weighted avg       0.60      0.60      0.60       494

f1 = 0.597


In [10]:
# Show the ten highest- and ten lowest-weighted features of the cOPN logistic regression.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
like
cat
ll
too
maybe
re
love
music
of
you
The most influential n-grams for classification 0 are:
college
is
to
my
because
school
home
class
have
classes


In [11]:
# Openness, linear SVM: train on the cOPN label, then score the held-out essays.
# train_svm refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
svm = train_svm(essays_train, count_vectorizer, "cOPN")
f1 = test_performance(svm, essays_test, count_vectorizer, "cOPN")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.56      0.61      0.59       237
           1       0.61      0.56      0.58       257

    accuracy                           0.59       494
   macro avg       0.59      0.59      0.59       494
weighted avg       0.59      0.59      0.58       494

f1 = 0.585


In [12]:
# Show the ten highest- and ten lowest-weighted features of the cOPN SVM.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
crazy
love
like
maybe
you
ll
cat
of
re
music
The most influential n-grams for classification 0 are:
college
because
is
boyfriend
assignment
class
home
tomorrow
game
confused


#### Agreeableness

In [13]:
# Agreeableness, logistic regression: train on the cAGR label, then score the held-out essays.
# train_lr refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
lr = train_lr(essays_train, count_vectorizer, "cAGR")
f1 = test_performance(lr, essays_test, count_vectorizer, "cAGR")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.49      0.38      0.42       220
           1       0.58      0.68      0.62       274

    accuracy                           0.54       494
   macro avg       0.53      0.53      0.52       494
weighted avg       0.54      0.54      0.53       494

f1 = 0.524


In [14]:
# Show the ten highest- and ten lowest-weighted features of the cAGR logistic regression.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
least
would
with
right
on
to
really
so
family
have
The most influential n-grams for classification 0 are:
stupid
girlfriend
don
is
damn
read
more
nothing
same
no


In [15]:
# Agreeableness, linear SVM: train on the cAGR label, then score the held-out essays.
# train_svm refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
svm = train_svm(essays_train, count_vectorizer, "cAGR")
f1 = test_performance(svm, essays_test, count_vectorizer, "cAGR")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.47      0.38      0.42       220
           1       0.57      0.65      0.61       274

    accuracy                           0.53       494
   macro avg       0.52      0.52      0.51       494
weighted avg       0.52      0.53      0.52       494

f1 = 0.514


In [16]:
# Show the ten highest- and ten lowest-weighted features of the cAGR SVM.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
many
worried
would
least
so
right
family
on
with
have
The most influential n-grams for classification 0 are:
stupid
girlfriend
is
read
damn
store
same
don
nothing
wont


#### Conscientiousness

In [17]:
# Conscientiousness, logistic regression: train on the cCON label, then score the held-out essays.
# train_lr refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
lr = train_lr(essays_train, count_vectorizer, "cCON")
f1 = test_performance(lr, essays_test, count_vectorizer, "cCON")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.50      0.52      0.51       227
           1       0.58      0.57      0.57       267

    accuracy                           0.54       494
   macro avg       0.54      0.54      0.54       494
weighted avg       0.54      0.54      0.54       494

f1 = 0.540


In [18]:
# Show the ten highest- and ten lowest-weighted features of the cCON logistic regression.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
today
it
tonight
hope
party
and
the
my
he
to
The most influential n-grams for classification 0 are:
want
hate
don
this
think
re
wake
god
point
chance


In [19]:
# Conscientiousness, linear SVM: train on the cCON label, then score the held-out essays.
# train_svm refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
svm = train_svm(essays_train, count_vectorizer, "cCON")
f1 = test_performance(svm, essays_test, count_vectorizer, "cCON")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.52      0.55      0.53       227
           1       0.59      0.57      0.58       267

    accuracy                           0.56       494
   macro avg       0.56      0.56      0.56       494
weighted avg       0.56      0.56      0.56       494

f1 = 0.555


In [20]:
# Show the ten highest- and ten lowest-weighted features of the cCON SVM.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
student
decision
today
couldn
tonight
my
hope
to
he
party
The most influential n-grams for classification 0 are:
want
hate
this
wake
point
re
think
chance
don
music


#### Neuroticism

In [21]:
# Neuroticism, logistic regression: train on the cNEU label, then score the held-out essays.
# train_lr refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
lr = train_lr(essays_train, count_vectorizer, "cNEU")
f1 = test_performance(lr, essays_test, count_vectorizer, "cNEU")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.64      0.58      0.61       260
           1       0.58      0.63      0.60       234

    accuracy                           0.61       494
   macro avg       0.61      0.61      0.61       494
weighted avg       0.61      0.61      0.61       494

f1 = 0.605


In [22]:
# Show the ten highest- and ten lowest-weighted features of the cNEU logistic regression.
influential_ngrams(lr, count_vectorizer)

Logistic regression

The most influential n-grams for classification 1 are:
don
feel
everything
want
scared
money
me
sex
life
stressed
The most influential n-grams for classification 0 are:
its
would
her
many
semester
already
beat
mind
as
texas


In [23]:
# Neuroticism, linear SVM: train on the cNEU label, then score the held-out essays.
# train_svm refits count_vectorizer, so the evaluation uses the vocabulary fitted here.
svm = train_svm(essays_train, count_vectorizer, "cNEU")
f1 = test_performance(svm, essays_test, count_vectorizer, "cNEU")
print(f"f1 = {f1:.3f}")

              precision    recall  f1-score   support

           0       0.62      0.57      0.60       260
           1       0.56      0.62      0.59       234

    accuracy                           0.59       494
   macro avg       0.59      0.59      0.59       494
weighted avg       0.60      0.59      0.59       494

f1 = 0.593


In [24]:
# Show the ten highest- and ten lowest-weighted features of the cNEU SVM.
influential_ngrams(svm, count_vectorizer, is_lr=False)

SVM

The most influential n-grams for classification 1 are:
worry
this
scared
me
everything
life
boyfriend
sex
money
stressed
The most influential n-grams for classification 0 are:
its
many
her
would
already
pledge
semester
beat
glad
mind
