# Data loading

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

essays=pd.read_csv("../data/essays_expanded.csv")

essays.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jurin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,words,sentences,bigrams,trigrams,average_word_embedding
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"['well', 'right', 'now', 'i', 'just', 'woke', ...","['well, right now i just woke up from a mid-da...","[('well', 'right'), ('right', 'now'), ('now', ...","[('well', 'right', 'now'), ('right', 'now', 'i...",[ 1.46904569e-02 1.52049020e-01 -2.17639774e-...
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"['well', 'here', 'we', 'go', 'with', 'the', 's...","['well, here we go with the stream of consciou...","[('well', 'here'), ('here', 'we'), ('we', 'go'...","[('well', 'here', 'we'), ('here', 'we', 'go'),...",[ 1.93020366e-02 2.00337350e-01 -2.47012377e-...
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"['an', 'open', 'keyboard', 'and', 'buttons', '...","['an open keyboard and buttons to push.', 'the...","[('an', 'open'), ('open', 'keyboard'), ('keybo...","[('an', 'open', 'keyboard'), ('open', 'keyboar...",[ 1.21683925e-02 1.49960428e-01 -2.17856288e-...
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"['i', 'cant', 'believe', 'it', 'its', 'really'...","[""i can't believe it!"", ""it's really happening...","[('i', 'cant'), ('cant', 'believe'), ('believe...","[('i', 'cant', 'believe'), ('cant', 'believe',...",[-1.21900747e-02 1.94802403e-01 -2.04183444e-...
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"['well', 'here', 'i', 'go', 'with', 'the', 'go...","['well, here i go with the good old stream of ...","[('well', 'here'), ('here', 'i'), ('i', 'go'),...","[('well', 'here', 'i'), ('here', 'i', 'go'), (...",[-6.53621508e-03 1.72239631e-01 -2.12745324e-...


# Data cleaning

In [8]:
import string

def text_cleaning(text):
    """
    Removes punctuation and stopwords, and converts text to lowercase.
    """

    no_punc=text.translate(str.maketrans("","",string.punctuation))
    
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(no_punc)

    filtered_tokens=[]

    for token in word_tokens:

        if token.lower() not in stop_words:

            filtered_tokens.append(token)
    
    no_punc=" ".join(filtered_tokens)

    return no_punc.lower()

In [9]:
essays["clean text"]=essays.apply(lambda x: text_cleaning(x["TEXT"]), axis=1)

# Training and testing functions

In [10]:
def train_lr(df_train, vectorizer, personality, lr_kwargs={"penalty":'none', "max_iter": 1000, "solver": "lbfgs"}):
    """
    Receives the train set `df_train` as pd.DataFrame and extracts lemma n-grams
    with their correspoding labels (news type).
    The text is vectorized and used to train a logistic regression with
    training arguments passed as `lr_kwargs`.
    Returns the fitted model.
    """
    vectorizer.set_params(max_df=df_train.shape[0])
    X=vectorizer.fit_transform(df_train["clean text"])
    model=LR(**lr_kwargs)
    model.fit(X, df_train[[personality]])
    return model


def train_svm(df_train, vectorizer, personality):
    """
    Receives the train set `df_train` as pd.DataFrame and extracts lemma n-grams
    with their correspoding labels (news type).
    The text is vectorized and used to train a logistic regression with
    training arguments passed as `lr_kwargs`.
    Returns the fitted model.
    """
    vectorizer.set_params(max_df=df_train.shape[0])
    X=vectorizer.fit_transform(df_train["clean text"])
    model=SVC(kernel="linear")
    model.fit(X, df_train[[personality]])
    return model


def test_performance(model, df_test, vectorizer, personality):

    X_test, y_test = df_test["clean text"], df_test[[personality]]
    X_vec = vectorizer.transform(X_test)
    y_pred = model.predict(X_vec)
    #print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="macro")


def influential_ngrams(model, vectorizer, is_lr=True):
    """
    Receives a model (LR or SVM) and a vectorizer.
    Prints the most influential n-grams.
    """

    if is_lr:
        print("Logistic regression\n")
        print("The most influential n-grams for classification 1 are:")
        ind = np.argsort(model.coef_)[0][-10:]
        for index in ind:
            print(vectorizer.get_feature_names()[index])


        print("The most influential n-grams for classification 0 are:")
        ind = np.argsort(model.coef_)[0][:10]
        for index in ind:
            print(vectorizer.get_feature_names()[index])

    else:
        print("SVM\n")
        print("The most influential n-grams for classification 1 are:")
        ind = np.argsort(svm.coef_.toarray())[0][-10:]
        for index in ind:
            print(count_vectorizer.get_feature_names()[index])

        print("The most influential n-grams for classification 0 are:")
        ind = np.argsort(svm.coef_.toarray())[0][:10]
        for index in ind:
            print(count_vectorizer.get_feature_names()[index])


# Cross validation function

In [11]:
def cross_validation(data, personality, n_splits):

    kf=KFold(n_splits=n_splits)
    
    lr_f1=[]
    svm_f1=[]

    cv=CountVectorizer(lowercase=False, ngram_range=(1,3), analyzer="word")

    for i, (train_index, test_index) in enumerate(kf.split(data)):

        data_train=data.iloc[train_index]
        data_test=data.iloc[test_index]

        lr=train_lr(data_train, cv, personality)
        svm=train_svm(data_train, cv, personality)

        lr_f1.append(test_performance(lr, data_test, cv, personality))
        svm_f1.append(test_performance(svm, data_test, cv, personality))

        print(f"Fold {i+1} done.")

    print(f"The {n_splits}-fold cross-validation f1 score for LR is {sum(lr_f1)/len(lr_f1)}.")
    print(f"The {n_splits}-fold cross-validation f1 score for SVM is {sum(svm_f1)/len(svm_f1)}.")

# Results

In [12]:
cross_validation(essays, "cEXT", 5)

  return f(**kwargs)
  return f(**kwargs)


Fold 1 done.


  return f(**kwargs)


Fold 2 done.


  return f(**kwargs)


Fold 3 done.


  return f(**kwargs)


Fold 4 done.


  return f(**kwargs)


Fold 5 done.
The 5-fold cross-validation f1 score for LR is 0.5626768406354474.
The 5-fold cross-validation f1 score for SVM is 0.5543829072594694.


In [13]:
cross_validation(essays, "cAGR", 5)

  return f(**kwargs)


Fold 1 done.


  return f(**kwargs)


Fold 2 done.


  return f(**kwargs)


Fold 3 done.


  return f(**kwargs)


Fold 4 done.


  return f(**kwargs)


Fold 5 done.
The 5-fold cross-validation f1 score for LR is 0.5297914513687948.
The 5-fold cross-validation f1 score for SVM is 0.5116674656075302.


In [14]:
cross_validation(essays, "cOPN", 5)

  return f(**kwargs)


Fold 1 done.


  return f(**kwargs)


Fold 2 done.


  return f(**kwargs)


Fold 3 done.


  return f(**kwargs)


Fold 4 done.


  return f(**kwargs)


Fold 5 done.
The 5-fold cross-validation f1 score for LR is 0.5865769338568414.
The 5-fold cross-validation f1 score for SVM is 0.5853457713928786.


In [15]:
cross_validation(essays, "cCON", 5)

  return f(**kwargs)


Fold 1 done.


  return f(**kwargs)


Fold 2 done.


  return f(**kwargs)


Fold 3 done.


  return f(**kwargs)


Fold 4 done.


  return f(**kwargs)


Fold 5 done.
The 5-fold cross-validation f1 score for LR is 0.5474437765020095.
The 5-fold cross-validation f1 score for SVM is 0.559477404630812.


In [16]:
cross_validation(essays, "cNEU", 5)

  return f(**kwargs)


Fold 1 done.


  return f(**kwargs)


Fold 2 done.


  return f(**kwargs)


Fold 3 done.


  return f(**kwargs)


Fold 4 done.


  return f(**kwargs)


Fold 5 done.
The 5-fold cross-validation f1 score for LR is 0.5381621810053062.
The 5-fold cross-validation f1 score for SVM is 0.5479719392535151.
