In [2]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import numpy as np

from IPython.display import display, HTML
# Inject a CSS rule into the notebook page that forces elements with the
# `container` class to 80% width (`!important` overrides the default rule).
display(HTML("<style>.container { width:80% !important; }</style>"))


# Publish the parent of the current working directory through the CWD
# environment variable (this is a path, not a credential).
os.environ['CWD'] = str(Path.cwd().parent)

# Base paths, read back from the environment so CWD stays the single source.
cwd = Path(os.environ['CWD'])
dir_data = cwd.joinpath('data')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import pandas as pd

# Read train.csv and test.csv from the data directory, tagging every row with
# the file it came from via a constant 'split' column.
df_tr_ev = pd.read_csv(dir_data / 'train.csv').assign(split='tr')
df_te = pd.read_csv(dir_data / 'test.csv').assign(split='te')

# Stack both frames into one.
# NOTE(review): concat keeps each frame's own row labels, and both frames are
# read without an index column, so labels repeat in `df`; pass
# ignore_index=True here if unique labels are ever needed.
df = pd.concat([df_tr_ev, df_te])

In [1]:
from sklearn.model_selection import train_test_split
# Hold out 25% of df_tr_ev for evaluation; stratifying on 'author' keeps the
# author proportions the same in both parts, and the fixed seed makes the
# split repeatable.
df_tr, df_ev = train_test_split(
    df_tr_ev,
    test_size=0.25,
    stratify=df_tr_ev['author'],
    random_state=288,
)

In [None]:
def get_text_stats_features(df):
    """Return the text-statistics columns of `df` as a 2-D NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in `stat_columns` below.

    Returns
    -------
    numpy.ndarray
        One row per row of `df`, one column per statistic, in the order
        listed in `stat_columns`.
    """
    stat_columns = [
        'polarity',
        'subjectivity',
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'gunning_fog',
        'automated_readability_index',
        'coleman_liau_index',
        'linsear_write_formula',
        'dale_chall_readability_score',
        'mcalpine_eflaw',
        'reading_time',
        'syllable_count',
        'lexicon_count',
        'char_count',
        'letter_count',
        'polysyllabcount',
        'monosyllabcount',
    ]
    selected = df[stat_columns]
    return selected.to_numpy()

# Extract the text-statistics matrix for the training and evaluation splits.
X_tr_stats, X_ev_stats = (
    get_text_stats_features(part) for part in (df_tr, df_ev)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Character-level TF-IDF over n-grams of 1 to 5 characters ('char_wb' builds
# n-grams only from text inside word boundaries). min_df=20 drops n-grams seen
# in fewer than 20 documents; max_df=0.7 drops those seen in more than 70%.
tfidf_char = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    min_df=20,
    max_df=0.7,
)

# Learn the vocabulary and IDF weights on the training split only, then apply
# the fitted vectorizer unchanged to the evaluation split.
X_tfidf_char_tr = tfidf_char.fit_transform(df_tr['text_processed'])
X_tfidf_char_ev = tfidf_char.transform(df_ev['text_processed'])

# Number of character n-grams that survived the min_df/max_df filters.
display(len(tfidf_char.vocabulary_))

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the character TF-IDF matrix to 1300 components. The decomposition is
# fitted on the training split and then applied as-is to the evaluation split.
tsvd_char = TruncatedSVD(
    n_components=1300,
    random_state=288,
)
X_tfidf_char_tsvd_tr = tsvd_char.fit_transform(X_tfidf_char_tr)
X_tfidf_char_tsvd_ev = tsvd_char.transform(X_tfidf_char_ev)

# Report the total explained-variance ratio kept by the fitted components.
print(f'Cumulative sum of explained variance ratio kept by tsvd_char: {round(tsvd_char.explained_variance_ratio_.cumsum()[-1], 4)}')

In [None]:
# Word-level TF-IDF over 1- to 3-word n-grams, read from the
# 'text_lemmatized_processed' column (presumably lemmatized upstream -- TODO
# confirm). min_df=10 drops n-grams seen in fewer than 10 documents;
# max_df=0.8 drops those seen in more than 80%.
tfidf_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=10,
    max_df=0.8,
)

# Fit on the training split only; the evaluation split is transformed with the
# vocabulary and IDF weights learned from training.
X_tfidf_word_tr = tfidf_word.fit_transform(df_tr['text_lemmatized_processed'])
X_tfidf_word_ev = tfidf_word.transform(df_ev['text_lemmatized_processed'])

# Number of word n-grams that survived the min_df/max_df filters.
display(len(tfidf_word.vocabulary_))


# Reduce the word TF-IDF matrix to 400 components. The decomposition is fitted
# on the training split and then applied as-is to the evaluation split.
tsvd_word = TruncatedSVD(
    n_components=400,
    random_state=288,
)
X_tfidf_word_tsvd_tr = tsvd_word.fit_transform(X_tfidf_word_tr)
X_tfidf_word_tsvd_ev = tsvd_word.transform(X_tfidf_word_ev)

# Report the total explained-variance ratio kept by the fitted components.
print(f'Cumulative sum of explained variance ratio kept by tsvd_word: {round(tsvd_word.explained_variance_ratio_.cumsum()[-1], 4)}')

In [None]:
from sklearn.preprocessing import RobustScaler
# Scaler used for the train/eval feature matrices. With its default arguments
# RobustScaler centres each feature on its median and scales it by its
# interquartile range.
scaler = RobustScaler()

In [None]:
# Assemble the dense feature matrices by placing, side by side, the text
# statistics, the SVD-reduced character TF-IDF and the SVD-reduced word TF-IDF.
# The previous code referenced `X_tfidf_tsvd_tr` / `X_tfidf_tsvd_ev`, which no
# cell defines (only the `_char_` and `_word_` variants exist), so it raised
# NameError on a fresh kernel.
# The column order must be identical for train and eval so that the scaler and
# any downstream model see the same feature layout.
X_tr = np.hstack([X_tr_stats, X_tfidf_char_tsvd_tr, X_tfidf_word_tsvd_tr])
X_ev = np.hstack([X_ev_stats, X_tfidf_char_tsvd_ev, X_tfidf_word_tsvd_ev])

In [None]:
# Learn the scaling statistics from the training matrix only, then apply that
# same fitted scaler to both splits so evaluation data never influences it.
scaler.fit(X_tr)
X_tr_scaled, X_ev_scaled = scaler.transform(X_tr), scaler.transform(X_ev)