In [1]:
import numpy as np
import pandas as pd
import math
from tools import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

  pd.set_option('display.max_colwidth', -1)


In [2]:
def add_creative_features(dataframe):
    """
    This function applies the tools.py functions I wrote into every excerpt.
    """
    dataframe['num_punct_marks'] = dataframe['excerpt'].apply(num_punct_marks)
    dataframe['num_uniq_words'] = dataframe['excerpt'].apply(num_unique_words)
    dataframe['avg_word_len'] = dataframe['excerpt'].apply(avg_word_len)
    dataframe['rarity'] = dataframe['excerpt'].apply(rarity)


def add_clasic_test(dataframe):
    """
    This function applies all the publicly available readability test into every excerpt.
    """
    clasical_complexity_tests = {'fre_test': textstat.flesch_reading_ease,
                                 'fkg_test': textstat.flesch_kincaid_grade,
                                 'gf_test': textstat.gunning_fog,
                                 'si_test': textstat.smog_index,
                                 'dcrs_test': textstat.dale_chall_readability_score}

    # Creating text complexity feature for every test:
    for test in clasical_complexity_tests.keys():
        test_func = clasical_complexity_tests[test]

        dataframe[test] = dataframe['excerpt'].apply(lambda value: test_func(value))

## Final Model

* After a lot of iteration and testing, this is the best model I have found so far.
* The main improvement here is adding the TF-IDF (Term frequency - Inverse Frequency) pre-procesing to the mix.
* Also I tweak the hyperparameters of best models to find the best combination.

#### I'm going to import the train.csv again to mantain order:

In [3]:
train = pd.read_csv('train.csv', usecols=['id', 'excerpt', 'target'])
validation = pd.read_csv('validation.csv', usecols=['id', 'excerpt', 'target'])

In [4]:
add_creative_features(train)
add_clasic_test(train)

add_creative_features(validation)
add_clasic_test(validation)

In [5]:
# Experiment 0: tfidf alone
X_train = train['excerpt']
y_train = train['target']

X_val = validation['excerpt']
y_val = validation['target']

models = {'MLPRegressor': MLPRegressor(),
          'SVR': SVR(kernel = 'rbf'),
          'Ridge': Ridge()}

for model, regr in models.items():
    pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('rgr', regr)])
    pipeline.fit(X_train, y_train)
    
    # validation predictions
    preds = pipeline.predict(X_val)
    
    # metrics
    r2 = round(pipeline.score(X_val, y_val), 3)
    mae = round(mean_absolute_error(y_val, preds), 3)
    rmse = round(math.sqrt(mean_squared_error(y_val, preds)), 3)
    
    # Printing results
    print(f'{model} vs target... R2: {r2}')
    print(f'{model}  vs target... MAE: {mae}')
    print(f'{model}  vs target... RMSE: {rmse}', '\n')

MLPRegressor vs target... R2: 0.51
MLPRegressor  vs target... MAE: 0.594
MLPRegressor  vs target... RMSE: 0.712 

SVR vs target... R2: 0.504
SVR  vs target... MAE: 0.59
SVR  vs target... RMSE: 0.717 

Ridge vs target... R2: 0.542
Ridge  vs target... MAE: 0.577
Ridge  vs target... RMSE: 0.689 



In [6]:
# Experiment 1: joining previous output as input + normal variables
train['ridge_preds'] = pipeline.predict(X_train)
validation['ridge_preds'] = pipeline.predict(X_val)

variables = ['rarity', 'avg_word_len', 'fre_test', 'dcrs_test', 'ridge_preds']

# Let's iterate over every candidate model, train it and compare results

models = {'MLPRegressor': MLPRegressor(max_iter=1000, learning_rate='adaptive', early_stopping=True),
          'SVR': SVR(kernel='rbf', C=10),
          'Ridge': Ridge(alpha=2)}

for model, regressor in models.items():
    # training:
    X_train = train[variables].values
    
    # fitting model
    regressor.fit(X_train, y_train)
    
    # checking the model results in the validation set
    X_test = validation[variables].values
    X_test_pred = regressor.predict(X_test)
    
    # metrics
    r2 = round(regressor.score(X_test, y_val), 3)
    mae = round(mean_absolute_error(y_val, X_test_pred), 3)
    rmse = round(math.sqrt(mean_squared_error(y_val, X_test_pred)), 3)
    
    # Printing results
    print(f'{model} vs target... R2: {r2}')
    print(f'{model}  vs target... MAE: {mae}')
    print(f'{model}  vs target... RMSE: {rmse}', '\n')

MLPRegressor vs target... R2: 0.529
MLPRegressor  vs target... MAE: 0.584
MLPRegressor  vs target... RMSE: 0.698 

SVR vs target... R2: 0.546
SVR  vs target... MAE: 0.574
SVR  vs target... RMSE: 0.686 

Ridge vs target... R2: 0.508
Ridge  vs target... MAE: 0.6
Ridge  vs target... RMSE: 0.714 

