In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor


In [2]:
df = pd.read_csv(os.path.join('data', 'features.csv'))
df.head()


Unnamed: 0,content,wording,keyword_overlap,summary_lengths,vocab_size,Average_sentence_lengths_ratio,objectivity,Semantic_similarity,flesch_reading_ease,spell_errors
0,0.205683,0.380538,4,43,36,1.075,0.835,0.854158,1.136179,2
1,-0.548304,0.506755,5,25,19,1.785714,0.947,0.889245,1.319293,0
2,3.128928,4.231226,19,177,119,2.011364,0.816,0.877361,0.906089,3
3,-0.210614,-0.471415,3,18,16,1.227273,1.0,0.706003,0.944525,4
4,3.272894,3.219757,12,151,101,1.078571,0.898,0.86676,1.150467,11


In [3]:
X = df.iloc[:, 2:]
y = df.iloc[:, :2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5732, 8), (1433, 8), (5732, 2), (1433, 2))

In [4]:
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


In [5]:
def mcrmse(y_true, y_pred):
  colwise_mse = np.mean(np.square(y_true - y_pred), axis=1)
  return np.mean(np.sqrt(colwise_mse))

mcrmse_scorer = make_scorer(mcrmse, greater_is_better=False)

### Tree Decision Regressor

In [6]:
tree = DecisionTreeRegressor()

n_scores = cross_val_score(
    tree, 
    X_train_scale, 
    y_train, 
    cv=5,
    scoring=mcrmse_scorer,
    verbose=2,
    n_jobs=-1
)

n_scores = np.absolute(n_scores)
print(f'scores: {n_scores}')
print(f'Mean score: {np.mean(n_scores): .4f} STD: {np.std(n_scores): .4f}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


scores: [0.69351286 0.7078456  0.6739986  0.70124276 0.71646035]
Mean score:  0.6986 STD:  0.0144


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


In [8]:
tree.fit(X_train_scale, y_train)
y_pred = tree.predict(X_test_scale)
score = mcrmse(y_test, y_pred)
print(f'MCRMSE = {score}')

MCRMSE = 0.697814603686775


### Random Forest Regressor

In [10]:
rf = RandomForestRegressor()
wrapper = MultiOutputRegressor(rf)

n_scores = cross_val_score(
    wrapper, 
    X_train_scale, 
    y_train, 
    cv=5,
    scoring=mcrmse_scorer,
    verbose=2,
    n_jobs=-1
)

n_scores = np.absolute(n_scores)
print(f'scores: {n_scores}')
print(f'Mean score: {np.mean(n_scores): .4f} STD: {np.std(n_scores): .4f}')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


scores: [0.50917815 0.51239588 0.49370415 0.51559213 0.51227835]
Mean score:  0.5086 STD:  0.0077


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.3s finished


In [11]:
wrapper.fit(X_train_scale, y_train)
y_pred = wrapper.predict(X_test_scale)
score = mcrmse(y_test, y_pred)
print(f'MCRMSE = {score}')

MCRMSE = 0.5132062094658244


### Linear Regression

In [12]:
lr = LinearRegression()

n_scores = cross_val_score(
    lr, 
    X_train_scale, 
    y_train, 
    cv=5,
    scoring=mcrmse_scorer,
    verbose=2,
    n_jobs=-1
)

n_scores = np.absolute(n_scores)
print(f'scores: {n_scores}')
print(f'Mean score: {np.mean(n_scores): .4f} STD: {np.std(n_scores): .4f}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


scores: [0.60209051 0.60071544 0.57630361 0.59440548 0.59800217]
Mean score:  0.5943 STD:  0.0094


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished


In [13]:
lr.fit(X_train_scale, y_train)
y_pred = lr.predict(X_test_scale)
score = mcrmse(y_test, y_pred)
print(f'MCRMSE = {score}')

MCRMSE = 0.5928096875186115
