In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from drop_columns import DropColumnsCV

In [2]:
df = pd.read_csv(os.path.join('data', 'features.csv'))
df.head()


Unnamed: 0,content,wording,keyword_overlap,summary_lengths,vocab_size,Average_sentence_lengths_ratio,objectivity,Semantic_similarity,flesch_reading_ease,spell_errors
0,0.205683,0.380538,4,43,36,1.075,0.835,0.854158,1.136179,2
1,-0.548304,0.506755,5,25,19,1.785714,0.947,0.889245,1.319293,0
2,3.128928,4.231226,19,177,119,2.011364,0.816,0.877361,0.906089,3
3,-0.210614,-0.471415,3,18,16,1.227273,1.0,0.706003,0.944525,4
4,3.272894,3.219757,12,151,101,1.078571,0.898,0.86676,1.150467,11


In [3]:
X = df.iloc[:, 2:]
y = df.iloc[:, :2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5732, 8), (1433, 8), (5732, 2), (1433, 2))

In [4]:
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


In [5]:
def mcrmse(y_true, y_pred):
  colwise_mse = np.mean(np.square(y_true - y_pred), axis=1)
  return np.mean(np.sqrt(colwise_mse))

mcrmse_scorer = make_scorer(mcrmse, greater_is_better=False)

### Random Forest Regressor

In [6]:
rf = RandomForestRegressor()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__bootstrap': bootstrap}

search = RandomizedSearchCV(estimator=MultiOutputRegressor(rf),
                            param_distributions=random_grid, 
                            n_iter=100,
                            scoring=mcrmse_scorer, 
                            cv=5, 
                            verbose=5,
                            n_jobs=-1, 
                            random_state=42)

search.fit(X_train_scale, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [6]:
print(f'Best score: {abs(search.best_score_)}')
print(f'Best parameters: {search.best_params_}')

model = search.best_estimator_
wrapper = MultiOutputRegressor(model)

NameError: name 'search' is not defined

In [8]:
model.fit(X_train_scale, y_train)
y_pred = model.predict(X_test_scale)
score = mcrmse(y_test, y_pred)
print(f'MCRMSE = {score}')

MCRMSE = 0.5073236339417416


In [11]:
with open(os.path.join('data', 'rfr_parameters.json'), 'w') as FILE:
    json.dump(search.best_params_, FILE)

In [10]:
with open(os.path.join('data', 'rfr_parameters.json'), 'r') as FILE:
    params = json.load(FILE)

edited_params = {}
for key, value in params.items():
    key = key.split('__')[1]
    edited_params[key] = value



In [8]:
model = MultiOutputRegressor(RandomForestRegressor(**edited_params))
col_search = DropColumnsCV(model, X_train, y_train, scaler, mcrmse_scorer, 7)

col_search.eval_combinations()

Fitting 5 folds over 254 candidates, total 1270

Candidate 1
Index(['summary_lengths', 'vocab_size', 'Average_sentence_lengths_ratio',
       'objectivity', 'Semantic_similarity', 'flesch_reading_ease',
       'spell_errors'],
      dtype='object')
Mean score:  0.5234 STD:  0.0088

Candidate 2
Index(['keyword_overlap', 'vocab_size', 'Average_sentence_lengths_ratio',
       'objectivity', 'Semantic_similarity', 'flesch_reading_ease',
       'spell_errors'],
      dtype='object')
Mean score:  0.5176 STD:  0.0141

Candidate 3
Index(['keyword_overlap', 'summary_lengths', 'Average_sentence_lengths_ratio',
       'objectivity', 'Semantic_similarity', 'flesch_reading_ease',
       'spell_errors'],
      dtype='object')
Mean score:  0.5098 STD:  0.0113

Candidate 4
Index(['keyword_overlap', 'summary_lengths', 'vocab_size', 'objectivity',
       'Semantic_similarity', 'flesch_reading_ease', 'spell_errors'],
      dtype='object')
Mean score:  0.5225 STD:  0.0123

Candidate 5
Index(['keyword_over

In [14]:
model2 = MultiOutputRegressor(RandomForestRegressor(**edited_params))

X_train = X_train[col_search.best_features_]
X_train_scale = scaler.fit_transform(X_train)
X_test = X_test[col_search.best_features_]
X_test_scale = scaler.transform(X_test)

model2.fit(X_train_scale, y_train)

In [15]:
y_pred = model2.predict(X_test_scale)
score = mcrmse(y_test, y_pred)
print(f'MCRMSE = {score}')

MCRMSE = 0.5116162378502621
