<a href="https://colab.research.google.com/github/john-mai-2605/language-difficulty-prediction/blob/master/full_run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install cmudict
!pip install imblearn

Collecting cmudict
[?25l  Downloading https://files.pythonhosted.org/packages/fe/cf/4d24ac4f3ea5a57406a690ad7c07023c310185eac99adae7473c9ebdf550/cmudict-0.4.4-py2.py3-none-any.whl (938kB)
[K     |████████████████████████████████| 942kB 3.4MB/s 
[?25hInstalling collected packages: cmudict
Successfully installed cmudict-0.4.4


In [2]:
import pandas as pd
import re
import numpy as np
import cmudict
from sklearn import *
import matplotlib.pyplot as plt
from collections import Counter
import pickle
from sklearn.externals.joblib import parallel_backend

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import RobustScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import GridSearchCV

from sklearn.cluster import KMeans
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.linear_model import LinearRegression, SGDRegressor, PassiveAggressiveRegressor, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge

import xgboost as xgb 
import lightgbm as lgb



In [3]:
# clustering
n_clusters = 35
tag_cluster_error_rate = 'tag_cluster_error_rate'
tag_clustering_features = [
    'action', 'adventure', 'american', 'animal',
    'animation', 'australian', 'british', 'comedy',
    'cooking', 'documentary', 'drama', 'education',
    'english', 'fantasy', 'food', 'foreign accent',
    'interview', 'monologue', 'movie', 'news',
    'review', 'romance', 'sciencefiction', 'sitcom',
    'song', 'speech', 'superhero', 'tvseries',
    'talkshow', 'technology', 'thriller', 'trailer',
]
tag_cluster = 'tag_cluster'
corr_cluster_error_rate = 'corr_cluster_error_rate'
corr_clustering_features = ['elapse_time', 'speed', 'wpm', 'aveAmbiguity']
corr_cluster = 'corr_cluster'

feature_columns = ['length',
            'aveLength', 'maxLength', 'minLength',
            'aveFreq', 'maxFreq', 'minFreq', 
            'aveDepth', 'maxDepth', 'minDepth', 
            'aveDensity', 'minDensity', 'maxDensity',
            'aveAmbiguity', 'minAmbiguity', 'maxAmbiguity', 
            'wpm', 'elapse_time', 'speed',
            'noun', 'verb', 'adj', 'adv',
            'det', 'prep', 'norm',
            'action', 'adventure', 'american', 'animal',
            'animation', 'australian', 'british', 'comedy',
            'cooking', 'documentary', 'drama', 'education',
            'english', 'fantasy', 'food', 'foreign accent',
            'interview', 'monologue', 'movie', 'news',
            'review', 'romance', 'sciencefiction', 'sitcom',
            'song', 'speech', 'superhero', 'tvseries',
            'talkshow', 'technology', 'thriller', 'trailer',
            tag_cluster_error_rate, corr_cluster_error_rate,
            ]

In [4]:
class Dataset:
    def __init__(self, path):
        self.data = pd.read_csv(path)
        # self.X = self.data[schema]
        self.y = self.data['error_rate']
        self.scaler = RobustScaler()
        self.imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
        self.enc_oh = OneHotEncoder()
        self.enc_ord = OrdinalEncoder()
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=123)
    def split(self, test_ratio):
        return model_selection.train_test_split(self.data, self.y, test_size = test_ratio, random_state=42)
    
    def encode(self):
        pass
        # self.X['tag'] = self.enc_ord.fit_transform(self.X['tag'].to_numpy().reshape(-1, 1))
        # self.X['genreTag'] = self.enc_ord.fit_transform(self.X['genreTag'].to_numpy().reshape(-1, 1))

    def normalize(self, X_train, X_test):
        self.imputer.fit(X_train)
        X_train = self.imputer.transform(X_train)
        X_test = self.imputer.transform(X_test)
        self.scaler.fit(X_train) 
        return self.scaler.transform(X_train), self.scaler.transform(X_test)
    def select(self, X_train, y_train, X_test, k = 20):
        self.selector = SelectKBest(chi2, k)
        self.selector.fit(X_train, y_train)
        return self.selector.transform(X_train), self.selector.transform(X_test)
    def add_clustering_features(self, train, test, clustering_features, cluster_feature_name, normalize=False):
        clustering_X_train = train[clustering_features]
        clustering_X_test = test[clustering_features]
        if (normalize == True):
            scaler = MinMaxScaler()
            scaler.fit(clustering_X_train)
            train_for_clustering = scaler.transform(clustering_X_train)
            test_for_clustering = scaler.transform(clustering_X_test)
        # duplicate code to impute before clustering
        self.imputer.fit(clustering_X_train)
        clustering_X_train = self.imputer.transform(clustering_X_train)
        clustering_X_test = self.imputer.transform(clustering_X_test)
        #####
        self.kmeans.fit(clustering_X_train)
        train_labels = self.kmeans.predict(clustering_X_train)
        test_labels = self.kmeans.predict(clustering_X_test)
        train['cluster'] = train_labels
        test['cluster'] = test_labels
        cluster_to_median = {}
        for i in range(n_clusters):
            cluster_to_median[i] = train['error_rate'].loc[train['cluster'] == i].median()
        train[cluster_feature_name] = train['cluster'].apply(lambda c:cluster_to_median[c])
        test[cluster_feature_name] = test['cluster'].apply(lambda c:cluster_to_median[c])
        return [train, test]

In [5]:
class Regressor:
    def __init__(self, reg):
        self.reg = reg
    def fit(self, X, y, **kwargs):
        return self.reg.fit(X, y, **kwargs)
    def predict(self, X_test):
        return self.reg.predict(X_test)
    def evaluate(self, y_test, y_pred): 
        mse = metrics.mean_squared_error(y_test, y_pred)
        r2 = metrics.r2_score(y_test, y_pred)
        print(np.sqrt(mse), r2)
        clf_test = [map_to_class(y) for y in y_test]
        clf_pred = [map_to_class(y) for y in y_pred]
        print(metrics.confusion_matrix(clf_test, clf_pred))
        print(metrics.classification_report(clf_test, clf_pred))
    def tune(self, X_train, Y_train, param_grid,
             n_folds = 10, result_filename = 'reg_tuning_results.csv'):
        grid_cv = GridSearchCV(estimator = self.reg,
                           param_grid = param_grid,
                           cv = n_folds,
                           scoring = 'neg_mean_squared_error',
                           verbose = 1,
                           n_jobs = -1
                          )
        grid_cv.fit(X_train, Y_train)
        # save results
        tuning_results = pd.DataFrame(grid_cv.cv_results_)
        tuning_results.to_csv(result_filename)

        # set best params
        best_params = grid_cv.best_params_
        self.reg.set_params(**best_params)

    def run(self, X, y, X_test, y_test, **kwargs):
        fitted_model = self.fit(X, y, **kwargs)
        y_pred = self.predict(X_test)
        self.evaluate(y_test, y_pred)
        try:
            importance = list(zip(fitted_model.feature_importances_, feature_columns))
            importance.sort(reverse=True)
            print(importance)
        except:
            pass
        return y_pred
    def select(self, X_train, X_test, k = 50):
        self.selector = SelectFromModel(self.reg, prefit = True, threshold = -np.inf, max_features = k)
        return self.selector.transform(X_train), self.selector.transform(X_test)

In [6]:
def map_to_class(score, s1 = 2/3, s2 = 1/3):
    if score > s1:
        return 2
    if score > s2:
        return 1
    return 0

In [7]:
data = Dataset('./processed_data.csv')
# data.encode()
# data.selection()
train, test, y_train, y_test = data.split(0.2)
# add tag clustering feature
train, test = data.add_clustering_features(train, test, tag_clustering_features, tag_cluster_error_rate)
# add correlation based clustering feature
train, test = data.add_clustering_features(train, test, corr_clustering_features, corr_cluster_error_rate, normalize = True)

X_train = train[feature_columns]
X_test = test[feature_columns]
print(X_train[tag_cluster_error_rate].describe())
print(X_train[corr_cluster_error_rate].describe())
print(X_test[tag_cluster_error_rate].describe())
print(X_test[corr_cluster_error_rate].describe())
print('features', feature_columns)
print('train size', X_train.shape)
print('test size', X_test.shape)
X_train, X_test = data.normalize(X_train, X_test)
y_train_clf = [map_to_class(y) for y in y_train]
y_test_clf = [map_to_class(y) for y in y_test]

#X_train, X_test = data.select(X_train, y_train, X_test, 20)

X = np.insert(X_train, 0, y_train, axis=1)
smote = SMOTE(random_state=27)
y_train_clf = [map_to_class(y) for y in y_train]
X_new, _ = smote.fit_resample(X, y_train_clf)
X_train, y_train = X_new[:, 1:], X_new[:,[0]].flatten()

y_train_clf = [map_to_class(y) for y in y_train]
y_test_clf = [map_to_class(y) for y in y_test]
print(sorted(Counter(y_train_clf).items()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

count    19386.000000
mean         0.354542
std          0.082364
min          0.144429
25%          0.283871
50%          0.339087
75%          0.431692
max          0.587882
Name: tag_cluster_error_rate, dtype: float64
count    19386.000000
mean         0.360312
std          0.184167
min          0.010328
25%          0.196970
50%          0.377674
75%          0.532843
max          0.640907
Name: corr_cluster_error_rate, dtype: float64
count    4847.000000
mean        0.354840
std         0.082620
min         0.144429
25%         0.283871
50%         0.339087
75%         0.431692
max         0.587882
Name: tag_cluster_error_rate, dtype: float64
count    4847.000000
mean        0.361704
std         0.183579
min         0.010328
25%         0.196970
50%         0.377674
75%         0.532843
max         0.640907
Name: corr_cluster_error_rate, dtype: float64
features ['length', 'aveLength', 'maxLength', 'minLength', 'aveFreq', 'maxFreq', 'minFreq', 'aveDepth', 'maxDepth', 'minDepth', 'a



[(0, 9165), (1, 9165), (2, 9165)]


In [None]:
models = [
        ElasticNet(), Lasso(), BayesianRidge(),
        LassoLarsIC(), RandomForestRegressor(), HuberRegressor(), 
        svm.SVR(), neural_network.MLPRegressor(), LinearRegression(), 
        SGDRegressor(), PassiveAggressiveRegressor(), 
        xgb.XGBRegressor(), lgb.LGBMRegressor(), GradientBoostingRegressor()
        ]
EN_param_grid = {'alpha': [0.001, 0.01, 0.0001], 'copy_X': [True], 'l1_ratio': [0.6, 0.7], 'fit_intercept': [True], 'normalize': [False], 
                         'precompute': [False], 'max_iter': [300, 3000], 'tol': [0.001], 'selection': ['random', 'cyclic'], 'random_state': [None]}
LASS_param_grid = {'alpha': [0.001, 0.0001, 0.00001, 0.000001], 'copy_X': [True], 'fit_intercept': [True], 'normalize': [False], 'precompute': [False], 
                    'max_iter': [3000], 'tol': [0.1, 0.01, 0.001], 'selection': ['random'], 'random_state': [42]}
GB_param_grid = {'loss': ['huber'], 'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [3000], 'max_depth': [3, 10], 
                                        'min_samples_split': [0.0025], 'min_samples_leaf': [5]}
BR_param_grid = {'n_iter': [200, 1000], 'tol': [0.00001, 0.0001], 'alpha_1': [0.00000001, 0.00000005], 'alpha_2': [0.000005, 0.00001], 'lambda_1': [0.000005, 0.00001], 
                 'lambda_2': [0.00000001, 0.00000005], 'copy_X': [True]}
LL_param_grid = {'criterion': ['aic'], 'normalize': [True], 'max_iter': [100, 1000], 'copy_X': [True], 'precompute': ['auto'], 'eps': [0.000001, 0.00001, 0.0001]}
RFR_param_grid = {'n_estimators': [50, 500], 'max_features': ['auto'], 'max_depth': [None], 'min_samples_split': [5], 'min_samples_leaf': [2]}
XGB_param_grid = {'learning_rate': [0.1], 'n_estimators': [10000]}
LGB_param_grid = {'objective': ['regression'], 'learning_rate': [0.05, 0.1, 0.5], 'n_estimators': [300, 3000]}
SVR_param_grid = {'kernel': ['rbf']} #['linear', 'poly', 'rbf', 'sigmoid']}
MLP_param_grid = {'hidden_layer_sizes': [(100,), (100, 10)], 'random_state': [42], 'max_iter': [100, 1000],  'alpha': [0.01, 0.001, 0.0001]}
LR_param_grid = {}
GDR_param_grid = {
                'max_iter': 5000,
                'loss': ['squared_loss'],
                'penalty': ['l2', 'elasticnet', 'l1'],
                'l1_ratio': [0.7, 0.3],
                'learning_rate': ['optimal'],
                'alpha': [1e-01, 1e-2],
                'epsilon': [1e-01],
                'tol': [0.001, 0.003],
                'eta0': [0.01],
                'power_t': [0.5]}
PAR_param_grid = {'whiten': [True, False], 'loss': ['squared_epsilon_insensitive', 'epsilon_insensitive'], 
                  'C': [0.001, 0.005, 0.003], 'max_iter': [1000], 'epsilon': [0.00001, 0.00005],
                  'tol': [1e-03, 1e-05]}
HR_param_grid = {'max_iter': [2000], 'alpha': [0.0001, 1e-05,], 
               'tol': [1e-01, 1e-02]}
params_grids = [
                EN_param_grid, LASS_param_grid, BR_param_grid, 
                LL_param_grid, RFR_param_grid, HR_param_grid,
                SVR_param_grid, MLP_param_grid, LR_param_grid, 
                GDR_param_grid, PAR_param_grid, 
                XGB_param_grid, LGB_param_grid, GB_param_grid
                ]
regs = []
params = []

for model, param_grid in zip(models, params_grids):
    print(model.__class__.__name__)
    regressor = Regressor(model)
    print('Start tuning')
    with parallel_backend('threading'):
      regressor.tune(X_train, y_train, param_grid, result_filename = model.__class__.__name__ + ' hyperparameters.csv')
    print('Finish tuning')
    params.append(regressor.reg.get_params())
    y = regressor.run(X_train, y_train, X_test, y_test)
    regs.append(regressor)

with open('regs.pkl', 'wb') as f:
    pickle.dump(regs, f)
with open('params.pkl', 'wb') as f:
    pickle.dump(params, f)

ElasticNet
Start tuning
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.3min finished


Finish tuning
0.1481723513869192 0.6793531217693163
[[1619  640   16]
 [ 162 1383  171]
 [   9  391  456]]
              precision    recall  f1-score   support

           0       0.90      0.71      0.80      2275
           1       0.57      0.81      0.67      1716
           2       0.71      0.53      0.61       856

    accuracy                           0.71      4847
   macro avg       0.73      0.68      0.69      4847
weighted avg       0.75      0.71      0.72      4847

Lasso
Start tuning
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.5min finished


Finish tuning
0.14814994905475032 0.6794500723108605
[[1626  634   15]
 [ 164 1379  173]
 [   9  393  454]]
              precision    recall  f1-score   support

           0       0.90      0.71      0.80      2275
           1       0.57      0.80      0.67      1716
           2       0.71      0.53      0.61       856

    accuracy                           0.71      4847
   macro avg       0.73      0.68      0.69      4847
weighted avg       0.75      0.71      0.72      4847

BayesianRidge
Start tuning
Fitting 10 folds for each of 64 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   26.6s


In [None]:
# Stacking
estimators = [('bt', BT.reg), ('lgb', lg.reg), ('gb', gb.reg)]
reg = ensemble.StackingRegressor(estimators=estimators, final_estimator=rf.reg, n_jobs = -1, passthrough = True)
stack = Regressor(reg)
y = stack.run(X_train, y_train, X_test, y_test)
# with open ('y_pred_regs.pkl', 'rb') as f:
# 	y_preds = pickle.load(f)
with open ('regs.pkl', 'rb') as f:
	regs = pickle.load(f)

y_preds.append(y)
regs.append(y)
val = input("Enter to save, Ctrl+C to stop") 
# with open ('y_pred_regs.pkl', 'wb') as f:
# 	pickle.dump(y_preds, f)
with open ('regs.pkl', 'wb') as f:
	pickle.dump(regs, f)

In [None]:
print("---------Averaging Blending Regressor---------")

with open ('regs.pkl', 'rb') as f:
	regs = pickle.load(f)
y_pred_vals = [reg.predict(X_val) for reg in regs]
weights = [0.5, 0, 0, 0, 0.5, 0]
y_pred_val = sum(x*y for x, y in zip(weights, y_pred_vals))
clf_val = [map_to_class(y) for y in y_pred_val]
print(metrics.confusion_matrix(y_val_clf, clf_val))
print(metrics.classification_report(y_val_clf, clf_val))

y_pred_tests = [reg.predict(X_test) for reg in regs]
weights = [0.5, 0, 0, 0, 0.5, 0]
y_pred_test = sum(x*y for x, y in zip(weights, y_pred_tests))
clf_test = [map_to_class(y) for y in y_pred_test]
print(metrics.confusion_matrix(y_test_clf, clf_test))
print(metrics.classification_report(y_test_clf, clf_test))

y_pred_tests = [reg.predict(X_test) for reg in regs]
weights = [0.5, 0, 0, 0, 0.5, 0]
y_pred_test = sum(x*y for x, y in zip(weights, y_pred_tests))
clf_test = [map_to_class(y) for y in y_pred_test]
print(metrics.confusion_matrix(y_test_clf, clf_test))
print(metrics.classification_report(y_test_clf, clf_test))
print("---------Blending Regressor---------")
X_val = np.append(X_val, np.asarray(y_pred_vals).T, axis = 1)
X_test = np.append(X_test, np.asarray(y_pred_tests).T, axis = 1)
model = Regressor(linear_model.LinearRegression())
model.run(X_val, y_val, X_test, y_test)

In [None]:
with open ('regs.pkl', 'rb') as f:
	regs = pickle.load(f)
tree_model = regs[0]
base_model = regs[2]
X_lr_train = tree_model.reg.apply(X_train)
X_lr_test = tree_model.reg.apply(X_test)
X_train = np.append(X_train, X_lr_train, axis=1)
X_test = np.append(X_test, X_lr_test, axis=1)
X_train = X_lr_train
X_test = X_lr_test

base_model.run(X_lr_train, y_train, X_lr_test, y_test)