# This script looks at the best features after feature selection and combines features to generate best models of combined features. 


# In [1]:
# general packages 
import pandas as pd
import pickle
import copy
import numpy as np
import os
import matplotlib as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from collections import Counter

# pipeline packages 
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# do not truncate long cell contents (e.g. feature-name lists) when displaying DataFrames
pd.set_option('display.max_colwidth', None)

# Split Data

# In [4]:
def split_data(data, feats, target, random_state=None):
    """
    This function splits data into train (80%) and test (20%) sets, stratified on the binned target.

    The number of bin edges is derived from the data: the number of distinct target values that
    occur more than once, minus one. The edges are spread evenly over [0, 1]
    (assumes the target is scaled to the 0-1 range -- TODO confirm against the caller).

    Inputs:
    data (pandas df): target dataframe
    feats (list): target feature names as strings
    target (string): CRT target feature (numeric, conceptual, or both)
    random_state (integer): optional seed forwarded to train_test_split for a reproducible split
                            (default None keeps the split unseeded)

    Outputs (returned as a tuple in exactly this order):
    X_train (pandas df): X training data (80%)
    X_test (pandas df): X test data (20%)
    Y_train (pandas series): Y training data (80%)
    Y_test (pandas series): Y test data (20%)
    """
    # copy only the selected feature columns instead of deep-copying the whole frame
    X = data.loc[:, feats].copy()
    Y = data[target]

    # count the distinct target values that appear more than once
    repeated_values = int((Y.value_counts() > 1).sum())
    # never go below zero: np.linspace raises on a negative number of samples
    bin_count = max(repeated_values - 1, 0)

    bins = np.linspace(0, 1, bin_count)  # evenly spaced bin edges over 0-1
    y_binned = np.digitize(Y, bins)  # index of the bin each value in Y (CRT score) belongs to

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, stratify=y_binned, random_state=random_state)
    return (X_train, X_test, Y_train, Y_test)

# Helper Functions

# In [5]:
def get_new_params(all_best_params, best_params):
    """
    This function adds the best parameters from one round of the pipeline to the running
    dictionary of best parameters. The running dictionary is updated in place and also returned.

    Inputs:
    all_best_params (dictionary): dictionary of lists of best parameters after each round of pipeline
    best_params (dictionary): dictionary of best parameters after one round of pipeline;
                              None or an empty dictionary leaves all_best_params unchanged

    Outputs:
    all_best_params (dictionary): dictionary of lists of best parameters after each round of pipeline, after adding best_params
    """
    # nothing to record (e.g. a grid-search round reports no best params)
    if not best_params:
        return all_best_params

    for key, val in best_params.items():
        # create the list on first sight so parameters that were not pre-seeded do not raise KeyError
        all_best_params.setdefault(key, []).append(val)
    return all_best_params


# In [6]:
def most_frequent(my_list):
    """
    This function returns the (up to) three most common values of a list, most common first.

    Inputs:
    my_list (list): list of hashable values

    Outputs:
    result (list): the three most frequent values from my_list
                   (fewer than three when my_list has fewer than three distinct items)
    """
    top_three = Counter(my_list).most_common(3)
    # most_common yields (value, count) pairs; only the values are wanted
    return [value for value, _count in top_three]

# Pipeline

# In [7]:
def get_model(model):
    """
    This function returns a new, default-configured instance of the selected model.

    Inputs:
    model (string): name of target model; one of 'ridge', 'lasso', 'rfr'

    Outputs:
    my_model (sklearn model object): model of choice

    Raises:
    KeyError: if model is not one of the supported names
    """
    # map names to classes so only the requested estimator is instantiated
    model_classes = {
        'ridge': Ridge,
        'lasso': Lasso,
        'rfr': RandomForestRegressor,
    }

    try:
        model_class = model_classes[model]
    except KeyError:
        # keep the exception type, but tell the caller which names are valid
        raise KeyError(
            "Unknown model '{}'; expected one of {}".format(model, sorted(model_classes))
        ) from None

    return model_class()

# In [8]:
def model_pipeline(data_train, data_test, feats, target, my_model, params, iters, randomsearch, random_state=None):
    """
    This function tunes the model of choice with 3-fold cross-validation on the training data and
    evaluates the refit best estimator on the test data.

    The estimator is a Pipeline of PolynomialFeatures (to capture interactions between features)
    followed by the model.
    NOTE(review): the pipeline contains no StandardScaler step, although standardization was
    previously documented here -- confirm whether scaling is intended.
    NOTE(review): the grid search scores with 'neg_mean_squared_error' while the randomized search
    uses the estimator's default scoring, so best_score is not comparable between the two modes.

    Inputs:
    data_train (pandas df): training data containing the feats and target columns
    data_test (pandas df): test data containing the feats and target columns
    feats (list): target feature names as strings
    target (string): CRT target feature (numeric, conceptual, or both)
    my_model (string): name of model of choice (see get_model)
    params (dictionary): parameter grid / distributions for the pipeline ('poly__...' and 'model__...' keys)
    iters (integer): not used by this function; kept so existing callers keep working
    randomsearch (boolean): if True, run RandomizedSearchCV (100 iterations), otherwise GridSearchCV
    random_state (integer): seed for the randomized search; None keeps the historical seed of 17

    Outputs:
    r (float): r value from Pearson correlation between clipped test predictions and the test target
    p (float): p value from the same Pearson correlation
    best_score (float): mean cross-validated score of the best parameter setting
    best_params (dictionary): best parameter setting when randomsearch is True, otherwise None
    """
    pipe = Pipeline([
        ('poly', PolynomialFeatures()),
        ('model', get_model(my_model)),
    ])

    X_train, Y_train = data_train[feats], data_train[target]
    X_test, Y_test = data_test[feats], data_test[target]

    if randomsearch:
        # honor the caller's seed; fall back to the previously hard-coded one
        search_seed = 17 if random_state is None else random_state
        grid = RandomizedSearchCV(pipe, param_distributions=params, n_iter=100, cv=3,
                                  verbose=0, random_state=search_seed, n_jobs=-1)
    else:
        grid = GridSearchCV(pipe, param_grid=params, cv=3, n_jobs=-1, verbose=0,
                            scoring='neg_mean_squared_error')

    # fit
    grid.fit(X_train, Y_train)

    # predict, then clip predictions to the 0-1 range before correlating
    y_pred = np.clip(grid.predict(X_test), 0, 1)
    r, p = pearsonr(y_pred, Y_test)

    best_params = grid.best_params_ if randomsearch else None

    return (r, p, grid.best_score_, best_params)

# In [27]:
def find_best_model(data_train, data_test, selection_df, target, model, params, iters, randomsearch, random_state=None):
    """
    This function adds features one at a time, in the row order of selection_df, and runs the
    pipeline on each cumulative feature set, so the combination with the highest r value can be picked.

    Inputs:
    data_train (pandas df): training data
    data_test (pandas df): test data
    selection_df (pandas df): feature selection results with a 'feature' column (feature name) and a
                              'target_feats' column (array of predictor column names); rows with
                              missing values are dropped
    target (string): CRT target feature (numeric, conceptual, or both)
    model (string): name of model of choice
    params (dictionary): parameters for pipeline
    iters (integer): forwarded to model_pipeline
    randomsearch (boolean): if True, run RandomSearchCV
    random_state (integer): forwarded to model_pipeline

    Outputs:
    best_model_results (pandas df): one row per cumulative feature set, with columns
                                    {features, predictors, r_value, p_value, score}
    all_best_params (dictionary): when randomsearch is True, each tuned parameter mapped to its (up to)
                                  three most frequent best values across rounds; parameters that were
                                  never tuned are omitted so the result is a valid search grid.
                                  When randomsearch is False, the template of empty lists.
    """
    feats = []
    feat_names = []
    results = []

    selection_df = selection_df.dropna().reset_index()

    all_best_params = {
        'model__bootstrap': [],
        'model__max_depth': [],
        'model__max_features': [],
        'model__min_samples_leaf': [],
        'model__min_samples_split': [],
        'model__n_estimators': [],
        'poly__degree': [],
    }

    for predictors, feat_name in zip(selection_df['target_feats'], selection_df['feature']):
        feat_names.append(feat_name)
        feats += predictors.tolist()

        # run model on the features accumulated so far
        r, p, best_score, best_params = model_pipeline(data_train, data_test, feats, target, model,
                                                       params, iters, randomsearch, random_state)

        if randomsearch:
            # tuned parameters outside the template above must not raise KeyError
            for key in best_params:
                all_best_params.setdefault(key, [])
            all_best_params = get_new_params(all_best_params, best_params)

        # snapshot the growing lists so later rounds do not alter this row
        results.append([list(feat_names), list(feats), r, p, best_score])

    best_model_results = pd.DataFrame(results, columns=['features', 'predictors', 'r_value', 'p_value', 'score'])

    if randomsearch:
        # drop parameters that were never tuned: an empty list is not a valid grid entry
        all_best_params = {k: most_frequent(v) for k, v in all_best_params.items() if v}

    return (best_model_results, all_best_params)


# In [10]:
def get_best_models(data_train, data_test, selection_df, target, model, params, iters):
    """
    This function is the entry point around find_best_model and returns the feature-combination results.

    For the 'rfr' model a randomized search runs first (with iters=1) and the most frequent best
    parameters it reports become the grid for the final grid search. Every other model goes
    straight to the grid search with the given params.

    Inputs:
    data_train (pandas df): training data
    data_test (pandas df): test data
    selection_df (pandas df): feature selection results
    target (string): CRT target feature (numeric, conceptual, or both)
    model (string): name of model of choice
    params (dictionary): parameters for pipeline
    iters (integer): forwarded to find_best_model for the final grid search

    Outputs:
    best_model_results (pandas df): results from combining features, containing information on {feature names,
                                    predictor names, and r values}
    """
    grid_params = params

    if model == 'rfr':
        # narrow the random-forest search space first; the random-search results themselves are not kept
        _, grid_params = find_best_model(data_train, data_test, selection_df, target, model, params, 1,
                                         randomsearch=True)

    best_model_results, _ = find_best_model(data_train, data_test, selection_df, target, model, grid_params,
                                            iters, randomsearch=False)
    return best_model_results
        

# Plot and Save Results

# In [11]:
def plot(best_model_results, target, path, my_model, top, dataframe_type, display=True):
    """
    This function sorts the best model results by decreasing r value, saves them, and plots them.

    Two files are written into the folder given by path (created if missing):
    'best_models.pickle' (the sorted dataframe) and 'best_models.html' (the chart).

    Inputs:
    best_model_results (pandas df): dataframe with best model results
    target (string): CRT target feature (numeric, conceptual, or both)
    path (string): path to best model results folder
    my_model (string): name of model used
    top (integer): plot top number of feature combinations
    dataframe_type (string): dropped or imputed
    display (boolean): if True, display chart

    Outputs:
    best_model_results (pandas df): dataframe with best model results, sorted by decreasing r value
    chart (plotly): plot of best model results, sorted by decreasing r value
    """
    # sort descending directly: reversing an ascending sort would put NaN r values first
    best_model_results = best_model_results.sort_values('r_value', ascending=False)

    if path:
        # exist_ok avoids the race between an existence check and the creation
        os.makedirs(path, exist_ok=True)
    # os.path.join works whether or not path ends with a separator
    best_model_results.to_pickle(os.path.join(path, 'best_models.pickle'))

    # plot results
    chart = create_plotly(best_model_results, my_model, target, dataframe_type, top)

    if display:
        chart.show()

    chart.write_html(os.path.join(path, "best_models.html"))

    return (best_model_results, chart)

# In [12]:
def create_plotly(best_model_results, my_model, target, dataframe_type, top):
    """
    This function builds a plotly table of the best model results.

    The table has one row per feature combination (first `top` rows of best_model_results, in the
    order given) with the joined feature names, the number of features, and the r and p values
    formatted to four decimals.

    Inputs:
    best_model_results (pandas df): results from best model, with 'features', 'r_value' and 'p_value' columns
    my_model (string): name of model used
    target (string): CRT target feature (numeric, conceptual, or both)
    dataframe_type (string): dropped or imputed
    top (integer): plot top number of feature combinations

    Outputs:
    chart (plotly): chart displaying best model results
    """
    def first_rows(column, render):
        # render every entry of the column, then keep only the first `top`
        return [render(entry) for entry in best_model_results[column]][:top]

    four_decimals = "{:.4f}".format

    table_header = dict(
        values=['Features', 'Number of Features', 'R Value', 'P Value'],
        fill_color='skyblue',
        align='left',
    )
    table_cells = dict(
        values=[
            first_rows('features', ', '.join),
            first_rows('features', len),
            first_rows('r_value', four_decimals),
            first_rows('p_value', four_decimals),
        ],
        fill=dict(color=['snow', 'lightgray', 'lightgray']),
        align='left',
        height=50,
        font_size=10,
    )

    # create chart
    table = go.Table(
        columnorder=[1, 2, 3, 4],
        columnwidth=[400, 80, 80, 80],
        header=table_header,
        cells=table_cells,
    )
    chart = go.Figure(data=[table])

    title_text = "Best Models (Model: {}; Target: {}; Data: {})".format(my_model, target, dataframe_type)
    chart.update_layout(
        title={
            'text': title_text,
            'y': .89,
            'x': 0.5,
            'font': dict(size=22),
            'xanchor': 'center',
            'yanchor': 'top',
        },
        font=dict(family="Courier New, monospace", color="black"),
    )

    return chart

# Best Model (Degree 2)

# In [13]:
def get_best_model_polynomial(data, best_model_results, target, path, model,
                               params, iters, random_state=None, data_test=None):
    """
    This function applies PolynomialFeatures with degree 2 to the best model.

    The predictors of the first row of best_model_results (the best model when the dataframe is
    sorted by decreasing r value) are re-run through model_pipeline with a grid search whose
    polynomial degree is fixed to 2. The results are pickled to 'best_models_d2.pickle' in path.

    Inputs:
    data (pandas df): training data when data_test is given; otherwise the master dataframe, which is
                      split into train/test with split_data
    best_model_results (pandas df): dataframe with best model results (needs a 'predictors' column)
    target (string): CRT target feature (numeric, conceptual, or both)
    path (string): path to best model results folder (must already exist)
    model (string): name of model of choice
    params (dictionary): parameters for pipeline; not modified
    iters (integer): forwarded to model_pipeline
    random_state (integer): forwarded to model_pipeline
    data_test (pandas df): optional held-out test data; when None, data is split with split_data

    Outputs:
    r_avg (float): r value from Pearson correlation on the test data
    p_avg (float): p value from Pearson correlation on the test data
    """
    # features are the predictors from the best model (first row, whatever its index label)
    feats = best_model_results.iloc[0]['predictors']

    # fix the degree to 2 on a copy so the caller's params dictionary is left untouched
    params = dict(params, poly__degree=[2])

    if data_test is None:
        # model_pipeline needs separate train and test frames that include the target column
        X_train, X_test, Y_train, Y_test = split_data(data, feats, target)
        data_train = pd.concat([X_train, Y_train], axis=1)
        data_test = pd.concat([X_test, Y_test], axis=1)
    else:
        data_train = data

    r_avg, p_avg, score_avg, best_params = model_pipeline(data_train, data_test, feats, target, model,
                                                          params, iters, randomsearch=False,
                                                          random_state=random_state)

    # os.path.join works whether or not path ends with a separator
    with open(os.path.join(path, 'best_models_d2.pickle'), 'wb') as f:
        pickle.dump([r_avg, p_avg, score_avg, best_params], f)

    return (r_avg, p_avg)