# This file contains all the functions relevant for the model.

# In [3]:
# general functions
import numpy as np
import pandas as pd
import random
import statistics
import copy
import math
import re
import os
import pyarrow.parquet as pq
import pyarrow as pa
import warnings
warnings.filterwarnings("ignore")

# model functions
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from scipy.stats import pearsonr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler

# plot functions
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Split Data 

# In [1]:
"""
This function generates a list of splits for the pipeline.

Inputs:
my_splits (list or integer): list of splits or number of random splits 

Outputs:
splits (list): list of integers representing splits for pipeline
"""
def get_splits(my_splits):
    if type(my_splits) == int:
        splits = []
        for i in range(my_splits):
            n = random.randint(0, 999999)
            splits.append(n)
    else:
        splits = my_splits
    
    return splits

# In [239]:
"""
This function splits the data into a train and test set. 

Inputs:
X (array): features  
Y (array): target variable 
target (string): name of target variable 
my_split (float): size of test set out of 1.0
my_state (integer): random state for split

Ouputs:
X_train (array): train set of features
X_test (array): test set of features
Y_train (array): train set of target 
Y_test (array): test set of target
"""

def split_data(X, Y, target, my_split, my_state):
    
    # stratify CRT scores 
    bin_count = 0
    for i in Y.value_counts() > 1:
        if i:
            bin_count += 1      
    bin_count -= 1

    bins = np.linspace(0, 1, bin_count)
    y_binned = np.digitize(Y, bins)
    
    X_train, X_test, Y_train, Y_test  = train_test_split(X,Y, test_size=my_split, stratify=y_binned, 
                                                         random_state=my_state)
    
    return X_train, X_test, Y_train, Y_test

# Transform Features

# In [1161]:
"""
This function transforms features by performing feature extraction and
dimensionality reduction on text features and log transformation (if applicable) and standardization on
quantitative features.

Inputs:
df (pandas dataframe): dataframe containing features 
feature (string): name of feature to be transformed
target (string): name of target variable (CRT score)
my_min_df (integer): TF-IDF min_df parameter 
my_n_components (integer): TruncatedSVD n_components parameter

Outputs:
X (array): transformed feature 
Y (array): target variable 
na_count (float): number of NaN values for feature  
"""
def transform_feature(df, feature, target, my_max_df=None, my_min_df=None, my_n_components=None):
    
    na_count = df[feature].isna().sum()
    df = df.dropna(subset=[feature])
    
    scale = StandardScaler()
    
    if feat in ['text', 'bio', 'follower_bios', 'followee_bios']:
        my_binary = False
    else:
        my_binary = True
    if feat == 'bio':
        my_min_df = 5

    if df[feature].dtype in [float, int, np.float64, np.int64]:

        skew = df[feature].skew(axis=0) # check if high or medium skew 

        if -0.5 > skew or 0.5 < skew:
            constant = abs(df[feature].min()) + 0.001
            X_ = df[feature].apply(lambda x: np.log(x + constant)).to_numpy()

        else:
            X_ = df[feature].to_numpy()

        X = scale.fit_transform(X_.reshape(-1,1))

    else:
        if feature in ['text', 'followee_bios', 'follower_bios']:
            my_token_pattern=r'(?ui)\b\w[a-z]\w[a-z]+\b'
        else:
            my_token_pattern=r'(?u)\b\w\w+\b'
            
        tfidf = TfidfVectorizer(ngram_range=(1,1),max_df= my_max_df,min_df = my_min_df,binary=my_binary, token_pattern=my_token_pattern)
        svd = TruncatedSVD(n_components=my_n_components, random_state=17)
        X = scale.fit_transform(svd.fit_transform(tfidf.fit_transform(df[feature])))

    Y = df[target]

    return X, Y, na_count

# Feature Selection

# In [248]:
"""
This function performs feature selection using ElasticNetCV.

Inputs:
X_train (array): train set of features
X_test (array): test set of features
Y_train (array): train set of target 
Y_test (array): test set of target

Outputs:
X_train (array): train set of features after feature selection
X_test (array): test set of features after feature selection
Y_train (array): train set of target after feature selection
Y_test (array): test set of target after feature selection
"""

def feature_selection(X_train, X_test, Y_train, Y_test, thresh):
    sfm_selector = SelectFromModel(estimator=ElasticNetCV(), threshold=thresh)
    sfm_selector.fit(X_train, Y_train)

    X_train_df = pd.DataFrame(X_train)
    X_test_df = pd.DataFrame(X_test)

    X_train = X_train_df[X_train_df.columns[sfm_selector.get_support()]].to_numpy()
    X_test = X_test_df[X_train_df.columns[sfm_selector.get_support()]].to_numpy()

    return (X_train, X_test, Y_train, Y_test)

# Prediction

# In [1156]:
"""
This function fits a model to training data with cross-validation and predicts CRT score using the model. 

Inputs: 
X_train (array): train set of features
X_test (array): test set of features
Y_train (array): train set of target 
Y_test (array): test set of target
my_model (string): name of model 
my_params (dictionary): dictionary of pipeline parameters 

Outputs:
results (dictionary): dictionary of results, containing best r value, p value, and R^2 score from GridSearch
"""
def predict(X_train, X_test, Y_train, Y_test, my_model, my_params, my_cross_val):
    
    models = {'ridge': Ridge(random_state= 17), 
                'lasso': Lasso(random_state= 17), 
                'rfr': RandomForestRegressor(random_state= 17)}
    
    pipe = Pipeline([
        ('poly', PolynomialFeatures(degree=1)),
        ('model', models[my_model]),
    ])

    grid = GridSearchCV(pipe, param_grid=my_params, cv=my_cross_val, n_jobs=-1, verbose=0, scoring='r2')
    grid.fit(X_train, Y_train)

    y_pred = grid.predict(X_test)
    y_pred=np.maximum(0,np.minimum(y_pred, 1))
    
    r, p = pearsonr(Y_test, y_pred) 
    
    results = {"r": abs(r), "p": p, "r2": grid.best_score_}

    return results


# Create and Save Results

# In [4]:
"""
This function generates statistics from the results.

Inputs: 
feat (string): feature name
na_count (integer): number of dropped users
total_r (list): list of r values from prediction
total_p (list): list of p values from prediction
total_score (list): list of R^2 scores from prediction

Outputs:
f_results (dictionary): dictionary of statistics of results (mean, median, range)
"""
def create_results(feat, na_count, total_r, total_p, total_score):

    f_results = {}
    f_results['r_mean'] = statistics.mean(total_r)
    f_results['p_mean'] = statistics.mean(total_p)
    f_results['feature'] = feat
    f_results['na_count'] = na_count
    f_results['r2_mean'] = statistics.mean(total_score)
    f_results['r2_median'] = statistics.median(total_score)
    f_results['r_median'] = statistics.median(total_r)
    f_results['p_median'] = statistics.median(total_p)
    f_results['r2_range'] = [min(total_score), max(total_score)]
    f_results['r_range'] = [min(total_r), max(total_r)]
    f_results['p_range'] = [min(total_p), max(total_p)]
    f_results['r_history'] = total_r
    f_results['p_history'] = total_p
    f_results['score_history'] = total_score
    
    return f_results

# In [None]:
"""
This function saves the results to the appropriate folder.

Inputs:
results_folder (string): path to results folder
results_name (string): name of results 
df (pandas dataframe): dataframe of results
plot (plotly): plot of results
chart (plotly): chart of results
states (dictionary): dictionary of parameters

Outputs:
None
"""

def save_results(results_folder, results_name, df, plot, chart, states):
    results_name = results_name.replace(" ", "_")
    # save results and plots 
    isExist = os.path.exists(results_folder)
    if not isExist:
        os.makedirs(results_folder)  
    df.to_pickle(results_folder + '{}_df.pickle'.format(results_name))
    plot.write_html(results_folder + "{}_plot.html".format(results_name))
    chart.write_html(results_folder + "{}_chart.html".format(results_name))

    # save states
    with open(results_folder + '{}.txt'.format(results_name), 'w') as f:
         print(states, file=f)
    
    return None

# Combine Features

# In [1157]:
"""
This function combines individual features to create the best model.

Inputs:
feature_dict (dictionary): dictionary mapping feature name to list of train and test data after feature selection
individual_df (pandas dataframe): results from individual features run, ranking features from most predictive to least predictive 
target (string): name of target variable (CRT score)
my_model (string): name of model 
my_params (dictionary): dictionary of pipeline parameters 

Outputs:
combined_results_df (pandas dataframe): dataframe displaying feature combinations and respective r values, p values and R^2 scores
"""

def combine_features(feature_dict, individual_df, target, my_model, my_params):
    individual_df = individual_df.sort_values('r', ascending=False, ignore_index=True)
    combined_results_df = pd.DataFrame({})
    features = []
    
    X_train = np.array([])
    X_test = np.array([])
    Y_train = np.array([])
    Y_test = np.array([])
    
    for i, row in individual_df.iterrows():
        
        feature = row['feature']
        features.append(feature)
                
        if i != 0:
            X_train_i, X_test_i, Y_train_i, Y_test_i = feature_dict[feature]
        
            # concat new feature with old features 
            X_train = np.hstack([X_train, X_train_i])
            X_test = np.hstack([X_test, X_test_i])
            
        else:
            X_train, X_test, Y_train, Y_test = feature_dict[feature]
                
        results = predict(X_train, X_test, Y_train, Y_test, my_model, my_params, 10)
        
        f_results = {
            'features': copy.deepcopy(features),
            'r': results['r'],
            'p': results['p'],
            'r2': results['r2']
        }
        
        combined_results_df = combined_results_df.append(f_results, ignore_index = True)
    
    combined_results_df = combined_results_df.sort_values('r', ascending=False, ignore_index=True)
    
    return combined_results_df

# Informative Features

# In [431]:
"""
This function performs feature extraction using TF-IDF, runs Ridge with cross-validation, and prints most informative features.

Inputs:
data (pandas dataframe): dataframe containing features 
feat (string): name of feature
target (string): name of target variable (CRT score)
results_folder (string): path to informative features results folder
n (integer): number of top features to show
maxDF (float): TF-IDF max_df parameter
minDF (float): TF-IDF min_df parameter
n_gram ((integer, integer)): TF-IDF n_gram range parameter 
my_state (integer): random state for train/test split

Outputs
r (float): Pearson r correlation coefficient after Ridge and cross-validation
p (float): p value after Ridge and cross-validation
score (float): best R^2 score from GridSearchCV
chart (plotly): chart displaying informative features and their coefficients 
"""
def get_informative_features(data, feat, target, results_folder, n=10, maxDF=1.0, minDF=10, n_gram=(1,1), my_state=17):
    
    # create coefficient dictionary 
    coefs_dict = {
        'high_coefs': [],
        'high_names': [],
        'low_coefs': [],
        'low_names': []
    }
    
    if feat == 'text':
        vectorizer = TfidfVectorizer(ngram_range=n_gram, max_df= maxDF, min_df=minDF, use_idf=True,binary=False, analyzer='word', token_pattern=r'(?ui)\b\w[a-z]\w[a-z]\w[a-z]+\b')
    elif feat == "followees": 
        vectorizer = TfidfVectorizer(ngram_range=n_gram,max_df= maxDF,min_df = minDF,use_idf=True,binary=True)
    elif feat == 'domains': 
        vectorizer = TfidfVectorizer(ngram_range=n_gram,max_df= maxDF,min_df = minDF,use_idf=True,binary=True)
    elif feat == 'hashtags': 
        vectorizer = TfidfVectorizer(ngram_range=n_gram,max_df= maxDF,min_df = minDF,use_idf=True,binary=True)
    elif feat == 'mentions': 
        vectorizer = TfidfVectorizer(ngram_range=n_gram,max_df=maxDF,min_df = minDF,use_idf=True,binary=True)
    elif feat in ['bio', 'follower_bios', 'followee_bios']:
        vectorizer = TfidfVectorizer(ngram_range=n_gram,max_df=maxDF,min_df = minDF,use_idf=True,binary=False, analyzer='word', token_pattern=r'(?ui)\b\w[a-z]\w[a-z]+\b')
    
    params = {'alpha': np.logspace(-5, 5, 100)}
    
    data = data[data[feat].notnull()]
    X_text = vectorizer.fit_transform(data[feat]) 
    Y = data[target]
    
    bins = np.linspace(0, 1, 7)
    y_binned = np.digitize(Y, bins)
    
    # run Ridge regression + cross-validation
    X_train, X_test, Y_train, Y_test  = train_test_split(X_text, Y, test_size=0.1, stratify=y_binned, random_state=my_state)

    grid = GridSearchCV(Ridge(random_state=17), param_grid=params, cv=5, n_jobs=-1, verbose=0, scoring='r2')
    grid.fit(X_train, Y_train)
    y_pred = grid.predict(X_test)
    y_pred=np.maximum(0, np.minimum(y_pred, 1))

    coefs_with_fns = sorted(zip(grid.best_estimator_.coef_, vectorizer.get_feature_names_out()))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        coefs_dict['low_coefs'].append(coef_1)
        coefs_dict['low_names'].append(fn_1)
        coefs_dict['high_coefs'].append(coef_2)
        coefs_dict['high_names'].append(fn_2)

    r, p = (pearsonr(Y_test, y_pred))
    score = grid.best_score_

    chart = create_plotly_informative(coefs_dict, feat, target, r, p, minDF, maxDF, n_gram)
        
    isExist = os.path.exists(results_folder)
    if not isExist:
        os.makedirs(results_folder)
        
    chart.write_html(results_folder + "{}_{}.html".format(feat, n_gram))
    
    return (r, p, score, chart)


# Create Plotly Charts and Plots

# In [226]:
"""
This function generates a plotly plot and chart for individual feature results.

Inputs:
df (pandas dataframe): dataframe with individual feature results
target (string): name of target variable 
dataframe_type (string): dropped, imputed, or full 
my_model (string): name of model 

Outputs: 
fig (plotly): plot displaying individual feature results
chart (plotly): chart displaying individual feature results
"""
def create_plotly_individual(df, target, dataframe_type, my_model, title):
    
    textfeat = ['mentions', 'text', 'domains', 'bio', 'followees', 'follower_bios', 'followee_bios',
           'hashtags']
    
    # create plot
    df = df.dropna(subset=['r_median'])
    df_plot = df.sort_values(by=["r_median"], ascending=True)
    df_plot_reverse = df.sort_values(by=["r_median"], ascending=False)
    
    fig = go.Figure(go.Bar(
                y=df_plot['feature'],
                x=df_plot['r_median'],
                orientation='h',
                text=df_plot['r_median'].apply(lambda x: "{:.2f}".format(x)),
                textposition='auto',
                marker_color=['cornflowerblue' if col in textfeat else 'lightslategray' for col in df_plot['feature']]
    ))

    fig.update_layout(
        yaxis_title="Feature Name",
        xaxis_title="R Median",
        font=dict(
            family="Courier New, monospace",
            size=10,
            color="black"
        ),
        yaxis=dict(
        tickmode='linear'),

        width=1000, height=800,

        title={
            'text': "{} (Target: {}; Data: {}; Model: {})".format(title, target, dataframe_type, my_model),
            'y':.92,
            'x':0.5,
            'font': dict(
                size=22,
            ),
            'xanchor': 'center',
            'yanchor': 'top'}
    )

    fig.update_yaxes(title_font_size=15)
    fig.update_xaxes(title_font_size=15)

    fig.update_traces(textposition='outside', textfont_size=10)
    
    # create chart
    chart = go.Figure(data=[go.Table(
    header=dict(values=['Feature Name', 'R Median', 'P Median', 'R^2 Median', 'Dropped Count'],
                fill_color='cornflowerblue',
                align='left'),
    cells=dict(values=[df_plot_reverse.feature, 
                       df_plot_reverse.r_median.apply(lambda x: "{:.5f}".format(x)),
                      df_plot_reverse.p_median.apply(lambda x: "{:.5f}".format(x)), 
                       df_plot_reverse.r2_median.apply(lambda x: "{:.5f}".format(x)), 
                       df_plot_reverse.na_count],
               fill_color='lightgray',
               align='left'))
    ])
    
    chart.update_layout(
    title={
            'text': "{} (Target: {}; Data: {}; Model: {})".format(title, target, dataframe_type, my_model),
            'y':.89,
            'x':0.5,
            'font': dict(
                size=22,
            ),
            'xanchor': 'center',
            'yanchor': 'top'},
    font=dict(
            family="Courier New, monospace",
            color="black",
            size=12
        )
    )
    
    return (fig, chart)

# In [227]:
"""
This function generates a plotly plot and chart for combined feature results.

Inputs:
df (pandas dataframe): dataframe with combined feature results
target (string): name of target variable 
dataframe_type (string): dropped, imputed, or full 
my_model (string): name of model 

Outputs: 
chart (plotly): chart displaying combined feature results
"""
def create_plotly_combined(df, target, dataframe_type, my_model):
    
    textfeat = ['mentions', 'text', 'domains', 'bio', 'followees', 'follower_bios', 'followee_bios',
           'hashtags']
    
    df = df.dropna(subset=['r'])
    df_plot_reverse = df.sort_values(by=["r"], ascending=False)
    
    # create chart
    chart = go.Figure(data=[go.Table(
    header=dict(values=['Features', 'R Value', 'P Value', 'R^2'],
                fill_color='cornflowerblue',
                align='left'),
    cells=dict(values=[[', '.join(x) for x in df_plot_reverse.features],
                       df_plot_reverse.r.apply(lambda x: "{:.5f}".format(x)),
                      df_plot_reverse.p.apply(lambda x: "{:.5f}".format(x)), 
                      df_plot_reverse.r2.apply(lambda x: "{:.5f}".format(x))],
               fill_color='lightgray',
               align='left'))
    ])
    
    chart.update_layout(
    title={
            'text': "Combined Features (Target: {}; Data: {}; Model: {})".format(target, dataframe_type, my_model),
            'y':.89,
            'x':0.5,
            'font': dict(
                size=22,
            ),
            'xanchor': 'center',
            'yanchor': 'top'},
    font=dict(
            family="Courier New, monospace",
            color="black",
            size=12
        )
    )
    
    return chart

# In [487]:
"""
This function generates a plotly chart from informative features module.

Inputs:
coefs_dict (dictionary): contains four keys (high_coefs, high_names, low_coefs, low_names)
feat (string): name of feature
r (float): Pearson r correlation coefficient after Ridge and cross-validation
p (float): p value after Ridge and cross-validation
min_df (float): TF-IDF min_df parameter
max_df (float): TF-IDF max_df parameter
n_gram ((integer, integer)): TF-IDF n_gram range parameter 

Outputs: 
fig (plotly): chart displaying feature selection results
"""
def create_plotly_informative(coefs_dict, feat, target, r, p, min_df, max_df, n_gram):
    
    # create chart
    chart=[go.Table(
        header=dict(values=['Terms (Low CRT Score)', 'Coefficients',  
                         'Terms (High CRT Score)', 'Coefficients', ],
                fill_color='papayawhip',
                align='left'),
    cells=dict(values=[coefs_dict['low_names'], ["{:.4f}".format(x) for x in coefs_dict['low_coefs']], 
                       coefs_dict['high_names'], ["{:.4f}".format(x) for x in coefs_dict['high_coefs']]],
               fill=dict(color=['lightgray', 'snow', 'lightgray', 'snow']),
               align='left'))]
    
    layout = go.Layout(
    height=700,
    width=700,
        
    annotations=[
        go.layout.Annotation(
            showarrow=False,
            text='min_df: {}, max_df: {}, n_gram: {}, r: {:.3f}, p value: {:.3f}'.format(min_df, max_df, n_gram, r, p),
            xanchor='center',
            x=.5,
            yanchor='bottom',
            y=1.03
        )])
    
    fig = go.FigureWidget(data=chart, layout=layout)
    
    fig.update_layout(
    title={
            'text': "Informative Features for {}".format(feat),
            'y':.93,
            'x':0.5,
            'font': dict(
                size=17,
            ),
            'xanchor': 'center',
            'yanchor': 'top'},
    font=dict(
            family="Courier New, monospace",
            color="black",
            size=12
        )
    )

    return fig