In [9]:
# general packages
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np
import pandas as pd 
import os

# pipeline components
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV


In [108]:
def individual_features(data_train, data_test, target, selected_feats, drop=False):
    """
    This function fits a cross-validated Ridge regression on one group of feature
    columns and evaluates it on the test split.

    Inputs:
    data_train (pandas df): training data containing the selected feature columns and the target
    data_test (pandas df): test data containing the selected feature columns and the target
    target (string): name of the target column
    selected_feats (list of strings or string): feature column(s) used as predictors
    drop (boolean): if True, rows with a missing value in any selected feature are
                    removed from both splits before fitting

    Outputs:
    r (float): Pearson correlation between the clipped test predictions and the test target
    p (float): p-value of that correlation
    score (float): best mean cross-validated score (negative MSE) found by the grid search
    dropped_count_train (int): number of training rows with a missing value in any selected feature
    dropped_count_test (int): number of test rows with a missing value in any selected feature

    Raises:
    ValueError: if selected_feats is empty
    """
    # Accept a single column name as well as a collection of names, so the
    # frames below are always 2-D.
    if isinstance(selected_feats, str):
        feats = [selected_feats]
    else:
        feats = list(selected_feats)
    if not feats:
        raise ValueError("selected_feats must contain at least one column")

    # Count rows with a missing value in ANY selected column: this is exactly
    # the set of rows dropna(subset=feats) removes. A per-column maximum would
    # undercount when the NaNs of different columns sit on different rows.
    # The counts are reported even when drop=False (no rows are removed then).
    dropped_count_train = data_train[feats].isna().any(axis=1).sum()
    dropped_count_test = data_test[feats].isna().any(axis=1).sum()

    if drop:
        data_train = data_train.dropna(subset=feats)
        data_test = data_test.dropna(subset=feats)

    X_train, Y_train = data_train[feats], data_train[target]
    X_test, Y_test = data_test[feats], data_test[target]

    # 5-fold grid search over 100 log-spaced regularisation strengths (1e-5 .. 1e5).
    grid = GridSearchCV(Ridge(), param_grid={'alpha': np.logspace(-5, 5, 100)}, cv=5, n_jobs=1, verbose=0,
                        scoring='neg_mean_squared_error')
    grid.fit(X_train, Y_train)

    # Predictions are clamped to [0, 1] before scoring.
    # NOTE(review): presumably the target is a score in [0, 1] - confirm.
    y_pred = np.clip(grid.predict(X_test), 0, 1)

    r, p = pearsonr(y_pred, Y_test)
    score = grid.best_score_

    return (r, p, score, dropped_count_train, dropped_count_test)

In [109]:
def run_individual_features(data_train, data_test, target, feats_df, drop=False):
    """
    This function runs individual_features once per row of feats_df and collects
    the results in a single dataframe.

    Inputs:
    data_train (pandas df): training data, passed through to individual_features
    data_test (pandas df): test data, passed through to individual_features
    target (string): name of the target column, passed through to individual_features
    feats_df (pandas df): one row per feature group, with columns 'feature' (group name)
                          and 'target_feats' (columns belonging to the group)
    drop (boolean): passed through to individual_features

    Outputs:
    results (pandas df): one row per feature group with the absolute r value, p value,
                         cross-validation score and dropped-row counts
    """
    # Fixed schema so that an empty feats_df still yields a frame with the
    # expected columns (a column-less frame breaks downstream sorting on
    # 'ridge_r_value').
    result_columns = [
        'feature',
        'target_feats',
        'ridge_r_value',
        'ridge_p_value',
        'score',
        'dropped_count_train',
        'dropped_count_test',
        'dropped_count_total',
    ]

    results = []

    for _, row in feats_df.iterrows():
        r, p, score, dropped_count_train, dropped_count_test = individual_features(
            data_train, data_test, target, row['target_feats'], drop=drop)

        results.append({
            'feature': row['feature'],
            'target_feats': row['target_feats'],
            # only the strength of the correlation is kept, not its sign
            'ridge_r_value': abs(r),
            'ridge_p_value': p,
            'score': score,
            'dropped_count_train': dropped_count_train,
            'dropped_count_test': dropped_count_test,
            'dropped_count_total': dropped_count_train + dropped_count_test,
        })

    return pd.DataFrame(results, columns=result_columns)

In [106]:
"""
This function sorts the feature selection results by decreasing r value, saves them to disk, and plots them

Inputs:
selection_df (pandas df): dataframe with feature selection results
path (string): path to feature selection results folder
target (string): CRT target feature (numeric, conceptual, or both)
dataframe_type (string): dropped or imputed
display (boolean): if True, display plot and chart

Outputs:
selection_df (pandas df): dataframe with feature selection results, sorted by decreasing r value
ax (plot): plot of selection results, sorted by decreasing r value
chart (plotly): chart displaying feature selection results
"""
def plot(selection_df_, path, target, dataframe_type, display=True):
    """
    This function sorts feature selection results by decreasing r value, saves
    them to disk and renders them as a plotly plot and chart.

    Files written into the folder `path` (created if it does not exist):
    individual_features.pickle, individual_features_plot.html,
    individual_features_chart.html

    Inputs:
    selection_df_ (pandas df): dataframe with feature selection results (needs a 'ridge_r_value' column)
    path (string): path to feature selection results folder, with or without a trailing separator
    target (string): target label shown in the figure titles
    dataframe_type (string): data label shown in the figure titles
    display (boolean): if True, display plot and chart

    Outputs:
    selection_df (pandas df): feature selection results, sorted by decreasing r value
    ax (plotly): plot of selection results
    chart (plotly): chart displaying feature selection results
    """
    # Ascending sort, then reversed: highest r value first. Because the
    # ascending sort places NaN r values last, they end up at the top here.
    selection_df = selection_df_.sort_values('ridge_r_value')[::-1]

    # exist_ok avoids the check-then-create race; an empty path means the
    # current working directory, which needs no creation.
    if path:
        os.makedirs(path, exist_ok=True)

    # os.path.join keeps the outputs inside `path` even when the caller omits
    # the trailing separator.
    selection_df.to_pickle(os.path.join(path, 'individual_features.pickle'))

    # Rows containing any missing value are saved above but left out of the figures.
    selection_df_dropped = selection_df.dropna()

    # plot results
    ax, chart = create_plotly(selection_df_dropped, target, dataframe_type)

    if display:
        ax.show()
        chart.show()

    ax.write_html(os.path.join(path, "individual_features_plot.html"))
    chart.write_html(os.path.join(path, "individual_features_chart.html"))

    return (selection_df, ax, chart)

In [110]:
"""
This function generates a plotly plot and chart from feature selection.

Inputs:
df (pandas df): dataframe with feature selection results, sorted by decreasing r value
target (string): CRT target feature (numeric, conceptual, or both)
dataframe_type (string): dropped or imputed

Outputs: 
fig (plotly): plot displaying feature selection results
chart (plotly): chart displaying feature selection results
"""
def create_plotly(df, target, dataframe_type):
    """
    This function builds the two plotly figures for a set of feature selection
    results: a horizontal bar plot of the r values and a summary table.

    Inputs:
    df (pandas df): feature selection results; the columns feature, ridge_r_value,
                    ridge_p_value, dropped_count_train and dropped_count_test are read
    target (string): target label inserted into both titles
    dataframe_type (string): data label inserted into both titles

    Outputs:
    fig (plotly): bar plot of ridge_r_value per feature, bars ordered by increasing value
    chart (plotly): table of features, r values, p values and dropped counts, in the row order of df
    """
    # features in this group get the highlight colour in the bar plot
    text_features = ('mentions', 'text', 'domains', 'bio', 'followees', 'follower_bios', 'followee_bios',
                     'hashtags')
    font_family = "Courier New, monospace"
    heading = "Individual Features (Target: {}; Data: {})".format(target, dataframe_type)

    def centered_title(y_pos):
        # Both figures share the same title; only its vertical position differs.
        return {
            'text': heading,
            'y': y_pos,
            'x': 0.5,
            'font': dict(size=22),
            'xanchor': 'center',
            'yanchor': 'top',
        }

    # ---- bar plot ----
    ranked = df.sort_values(by=["ridge_r_value"], ascending=True)
    r_values = ranked['ridge_r_value']

    bar_colors = []
    for name in ranked['feature']:
        bar_colors.append('cornflowerblue' if name in text_features else 'lightslategray')

    bars = go.Bar(
        y=ranked['feature'],
        x=r_values,
        orientation='h',
        text=r_values.apply("{:.2f}".format),
        textposition='auto',
        marker_color=bar_colors,
    )
    fig = go.Figure(bars)

    fig.update_layout(
        yaxis_title="Feature Name",
        xaxis_title="Absolute R Value from Ridge",
        font=dict(family=font_family, size=10, color="black"),
        yaxis=dict(tickmode='linear'),
        width=1000,
        height=800,
        title=centered_title(.92),
    )

    for update_axis in (fig.update_yaxes, fig.update_xaxes):
        update_axis(title_font_size=15)

    # overrides the 'auto' text position set on the trace above
    fig.update_traces(textposition='outside', textfont_size=10)

    # ---- summary table ----
    header = dict(
        values=['Features', 'R Value', 'P Value', 'Dropped Count (Train)', 'Dropped Count (Test)'],
        fill_color='cornflowerblue',
        align='left',
    )
    cells = dict(
        values=[
            df['feature'],
            df['ridge_r_value'].apply("{:.5f}".format),
            df['ridge_p_value'].apply("{:.5f}".format),
            df['dropped_count_train'],
            df['dropped_count_test'],
        ],
        fill_color='lightgray',
        align='left',
    )
    chart = go.Figure(data=[go.Table(header=header, cells=cells)])

    chart.update_layout(
        title=centered_title(.89),
        font=dict(family=font_family, color="black", size=12),
    )

    return (fig, chart)