# This script performs hyperparameter tuning for feature extraction (TF-IDF) and dimensionality reduction (TruncatedSVD) methods for text features. The text features are domains, mentions, hashtags, followees, and text (Tweets and Retweets).

In [2]:
# general
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pyarrow.parquet as pq
import pyarrow as pa
import os

# extraction methods
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.linear_model import Ridge

# reduction methods 
from sklearn.decomposition import TruncatedSVD

# scaler
from sklearn.preprocessing import StandardScaler

# pipeline components
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Model Tuning

In [28]:
"""
This function runs the Ridge model with cross-validation on selected dataframe and selected features.

Inputs:
data (pandas dataframe): target dataframe 
feat (string): single target feature
target (string): CRT target feature (numeric, conceptual, or both)
iters (integer): number of iterations to perform Ridge + cross validation
my_min (int, float): TF-IDF min_df parameter, default set to 1
my_components (integer): TruncatedSVD n_components parameter, default set to 200 

Outputs:
r_avg (float): r value from Pearson correlation, averaged across all iterations
p_avg (float): p value from Pearson correlation, averaged across all iterations
best_params (dictionary): best Ridge model parameters 
"""

def model_tuning(data, feat, target, iters=1, my_components=200, my_min=1):
    """
    Score one text feature with TF-IDF + TruncatedSVD + Ridge on random held-out splits.

    The feature column is vectorized with TF-IDF and reduced with TruncatedSVD
    once. Then, `iters` times, the rows are split 80/20 (stratified on the binned
    target), Ridge's alpha is chosen by a 5-fold grid search on the training part,
    and the Pearson correlation between the clipped predictions and the held-out
    targets is recorded.

    Inputs:
    data (pandas dataframe): target dataframe; data[feat] must not contain NaN
    feat (string): single target feature (column of text documents)
    target (string): CRT target feature (numeric, conceptual, or both)
    iters (integer): number of split + grid-search repetitions, must be >= 1
    my_components (integer): TruncatedSVD n_components parameter
    my_min (int, float): TF-IDF min_df parameter

    Outputs:
    r_avg (float): Pearson r, averaged across all iterations
    p_avg (float): Pearson p value, averaged across all iterations
    best_params (dictionary): best grid-search parameters of the LAST iteration
                              only (key 'model__alpha')

    Raises:
    ValueError: if iters is smaller than 1
    """
    # Without this guard iters=0 would end in a ZeroDivisionError (and an unbound
    # `grid`) only after the expensive vectorization had already run.
    if iters < 1:
        raise ValueError("iters must be a positive integer, got {!r}".format(iters))

    vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=my_min,
                                 use_idf=True, binary=False)
    svd = TruncatedSVD(n_components=my_components, random_state=17)

    # single-step pipeline; it is what gives the grid parameter its 'model__' prefix
    pipe = Pipeline([
        ('model', Ridge()),
    ])
    param_grid = {'model__alpha': np.logspace(-5, 5, 100)}

    # set X and Y where X is transformed feature and Y is CRT score
    # NOTE(review): TF-IDF and SVD are fitted on ALL rows before the train/test
    # split, so the held-out rows influence the feature space. Left unchanged to
    # keep results comparable with earlier runs -- confirm this is intended.
    X = svd.fit_transform(vectorizer.fit_transform(data[feat]))
    Y = data[target]

    # stratify CRT scores: one bin edge per target value that occurs more than
    # once, minus one, spread evenly over [0, 1]
    # (assumes the target lies in [0, 1] -- TODO confirm)
    bin_count = int((Y.value_counts() > 1).sum()) - 1
    bins = np.linspace(0, 1, bin_count)
    y_binned = np.digitize(Y, bins)

    total_r = 0.0
    total_p = 0.0

    # run Ridge regression with cross-validation for iters and average results
    for _ in range(iters):
        # no random_state: every iteration draws a different split on purpose
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, stratify=y_binned)

        grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=1,
                            verbose=0, scoring='neg_mean_squared_error')
        grid.fit(X_train, Y_train)

        # clamp predictions into the [0, 1] range used for the bins above
        y_pred = np.clip(grid.predict(X_test), 0, 1)

        r, p = pearsonr(Y_test, y_pred)
        total_r += r
        total_p += p

    r_avg = total_r / iters
    p_avg = total_p / iters
    best_params = grid.best_params_

    return (r_avg, p_avg, best_params)

# Tune Number of Components (TruncatedSVD)

In [27]:
"""
This function tunes n_components parameter in TruncatedSVD.

Inputs:
df (pandas df): master dataframe
feature (string): single target feature name
target (string): CRT target feature (numeric, conceptual, or both)
start (integer): n_components start value 
end (integer): n_components end value 
interval (integer): skip value 
iterations (integer): number of iterations to perform Ridge + cross validation
results_folder (string): name of folder to save results 

Outputs:
history (dictionary): history of tuning, containing history of {n_components, r value, 
                      p value, and alpha value from Ridge regression}
"""
def tune_feature(df, feature, target, start, end, interval, iterations, results_folder):
    """
    Tune the TruncatedSVD n_components parameter for one feature.

    Calls model_tuning once for every n in range(start, end, interval), records
    the outcome, and pickles the collected history to
    <results_folder>/<feature>.pickle (the folder is created when missing).

    Inputs:
    df (pandas df): master dataframe
    feature (string): single target feature name
    target (string): CRT target feature (numeric, conceptual, or both)
    start (integer): first n_components value to try
    end (integer): n_components stop value (exclusive, as in range)
    interval (integer): step between consecutive n_components values
    iterations (integer): number of iterations to perform Ridge + cross validation
    results_folder (string): name of folder to save results

    Outputs:
    history (dictionary): lists under the keys 'n_components', 'r_value',
                          'p_value' and 'alpha', one entry per value tried
    """
    # initialize history
    history = {
        'n_components': [],
        'r_value': [],
        'p_value': [],
        'alpha': []
    }

    # rows without the feature are the same for every n, so filter only once
    df_dropped = df[df[feature].notnull()]

    # number of runs that make up a quarter of the sweep
    quarter = float(end - start) / interval * 0.25
    progress_benchmark = quarter

    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Starting {}...".format(feature))

    # run model tuning for every n_components within start-end range with interval value
    for count, n in enumerate(range(start, end, interval)):

        # print progress each time another quarter of the runs is done
        if count >= progress_benchmark:
            print("{:0.2f}% complete".format(100*(n - start) / float(end - start)))
            progress_benchmark += quarter

        r, p, params = model_tuning(df_dropped, feature, target,
                                    iters=iterations, my_components=n)

        # save results to history
        history['n_components'].append(n)
        history['r_value'].append(r)
        history['p_value'].append(p)
        history['alpha'].append(params['model__alpha'])

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(results_folder, exist_ok=True)

    # os.path.join yields the same file whether or not results_folder ends with
    # a path separator
    results_path = os.path.join(results_folder, "{}.pickle".format(feature))

    # save history
    with open(results_path, 'wb') as handle:
        pickle.dump(history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return history

In [42]:
"""
This function annotates the maximum y value on a graph. 

Inputs:
x (list): all x values
y (list): all y values
xlabel (string): n_components or min_df 
ax (matplotlib axes, optional): axes to annotate, defaults to the current axes

Outputs: 
None

Source: https://stackoverflow.com/questions/43374920/how-to-automatically-annotate-maximum-value-in-pyplot/43375405
"""

def annot_max(x, y, xlabel, ax=None):
    """
    Annotate the point with the largest y value on a plot.

    NaN entries in y are ignored when looking for the maximum. When y is empty
    or holds only NaN there is nothing to point at and no annotation is drawn.

    Inputs:
    x (list): all x values
    y (list): all y values, same length as x
    xlabel (string): name shown in front of the x value (n_components or min_df)
    ax (axes, optional): object whose annotate() is called; the current
                         pyplot axes are used when omitted

    Outputs:
    None
    """
    scores = np.asarray(y, dtype=float)
    if scores.size == 0 or np.isnan(scores).all():
        return

    # Take x and y from the same index. np.argmax and the built-in max treat NaN
    # differently, so looking them up separately can pair the wrong x with the
    # reported r value.
    best = int(np.nanargmax(scores))
    xmax = x[best]
    ymax = float(scores[best])

    text = "{}={:.0f}, r={:.3f}".format(xlabel, xmax, ymax)
    if ax is None:
        ax = plt.gca()

    bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
    arrowprops = dict(arrowstyle="->", connectionstyle="angle,angleA=0,angleB=60")
    # the arrow tip is given in data coordinates, the text box as a fraction of
    # the axes (near the top right corner)
    kw = dict(xycoords='data', textcoords="axes fraction",
              arrowprops=arrowprops, bbox=bbox_props, ha="right", va="top")
    ax.annotate(text, xy=(xmax, ymax), xytext=(0.94, 0.96), **kw)

In [4]:
"""
This function plots the tuning history and saves plot.

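Inputs:
history (dictionary): tuning history
x (string): name of target parameter, either min_df or n_components
feature (string): name of feature
results_folder (string): name of folder to save the plot in

Outputs: 
plt (module): matplotlib.pyplot, returned after the plot has been saved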
"""
def get_r_plot(history, x, feature, results_folder):
    """
    Plot r value against the tuned parameter, mark the maximum and save the plot.

    The image is written to <results_folder>/<feature>.png; the folder is
    created when it does not exist yet.

    Inputs:
    history (dictionary): tuning history with the keys x and 'r_value'
    x (string): name of target parameter, either min_df or n_components
    feature (string): name of feature, used in the title and the file name
    results_folder (string): name of folder to save the plot in

    Outputs:
    plt (module): matplotlib.pyplot, so the caller can show or extend the plot
    """
    plt.plot(history[x], history['r_value'])
    plt.title("r value by {} ({})".format(x, feature))
    plt.xlabel(x, fontsize=12)
    plt.ylabel('r value', fontsize=12)

    annot_max(history[x], history['r_value'], x)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(results_folder, exist_ok=True)

    # Join instead of concatenating: without a trailing separator the old
    # results_folder + "name.png" wrote the image next to the folder that was
    # just created, not inside it.
    plt.savefig(os.path.join(results_folder, "{}.png".format(feature)))

    return plt

In [5]:
"""
This function gets the top n_components or min_df by r value.

Inputs:
history (dictionary): dictionary of tuning history
x (string): name of target parameter, either min_df or n_components
k (integer): number of top results to return

Outputs: 
top_values (list): top k values 
"""
def get_top_components(history, x, k=10):
    """
    Return the values of parameter x that achieved the k highest r values.

    Entries whose r value is NaN are ranked below every real value, so they are
    only returned when fewer than k valid results exist. Among equal r values
    the entry recorded later comes first.

    Inputs:
    history (dictionary): tuning history with the keys x and 'r_value'
    x (string): name of target parameter, either min_df or n_components
    k (integer): number of top results to return

    Outputs:
    top_values (list): up to k values of history[x], best r value first
    """
    scores = history['r_value']

    def rank_key(index):
        # NaN does not compare consistently and would leave the sort order
        # undefined, so map it to the lowest possible score
        score = scores[index]
        return float('-inf') if np.isnan(score) else score

    # stable ascending sort, then reversed: highest r first
    ascending = sorted(range(len(scores)), key=rank_key)
    best_indices = ascending[::-1][:k]

    top_values = [history[x][i] for i in best_indices]

    return top_values

In [6]:
"""
This function gets the best n_component or min_df for a given feature.

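Inputs:
feature (string): single feature name 
results_folder (string): name of folder holding the pickled tuning history
parameter (string): name of the tuned parameter, either min_df or n_components

Outputs: 
best_component (list): the 10 parameter values with the highest r values, best first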
"""
def get_optimal_mappings(feature, results_folder, parameter, k=10):
    """
    Load the saved tuning history of a feature and return its best parameter values.

    Reads <results_folder>/<feature>.pickle, the file written by the tuning
    functions, and ranks its entries with get_top_components.

    Inputs:
    feature (string): single feature name
    results_folder (string): name of folder holding the pickled history
    parameter (string): name of the tuned parameter, either min_df or n_components
    k (integer): number of top values to return, default 10

    Outputs:
    best_component (list): up to k values of the parameter, highest r value first
    """
    # Join instead of concatenating so this resolves to the file the tuning
    # functions wrote, with or without a trailing separator on results_folder.
    results_path = os.path.join(results_folder, "{}.pickle".format(feature))

    # NOTE: pickle.load can execute arbitrary code; only point this at history
    # files produced by this script.
    with open(results_path, 'rb') as pickle_file:
        history = pickle.load(pickle_file)

    best_component = get_top_components(history, parameter, k=k)

    return best_component

# Tune Min DF (TFIDF)

In [22]:
"""
This function gets the maximum min_df given best n_components for TruncatedSVD

Inputs:
data (pandas df): master dataframe
feat (string): single target feature
my_components (integer): best n_components for TruncatedSVD given tuning
low (integer): lowest value for min_df
high (integer): highest value for min_df

Outputs:
my_min (integer): maximum min_df for TF-IDF to try in order to keep appropriate dimensions
"""

def get_max_min(data, feat, my_components, low=1, high=1000):
    """
    Estimate the largest TF-IDF min_df that still leaves more than my_components terms.

    The interval [low, high] is halved repeatedly: TF-IDF is fitted with the
    midpoint as min_df, and the search continues above the midpoint when the
    vocabulary is still larger than my_components, below it otherwise (or when
    the vectorizer rejects the midpoint). The search stops once the interval is
    at most one wide and returns its lower end.

    NOTE(review): the returned value itself is never vectorized, so it should be
    read as an approximate upper bound for the min_df sweep, not a verified
    optimum.

    Inputs:
    data (pandas df): master dataframe
    feat (string): single target feature
    my_components (integer): best n_components for TruncatedSVD given tuning
    low (integer): lowest value for min_df
    high (integer): highest value for min_df

    Outputs:
    my_min (integer): maximum min_df for TF-IDF to try in order to keep appropriate dimensions
    """
    # rows without the feature are the same for every candidate, filter once
    data_dropped = data[data[feat].notnull()]

    while high - low > 1:
        my_min = (high + low) // 2

        vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=my_min,
                                     use_idf=True, binary=False)

        try:
            vectorizer.fit_transform(data_dropped[feat])
        except ValueError:
            # scikit-learn raises ValueError when min_df leaves no terms (or
            # exceeds max_df); the candidate is too strict, so search lower.
            # Any other exception is a real error and is allowed to propagate.
            high = my_min - 1
            continue

        n_features = len(vectorizer.get_feature_names_out())

        if my_components < n_features:
            # still enough terms for the SVD: try a stricter min_df
            low = my_min + 1
        else:
            high = my_min - 1

    return low

In [8]:
"""
This function tunes min_df in TF-IDF.

Inputs:
df (pandas df): master dataframe
feature (string): single target feature name
target (string): CRT target feature (numeric, conceptual, or both)
svd (integer): n_components value for TruncatedSVD
start (integer): min_df start value 
end (integer): min_df end value (exclusive)
increments (integer): number of min_df to test
iterations (integer): number of iterations to perform Ridge + cross validation
results_folder (string): name of folder to save results 

Outputs:
history (dictionary): tuning history, containing {min_df and r value}
"""

def tune_min_df(df, feature, target, svd, start, end, increments, iterations, results_folder):
    """
    Tune the TF-IDF min_df parameter for one feature.

    Calls model_tuning for roughly `increments` evenly spaced min_df values in
    range(start, end), keeping TruncatedSVD n_components fixed at `svd`, and
    pickles the collected history to <results_folder>/<feature>.pickle (the
    folder is created when missing).

    Inputs:
    df (pandas df): master dataframe
    feature (string): single target feature name
    target (string): CRT target feature (numeric, conceptual, or both)
    svd (integer): n_components value for TruncatedSVD
    start (integer): first min_df value to try
    end (integer): min_df stop value (exclusive, as in range)
    increments (integer): number of min_df values to test
    iterations (integer): number of iterations to perform Ridge + cross validation
    results_folder (string): name of folder to save results

    Outputs:
    history (dictionary): lists under the keys 'r_value' and 'min_df', one
                          entry per value tried

    Raises:
    ValueError: if increments is smaller than 1 while end - start is positive
    """
    history = {
        'r_value': [],
        'min_df': []
    }

    span = end - start
    if increments >= span:
        # more steps requested than values available: try every value
        interval = 1
    elif increments < 1:
        # would otherwise divide by zero or silently produce an empty sweep
        raise ValueError(
            "increments must be a positive integer, got {!r}".format(increments))
    else:
        interval = span // increments

    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Starting {}...".format(feature))

    # rows without the feature are the same for every min_df, filter only once
    df_dropped = df[df[feature].notnull()]

    # number of runs that make up a quarter of the sweep
    quarter = float(span) / interval * 0.25
    progress_benchmark = quarter

    # for min_df in range start to end
    for count, val in enumerate(range(start, end, interval)):

        # print progress each time another quarter of the runs is done
        if count >= progress_benchmark:
            print("{:0.2f}% complete".format(100*(val - start) / float(end - start)))
            progress_benchmark += quarter

        # p value and Ridge parameters are not recorded for the min_df sweep
        r, _p, _params = model_tuning(df_dropped, feature, target, iters=iterations,
                                      my_components=svd, my_min=val)

        # save history
        history['r_value'].append(r)
        history['min_df'].append(val)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(results_folder, exist_ok=True)

    # os.path.join yields the same file whether or not results_folder ends with
    # a path separator
    results_path = os.path.join(results_folder, "{}.pickle".format(feature))

    # save history
    with open(results_path, 'wb') as handle:
        pickle.dump(history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return history