In [2]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree as tr
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor

In [3]:
def load_csvs_to_dfs(filenames):
    dataframes = []
    for name in filenames:
        file_path = f'../data/{name}.csv'
        try:
            df = pd.read_csv(file_path)
            dataframes.append(df)
            print(f'Loaded {file_path} into DataFrame: {name}')
        except FileNotFoundError:
            print(f'File {file_path} not found.')
    return dataframes

csvs = [
        'cesd_total', 
        'gad_total',
        'inq_perceivedburden',
        'inq_thwartedbelong',
        'upps_total']
dataframes = load_csvs_to_dfs(csvs)

Loaded ../data/cesd_total.csv into DataFrame: cesd_total
Loaded ../data/gad_total.csv into DataFrame: gad_total
Loaded ../data/inq_perceivedburden.csv into DataFrame: inq_perceivedburden
Loaded ../data/inq_thwartedbelong.csv into DataFrame: inq_thwartedbelong
Loaded ../data/upps_total.csv into DataFrame: upps_total


In [4]:
def plot_outcome(df: pd.DataFrame):
    # Extract the last column
    last_column_name = df.columns[-1]
    
    # Create the plot
    plt.figure(figsize=(10, 6))
    plt.hist(df[last_column_name], bins=50)
    plt.title(f'Histogram of {last_column_name}')
    plt.ylabel(last_column_name)
    plt.grid(True)

    plt.savefig(f'../plots/hist_{last_column_name}.png')

In [5]:
def plot_correlations(df: pd.DataFrame):
    last_column_name = df.columns[-1]
    sns.clustermap(df.corr(), cmap='viridis')
    plt.savefig(f'../plots/corr_{last_column_name}.png')

In [6]:
def create_splits(df:pd.DataFrame):
    X = df.iloc[:,:-1].values
    y = df.iloc[:,-1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, 
        y, 
        test_size=0.15, 
        random_state=42)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, 
        y_train,
        test_size=X_test.shape[0] / X_train.shape[0],  
        random_state=42)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [7]:
def identify_variable_types(data):
    continuous_vars = []
    dummy_vars = []

    num_columns = data.shape[1]
    
    for i in range(num_columns):
        unique_values = np.unique(data[:, i])
        if len(unique_values) == 2 and np.array_equal(unique_values, [0, 1]):
            dummy_vars.append(i)
        else:
            continuous_vars.append(i)
    
    return continuous_vars, dummy_vars

In [8]:
def transform_X(split):

    continuous_cols, dummy_cols = identify_variable_types(split)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), continuous_cols),
            ('dummy', 'passthrough', dummy_cols)  # Leave dummy variables unchanged or use StandardScaler() if needed
        ]
    )
    split = preprocessor.fit_transform(split)

    return split

In [26]:
@run_on_splits
def evaluate(model, X, y, nsplit, model_name, constant_value=None):
    ''' Evaluates the performance of a model 
    Args:
        model (sklearn.Estimator): fitted sklearn estimator
        X (np.array): predictors
        y (np.array): true outcome
        nsplit (str): name of the split
        model_name (str): string id of the model
        constant_value (int or None): relevant if the model predicts a constant
    '''
    if constant_value is not None:
        preds = np.array([constant_value] * y.shape[0])
    else:
        preds = model.predict(X)
    r2 = r2_score(y, preds)
    performance = np.sqrt(mean_squared_error(y, preds))
    model_performances.append({'model': model_name,
                         'split': nsplit,
                         'rmse': round(performance, 4),
                         'r2': round(r2, 4)})

In [30]:
def loop_through_dfs(list_of_dfs):

    def run_on_splits(func):
        def _run_loop(*args, **kwargs):
            for x,y,nsplit in zip([X_train, X_val, X_test],
                                [y_train, y_val, y_test],
                                ['train', 'val', 'test']):
                func(*args, X=x, y=y, nsplit=nsplit, **kwargs)
        return _run_loop

    @run_on_splits
    def evaluate(model, X, y, nsplit, model_name, constant_value=None):
        ''' Evaluates the performance of a model 
        Args:
            model (sklearn.Estimator): fitted sklearn estimator
            X (np.array): predictors
            y (np.array): true outcome
            nsplit (str): name of the split
            model_name (str): string id of the model
            constant_value (int or None): relevant if the model predicts a constant
        '''
        if constant_value is not None:
            preds = np.array([constant_value] * y.shape[0])
        else:
            preds = model.predict(X)
        r2 = r2_score(y, preds)
        performance = np.sqrt(mean_squared_error(y, preds))
        model_performances.append({'model': model_name,
                            'split': nsplit,
                            'rmse': round(performance, 4),
                            'r2': round(r2, 4)})
    
    for idx, df in enumerate(list_of_dfs):
        
        # plot outcome variable
        #plot_outcome(df)

        # plot correlations
        #plot_correlations(df)

        # create splits
        X_train, X_val, X_test, y_train, y_val, y_test = create_splits(df)

        # transform X
        X_train = transform_X(X_train)
        X_val = transform_X(X_val)
        X_test = transform_X(X_test)
        
        model_performances = []

        # run null model
        evaluate(model=None, model_name='dummy', constant_value=y_train.mean())

        # run plain linear regression
        reg = LinearRegression().fit(X_train, y_train)
        evaluate(model=reg, model_name='linear')

        # run ridge and lasso versions
        models = {} 
        models['linear-0.0'] = reg
        for alpha in [0.01, 0.1, 0.2, 0.5, 1.0, 20.0, 10.0, 100.0, 1000.0]:
            for est in [Lasso, Ridge]:
                if est == Lasso:
                    id = 'lasso'
                else:
                    id = 'ridge'
                reg = est(alpha=alpha).fit(X_train, y_train)
                models[f'{id}-{alpha}'] = reg
                evaluate(model=reg, model_name=f'{id}-alpha-{alpha}')

        print(model_performances)


In [31]:
loop_through_dfs(dataframes)

[{'model': 'dummy', 'split': 'train', 'rmse': 11.2661, 'r2': 0.0}, {'model': 'dummy', 'split': 'val', 'rmse': 12.044, 'r2': -0.0241}, {'model': 'dummy', 'split': 'test', 'rmse': 12.5069, 'r2': -0.0051}, {'model': 'linear', 'split': 'train', 'rmse': 6.9475, 'r2': 0.6197}, {'model': 'linear', 'split': 'val', 'rmse': 55978618892232.44, 'r2': -2.2122366196762484e+25}, {'model': 'linear', 'split': 'test', 'rmse': 45297602434711.734, 'r2': -1.3183983522923074e+25}, {'model': 'lasso-alpha-0.01', 'split': 'train', 'rmse': 7.1045, 'r2': 0.6023}, {'model': 'lasso-alpha-0.01', 'split': 'val', 'rmse': 12.3959, 'r2': -0.0848}, {'model': 'lasso-alpha-0.01', 'split': 'test', 'rmse': 11.7864, 'r2': 0.1074}, {'model': 'ridge-alpha-0.01', 'split': 'train', 'rmse': 6.9584, 'r2': 0.6185}, {'model': 'ridge-alpha-0.01', 'split': 'val', 'rmse': 57.0004, 'r2': -21.9374}, {'model': 'ridge-alpha-0.01', 'split': 'test', 'rmse': 44.7042, 'r2': -11.8408}, {'model': 'lasso-alpha-0.1', 'split': 'train', 'rmse': 7.53

  model = cd_fast.enet_coordinate_descent(


[{'model': 'dummy', 'split': 'train', 'rmse': 6.9849, 'r2': 0.0}, {'model': 'dummy', 'split': 'val', 'rmse': 6.9448, 'r2': -0.0068}, {'model': 'dummy', 'split': 'test', 'rmse': 7.2624, 'r2': -0.0001}, {'model': 'linear', 'split': 'train', 'rmse': 5.3493, 'r2': 0.4135}, {'model': 'linear', 'split': 'val', 'rmse': 9541660576076.375, 'r2': -1.9004629985668793e+24}, {'model': 'linear', 'split': 'test', 'rmse': 7721061288113.634, 'r2': -1.1303945051914359e+24}, {'model': 'lasso-alpha-0.01', 'split': 'train', 'rmse': 5.4256, 'r2': 0.3966}, {'model': 'lasso-alpha-0.01', 'split': 'val', 'rmse': 6.9139, 'r2': 0.0022}, {'model': 'lasso-alpha-0.01', 'split': 'test', 'rmse': 8.1152, 'r2': -0.2487}, {'model': 'ridge-alpha-0.01', 'split': 'train', 'rmse': 5.3523, 'r2': 0.4128}, {'model': 'ridge-alpha-0.01', 'split': 'val', 'rmse': 31.3533, 'r2': -19.52}, {'model': 'ridge-alpha-0.01', 'split': 'test', 'rmse': 25.9501, 'r2': -11.7689}, {'model': 'lasso-alpha-0.1', 'split': 'train', 'rmse': 5.8064, 'r2

  model = cd_fast.enet_coordinate_descent(


[{'model': 'dummy', 'split': 'train', 'rmse': 12.0773, 'r2': 0.0}, {'model': 'dummy', 'split': 'val', 'rmse': 12.2307, 'r2': -0.0065}, {'model': 'dummy', 'split': 'test', 'rmse': 12.6877, 'r2': -0.0034}, {'model': 'linear', 'split': 'train', 'rmse': 9.5852, 'r2': 0.3701}, {'model': 'linear', 'split': 'val', 'rmse': 102007974345611.89, 'r2': -7.001648449968539e+25}, {'model': 'linear', 'split': 'test', 'rmse': 82544313499795.88, 'r2': -4.247020950025981e+25}, {'model': 'lasso-alpha-0.01', 'split': 'train', 'rmse': 9.7024, 'r2': 0.3546}, {'model': 'lasso-alpha-0.01', 'split': 'val', 'rmse': 15.2863, 'r2': -0.5723}, {'model': 'lasso-alpha-0.01', 'split': 'test', 'rmse': 16.1432, 'r2': -0.6244}, {'model': 'ridge-alpha-0.01', 'split': 'train', 'rmse': 9.6137, 'r2': 0.3664}, {'model': 'ridge-alpha-0.01', 'split': 'val', 'rmse': 51.0289, 'r2': -16.5212}, {'model': 'ridge-alpha-0.01', 'split': 'test', 'rmse': 73.2512, 'r2': -32.4456}, {'model': 'lasso-alpha-0.1', 'split': 'train', 'rmse': 10.1