In [None]:
# Melia to insert code here
def run_logregression_experiment(df_depression=df_depression,
                                  n_features=20,
                                  c=1.0,
                                  penalty='l2',
                                  solver='saga'):
    
    """
    Runs one logistic regression configuration with 5-fold cross-validation.

    The label is read from column 0 of df_depression and the features from the
    n_features columns that follow it. The model is built from the c, penalty
    and solver arguments, so the hyperparameters reported in the result are the
    ones that were actually evaluated. Features are standardized inside each
    fold with a scaler fitted on that fold's training rows only.

    Params:
        df_depression: dataframe with depression data (label in column 0, features after it)
        n_features: number of features to use
        c: inverse of regularization strength
        penalty: type of regularization
        solver: algorithm to use in the optimization problem

    Returns:
        df_results: single-row dataframe with the experiment settings
            (n_features, c, penalty, solver), average_training_accuracy,
            average_validation_accuracy, and average_variance, which is the
            average training accuracy minus the average validation accuracy
    """
    
    # Print the configuration used for this experiment
    print(f"\nRunning experiment with {n_features} features, c={c}, penalty={penalty}, solver={solver}")
    print("-----------------------------------------------------------------------------------------")

    # Calculate column index for last feature (column 0 holds the label)
    last_feature_index = n_features + 1
    
    # Split the data into X and y
    X = np.array(df_depression.iloc[:, 1:last_feature_index])
    y = np.array(df_depression.iloc[:, 0])

    # Create the logistic regression model from the requested hyperparameters.
    # Previously a default model was fitted and c/penalty/solver were only
    # echoed into the results, so the reported settings were never evaluated.
    model = LogisticRegression(C=c, penalty=penalty, solver=solver, max_iter=2000)

    # define k-fold cross-validation experimentation
    k_folds = 5
    cv = KFold(n_splits=k_folds, shuffle=True, random_state=1)

    # Lists to store training and validation scores for each fold
    train_accuracies = []
    validation_accuracies = []

    # Perform cross-validation and get training and validation scores for each fold
    for fold_number, (train_index, validation_index) in enumerate(cv.split(X, y), start=1):
        # Fit the scaler on the training rows only so that statistics of the
        # validation rows do not leak into the model being validated
        scaler = StandardScaler().fit(X[train_index])
        X_train = scaler.transform(X[train_index])
        X_val = scaler.transform(X[validation_index])
        y_train, y_val = y[train_index], y[validation_index]

        # Train the model on the training set within each fold
        model.fit(X_train, y_train)

        # Calculate training and validation accuracies
        train_accuracy = model.score(X_train, y_train)
        validation_accuracy = model.score(X_val, y_val)

        # Append the accuracies to the lists
        train_accuracies.append(train_accuracy)
        validation_accuracies.append(validation_accuracy)

        # Print the accuracy for the current fold
        print(f"Fold {fold_number} - Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {validation_accuracy:.4f}")

    # Calculate average training and validation accuracies
    average_training_accuracy = np.mean(train_accuracies)
    average_validation_accuracy = np.mean(validation_accuracies)
    # Despite the name, this is the train/validation accuracy gap, not a
    # statistical variance; the key is kept for existing consumers
    average_variance = average_training_accuracy - average_validation_accuracy

    # Print the overall statistics
    print("Average Training Accuracy:", average_training_accuracy)
    print("Average Validation Accuracy:", average_validation_accuracy)

    # Create a dataframe containing results of the experiment
    df_results = pd.DataFrame({
        'n_features': [n_features],
        'c': [c],
        'penalty': [penalty],
        'solver': [solver],
        'average_training_accuracy': [average_training_accuracy],
        'average_validation_accuracy': [average_validation_accuracy],
        'average_variance': [average_variance]
    })

    return df_results

# Run the logistic regression experiment with n_features=20
df_results = run_logregression_experiment(df_depression, n_features=20)

# Show the full results table
print(df_results)

# Pick the row whose average_variance (train minus validation accuracy) is smallest
lowest_gap_label = df_results['average_variance'].idxmin()
best_row = df_results.loc[lowest_gap_label]

print("\nBest Hyperparameters:")
print(best_row)


In [None]:
def grid_search_adaboost(X_std, y):
    """
    Grid-searches AdaBoost hyperparameters with 5-fold cross-validation.

    The base learner is a depth-1 decision tree using the entropy criterion.
    The search covers n_estimators and learning_rate, scores by accuracy, and
    runs on the full data passed in (no separate hold-out split is made here).

    Params:
        X_std: feature matrix handed to GridSearchCV.fit (the name suggests it
            is already standardized; this function does not check or scale it)
        y: target labels

    Returns:
        cv_results_df: dataframe with one row per hyperparameter combination
            that produced a non-NaN mean_test_score, holding the searched
            parameters, mean/std train and test scores, and mean_variance
            (mean_train_score minus mean_test_score); sorted by
            mean_test_score descending, then mean_variance ascending
        best_params: dict of the best hyperparameters found by the search
    """
    base = DecisionTreeClassifier(max_depth=1, criterion='entropy')
    model = AdaBoostClassifier(estimator=base)

    # Define the grid of values to search
    grid = {
        'n_estimators': [700, 800, 710, 730, 750, 780, 770, 778, 783],
        'learning_rate': [1.7, 1.8, 1.9, 1.75],
    }

    # Define the evaluation procedure
    k_folds = 5
    cv = KFold(n_splits=k_folds, shuffle=True, random_state=1)

    # Define and execute the grid search; train scores are requested so the
    # train/test gap can be computed below
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', return_train_score=True)
    grid_search.fit(X_std, y)

    # Summarize the best score and configuration
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))

    # Convert cv_results_ to a pandas DataFrame
    cv_results_df = pd.DataFrame(grid_search.cv_results_)

    # Remove rows that have NaN mean_test_score. The explicit copy makes the
    # filtered frame independent, so adding a column below does not raise
    # pandas' SettingWithCopyWarning.
    cv_results_df = cv_results_df[cv_results_df['mean_test_score'].notna()].copy()

    # Gap between train and test accuracy for each hyperparameter combination
    cv_results_df['mean_variance'] = cv_results_df['mean_train_score'] - cv_results_df['mean_test_score']

    # Keep only the columns of interest
    columns_of_interest = [
        'param_n_estimators',
        'param_learning_rate',
        'mean_train_score',
        'std_train_score',
        'mean_test_score',
        'std_test_score',
        'mean_variance',
    ]
    cv_results_df = cv_results_df[columns_of_interest]

    # Best test score first; ties broken by the smallest train/test gap
    cv_results_df = cv_results_df.sort_values(by=['mean_test_score', 'mean_variance'], ascending=[False, True])
    cv_results_df = cv_results_df.reset_index(drop=True)

    return cv_results_df, grid_search.best_params_

# Run the AdaBoost grid search and keep both outputs: the sorted table of
# cross-validation results and the best hyperparameter dict.
# NOTE(review): X_std and y are expected to be defined in an earlier cell — confirm.
df_gridsearch_adaboost, adaboost_bestparams = grid_search_adaboost(X_std, y)