## Main Notebook
#### Use a separate notebook when working, to avoid merge conflicts!

In [1]:
import pandas as pd 
import numpy as np
import math
import scipy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression

from sklearn.neural_network import MLPClassifier
from concurrent.futures import ThreadPoolExecutor
import pickle as pickle
import csv as csv

from sklearn import svm
# import some validation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from warnings import filterwarnings

# Absolute paths to the weatherAUS CSV files (differently imputed / cleaned copies,
# judging by the file names).
# NOTE(review): the first three paths are rooted at /home/bwright1/scratch while the
# last two are rooted at /scratch/bwright1 - confirm the difference is intentional.
reg_class_file = "/home/bwright1/scratch/weatherAUS_regression_and_classification_imputation.csv"
reg_mean_mode_file = "/home/bwright1/scratch/weatherAUS_regression_and_mean_mode.csv"
just_mean_mode_file = "/home/bwright1/scratch/weatherAUS_mean_mode_imputed.csv"
no_nans_file = "/scratch/bwright1/weatherAUS_continous_normalized_no_nans.csv"
full_file = "/scratch/bwright1/weatherAUS.csv"

## Pipeline Functions

In [2]:
def normalize_column(X):
    """
    Min-max scale the values of X into the range 0 to 1.

    You will get overflow problems when calculating exponentials if
    your feature values are too large, so every value is shifted so the
    minimum becomes 0 and then divided by the resulting maximum.

    Params:
        X (pd.Series / array-like): numeric values to scale. A DataFrame is
            scaled column by column.
    Returns:
        Scaled values with the same shape as X. A constant column (range of 0)
        is returned as all zeros instead of the NaN produced by 0/0.
    """
    shifted = X - X.min()  # shift range to start at 0
    value_range = shifted.max()  # after the shift, the max IS the range

    if np.ndim(value_range) == 0:
        # Scalar range: guard against dividing by zero for constant input.
        divisor = 1.0 if value_range == 0 else value_range
    else:
        # One range per column (DataFrame input): only zero ranges are replaced.
        divisor = value_range.where(value_range != 0, 1.0)

    return shifted / divisor  # divide by range of values so max is now 1

def normalize_data(X):
    """
    Apply normalize_column to every column of X.

    Params:
        X (pd.DataFrame): frame whose columns are each scaled independently
    Returns:
        pd.DataFrame built from the scaled columns (one row per scaled column,
        then transposed back so columns are features again)
    """
    scaled_columns = [normalize_column(X[name]) for name in X.columns]
    return pd.DataFrame(scaled_columns).transpose()

In [3]:
def convert_discrete_to_continous_column(column):
    """
    Replace every distinct value of `column` with an integer code, in place.

    Codes are handed out in the order of column.unique(): the first distinct
    value becomes 0, the next 1, and so on. Values that compare unequal to
    everything (e.g. NaN) match no mask and are left untouched.

    Params:
        column (pd.Series): the column to encode; it is modified in place
    Returns:
        None
    """
    # NOTE: this silences warnings for the whole process, not only for this call.
    filterwarnings('ignore')
    unique_values = column.unique()
    print(unique_values)
    print(len(column))

    # Build every mask against the ORIGINAL values before writing anything.
    # Writing while still comparing would re-match freshly written codes when
    # the column already holds small integers (e.g. [1, 0] would end up all 1).
    masks = [(column == value).to_numpy() for value in unique_values]

    for code, mask in enumerate(masks):
        # Positional (iloc) assignment: the masks are positional, so this also
        # works when the index is not the default 0..n-1 (e.g. a filtered frame).
        column.iloc[mask] = code

def convert_discrete_to_continous_column_individualized(column, feature):
    """
    Encode a known categorical column with a fixed, feature-specific code table, in place.

    Wind-direction features are coded 0..15 following the compass order
    N, NNE, NE, ... NNW; 'RainToday' is coded No -> 0, Yes -> 1. Values that
    are not in the table (e.g. NaN) are left untouched.

    Params:
        column (pd.Series): the column to encode; it is modified in place
        feature (str): one of 'WindGustDir', 'WindDir3pm', 'WindDir9am', 'RainToday'
    Returns:
        None
    Raises:
        KeyError: if `feature` is not one of the supported names
    """
    # NOTE: this silences warnings for the whole process, not only for this call.
    filterwarnings('ignore')

    if feature in ('WindGustDir', 'WindDir3pm', 'WindDir9am'):
        categories = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE',
                      'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    elif feature == 'RainToday':
        categories = ['No', 'Yes']
    else:
        raise KeyError("Invalid Key")

    # Masks are computed up front and applied positionally, so the encoding is
    # correct for any index (np.where positions fed to .loc only worked when the
    # index happened to be 0..n-1).
    masks = [(column == category).to_numpy() for category in categories]
    for code, mask in enumerate(masks):
        column.iloc[mask] = code

def convert_discrete_rain_columns(X, individualized=False):
    """
    Encode the categorical weather columns of X as integer codes, in place.

    Params:
        X (pd.DataFrame): frame containing 'WindGustDir', 'WindDir9am',
            'WindDir3pm' and 'RainToday'; these columns are overwritten
        individualized (bool): when True use the fixed per-feature code tables
            (compass order / No-Yes); otherwise codes follow order of appearance
    Returns:
        None
    """
    discrete_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
    for column in discrete_columns:
        # Encode an explicit copy and assign it back. Mutating X[column] directly
        # is chained assignment: it only reaches X on older pandas and is a silent
        # no-op when copy-on-write is enabled.
        encoded = X[column].copy()
        if individualized:
            convert_discrete_to_continous_column_individualized(encoded, column)
        else:
            convert_discrete_to_continous_column(encoded)
        X[column] = encoded
        

In [1]:
def runTuneTest_multithreaded(learner, parameters, X, y, continuous=False):
    """
    Tunes and evaluates `learner` on several train/test folds, one thread per fold.

    Each fold is handed to __do_single_split, which runs an exhaustive grid
    search on the training part and scores the tuned model on the held-out part.

    Params:
        learner (SKLearn Model): The learner model to be evaluated
        parameters (dict): The hyperparameters to tune
        X (data): The feature values of the dataset to train/test on
        y (data): The label values of the dataset to train/test on
        continuous (bool): when False (default) folds come from a shuffled
            Stratified K Fold with 5 splits; when True from a Repeated K Fold
            with 3 splits x 2 repeats
    Returns:
        scores (list): one metrics dict per fold, in fold order, as returned by
            __do_single_split
    """
    if continuous:
        skf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=0)
    else:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Size the pool from the splitter: the repeated splitter yields 6 folds, so a
    # pool hard-coded to 5 workers would leave one fold waiting for a free thread.
    n_folds = skf.get_n_splits(X, y)

    scores = []
    with ThreadPoolExecutor(max_workers=n_folds) as executor:
        futures = [
            executor.submit(__do_single_split, train, test, fold, X, y, learner, parameters)
            for fold, (train, test) in enumerate(skf.split(X, y), start=1)
        ]

        # Collect in submission order so scores[k] always belongs to fold k+1.
        for future in futures:
            j, score, best_params = future.result()
            scores.append(score)
            print(f"\tFold {j}:\n\tBest parameters: {best_params}\n\tTuning Set Score: {score}\n")

    return scores

def runTuneTest_singlethread(learner, parameters, X, y, continuous=False):
    """
    Tunes and evaluates `learner` on several train/test folds, one fold after another.

    Each fold is handed to __do_single_split, which runs an exhaustive grid
    search on the training part and scores the tuned model on the held-out part.

    Params:
        learner (SKLearn Model): The learner model to be evaluated
        parameters (dict): The hyperparameters to tune
        X (data): The feature values of the dataset to train/test on
        y (data): The label values of the dataset to train/test on
        continuous (bool): when False (default) folds come from a shuffled
            Stratified K Fold with 5 splits; when True from a Repeated K Fold
            with 3 splits x 2 repeats
    Returns:
        scores (list): one metrics dict per fold, in fold order, as returned by
            __do_single_split
    """
    if continuous:
        skf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=0)
    else:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    scores = []
    for fold, (train, test) in enumerate(skf.split(X, y), start=1):
        # The helper echoes the fold number back; it is not needed here.
        _, score, best_params = __do_single_split(train, test, fold, X, y, learner, parameters)
        scores.append(score)
        print(f"\tFold {fold}:\n\tBest parameters: {best_params}\n\tTuning Set Score: {score['accuracy']}\n")

    return scores

def __do_single_split(train, test, i, X, y, learner, parameters):
    """
    Helper Function for RunTuneTest. Allows for easy parallelization of stratified folds

    Runs a 3-fold GridSearchCV over `parameters` on the training rows, then
    evaluates the refit best model on the test rows.

    Params:
        train (array of int): positional row indices used for tuning/training
        test (array of int): positional row indices used for evaluation
        i (int): fold number, only used for logging and echoed back
        X (pd.DataFrame): all feature rows
        y (pd.Series): all labels
        learner (SKLearn Model): estimator to tune
        parameters (dict): hyperparameter grid
    Returns:
        (i, scores, best_params) where scores is a dict with the keys
        'accuracy', 'no_precision', 'no_recall', 'yes_precision', 'yes_recall'.
        Precision/recall are NaN for a label that is absent from the fold's report.
    """
    print(f"Executing fold {i}")
    clf = GridSearchCV(learner, parameters, cv=3)
    clf.fit(X.iloc[train], y.iloc[train])

    testX = X.iloc[test]
    testY = y.iloc[test]
    score = clf.score(testX, testY)
    y_predicted = clf.predict(testX)

    # NOTE(review): classification_report and the 'No'/'Yes' labels assume a
    # classifier on the RainTomorrow labels; confirm before using this helper
    # with continuous targets.
    report = classification_report(testY, y_predicted, output_dict=True)

    # A label that occurs in neither the true nor the predicted values of this
    # fold has no entry in the report; record NaN instead of raising KeyError.
    missing = float('nan')
    no_stats = report.get('No', {})
    yes_stats = report.get('Yes', {})
    scores = {
        "accuracy": score,
        "no_precision": no_stats.get('precision', missing),
        "no_recall": no_stats.get('recall', missing),
        "yes_precision": yes_stats.get('precision', missing),
        "yes_recall": yes_stats.get('recall', missing),
    }
    return i, scores, clf.best_params_
    

In [2]:
# FIXME: `len(data.columns)` is throwing an error here.

In [3]:
def runPipeline(X, y, identifier=""):
    """
    PipeLine function that identifies the best parameters for each model.
        Tunes and scores each model on 5 Stratified K Folds (via
        runTuneTest_multithreaded) and prints the per-fold accuracy.
        Runs pipeline for a Random Forest, K Nearest Neighbors, Decision Tree, and Stochastic Gradient Descent Classifiers
    Params:
        X (pd.Dataframe) : Examples to train/test on
        y (pd.Dataframe) : Example labels to train/test on
        identifier (str) : Optional name of data
    Returns:
        dict mapping model name ("RandomForest", "KNN", "DecisionTree", "SGD")
        to the list of per-fold metric dicts returned by the tuner
    """

    print("Running pipeline for", f"'{identifier}'")

    # (name, estimator, hyperparameter grid), in the order they are tuned and
    # reported. The stochastic estimators are seeded with random_state=0, the same
    # seed the fold splitters use, so repeated runs give the same scores.
    experiments = [
        (
            "RandomForest",
            RandomForestClassifier(n_estimators=100, random_state=0),  # Fewer trees
            {
                'n_estimators': [50, 100], #'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20], #'max_depth': [None, 10, 20, 50],
                'min_samples_split': [2, 10, 50], #'min_samples_split': [2, 5, 10],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                'n_neighbors': [5, 10, 20], #'n_neighbors': [3, 5, 10, 20],
                'weights': ['uniform', 'distance'],
            },
        ),
        (
            "DecisionTree",
            DecisionTreeClassifier(random_state=0),
            {
                'max_depth': [None, 10, 20], #'max_depth': [None, 5, 10, 15],
                'min_samples_split': [2, 10, 50],
                'criterion': ['gini', 'entropy'],
            },
        ),
        (
            "SGD",
            # loss and penalty given here are overridden by the grid below.
            SGDClassifier(loss='log_loss', max_iter=50, tol=None, penalty=None, eta0=0.1, random_state=0),
            {
                'loss': ['hinge', 'log_loss'],
                'penalty': ['l2', 'elasticnet'],
                'alpha': [1e-4, 1e-2],
            },
        ),
    ]

    values = {}
    for name, estimator, grid in experiments:
        values[name] = runTuneTest_multithreaded(estimator, grid, X, y)

    print(f"Identifier: {identifier}")
    for name, results in values.items():
        for fold, acc in enumerate(results, 1):
            print(f"{name}, Fold {fold}: {acc['accuracy'] * 100:.2f}%")

    return values

In [4]:
def new_runPipeline(data, identifier=""):
    """
    Convenience wrapper around runPipeline for a frame that still contains its label.

    Params:
        data (pd.DataFrame): examples including the 'RainTomorrow' label column
        identifier (str): Optional name of data, forwarded to runPipeline
    Returns:
        whatever runPipeline returns for the split features/labels
    """
    features = data.drop(columns=['RainTomorrow'])
    labels = data['RainTomorrow']
    return runPipeline(features, labels, identifier=identifier)

In [5]:
# NOTE: this cell needs `data` to exist already. In this notebook `data` is only
# assigned by pd.read_csv in a later cell, so on a fresh kernel this line raises
# NameError (see the recorded output below).
scores = runPipeline(data.drop(columns=['RainTomorrow', 'Location', 'Date']), data['RainTomorrow'])

NameError: name 'data' is not defined

In [None]:
scores

# Running pipeline on all .csv datasets

In [None]:
# Imputation strategies and dataset variations. Each (imputation, variation) pair
# names one input file, '{imp}_{var}.csv', read by the cells below.
# Variation names containing 'split-loc' / 'split-season' are run once per
# Location / Season subset by the run-all code further down.
imputations = ['remove', 'reg-class', 'mean-mode', 'hybrid']
variations = ['drop-loc+date', 'drop-loc-(month-disc)', 'drop-loc-(month-circ)', 'drop-date-(lat-long)', 
              'disc-month-lat-long', 'disc-month+loc', 'month-circ-lat-long', 'month-circ-lat-long-wind-circ', 
              'split-loc-(drop-date)', 'split-season-(drop-loc)', 'split-loc+season',
              'split-loc-(month-disc)', 'split-season-(month-circ)', 'split-season-(lat-long)',
              'rm-temp1', 'rm-temp2', 'rm-temp3', 'rm-temp4', 'rm-temp5', 'rm-temp6', 'rm-pres1', 'rm-pres2', 
              'rm-hum1', 'rm-hum2', 'rm-1', 'rm-2', 'rm-3', 'rm-4']

In [None]:
# Load a single (imputation, variation) dataset for interactive inspection.
# `imp`, `var` and `data` defined here are reused by later scratch cells.
imp = "hybrid"
var = "split-season-(drop-loc)"
data = pd.read_csv(f'/scratch/srebarb1/NewCSVs/{imp}_{var}.csv')

In [None]:
# Overrides the full `imputations` / `variations` lists defined in an earlier cell
# with a single combination; re-run that earlier cell to restore the full sweep.
imputations = ["hybrid"]
variations = ["disc-month+loc"]

In [None]:
import time

In [2]:
def run_all(imputations, variations):
    """
    Run the model pipeline on every (imputation, variation) dataset and pickle each result.

    For each pair the file '/scratch/srebarb1/NewCSVs/{imp}_{var}.csv' is read
    and passed to new_runPipeline. Variations whose name marks them as split
    datasets are run once per subset instead of once on the whole frame:
        'split-loc+season' -> one run per (Location, Season) pair
        'split-loc'        -> one run per Location
        'split-season'     -> one run per Season
    Each result is written to
    '/scratch/srebarb1/MLproject_dictionaries/{imp}_{var}.pkl'.

    Params:
        imputations (list of str): imputation names (first part of the file name)
        variations (list of str): variation names (second part of the file name)
    Returns:
        dict: dictionary[imp][var] -> pipeline results (nested by subset for
        split variations)
    """
    dictionary = {}
    for imp in imputations:
        dictionary[imp] = {}
        for var in variations:
            data = pd.read_csv(f'/scratch/srebarb1/NewCSVs/{imp}_{var}.csv')
            display(data)

            # 'split-loc+season' has to be tested first: it also contains the
            # substring 'split-loc', which previously captured it and made the
            # combined branch unreachable.
            if 'split-loc+season' in var:
                new_dict = {}
                for loc in data.Location.unique():
                    # Filter in two steps rather than combining the masks with
                    # `and`, which raises ValueError on boolean Series. Seasons
                    # come from the location's own rows so no subset is empty.
                    loc_data = data[data.Location == loc]
                    new_dict[loc] = {}
                    for season in loc_data.Season.unique():
                        new_data = loc_data[loc_data.Season == season]
                        new_dict[loc][season] = new_runPipeline(new_data, identifier=f'{imp}_{var}_{loc}_{season}')
            elif 'split-loc' in var:
                new_dict = {}
                for loc in data.Location.unique():
                    new_data = data[data.Location == loc]
                    new_dict[loc] = new_runPipeline(new_data, identifier=f'{imp}_{var}_{loc}')
            elif 'split-season' in var:
                new_dict = {}
                for season in data.Season.unique():
                    new_data = data[data.Season == season]
                    new_dict[season] = new_runPipeline(new_data, identifier=f'{imp}_{var}_{season}')
            else:
                new_dict = new_runPipeline(data, identifier=f'{imp}_{var}')

            # Persist each dataset's results as soon as they exist so a crash
            # later in the sweep does not lose finished work.
            with open(f'/scratch/srebarb1/MLproject_dictionaries/{imp}_{var}.pkl', "wb") as f:
                pickle.dump(new_dict, f)
            dictionary[imp][var] = new_dict

    #with open(f'/scratch/srebarb1/MLproject_dictionaries/all.pkl', "wb") as f:
    #    pickle.dump(dictionary, f)
    return dictionary

In [None]:
# Top-level version of the full sweep: runs the pipeline for every
# (imputation, variation) dataset, pickles each result and finally pickles the
# combined dictionary. It leaves `imp`, `var`, `data`, `new_dict` and
# `dictionary` in the kernel, which later scratch cells read.
start = time.time()
dictionary = {}
for imp in imputations:
    dictionary[imp] = {}
    for var in variations:
        data = pd.read_csv(f'/scratch/srebarb1/NewCSVs/{imp}_{var}.csv')
        display(data)

        # 'split-loc+season' has to be tested first: it also contains the
        # substring 'split-loc', which previously captured it and made the
        # combined branch unreachable.
        if 'split-loc+season' in var:
            new_dict = {}
            locs = data.Location.unique()
            for loc in locs:
                # Filter in two steps rather than combining the masks with `and`,
                # which raises ValueError on boolean Series. Seasons come from the
                # location's own rows so no subset is empty.
                loc_data = data[data.Location == loc]
                seasons = loc_data.Season.unique()
                new_dict[loc] = {}
                for season in seasons:
                    new_data = loc_data[loc_data.Season == season]
                    new_dict[loc][season] = new_runPipeline(new_data, identifier=f'{imp}_{var}_{loc}_{season}')
        elif 'split-loc' in var:
            new_dict = {}
            locs = data.Location.unique()
            for loc in locs:
                new_data = data[data.Location == loc]
                new_dict[loc] = new_runPipeline(new_data, identifier=f'{imp}_{var}_{loc}')
        elif 'split-season' in var:
            new_dict = {}
            seasons = data.Season.unique()
            for season in seasons:
                new_data = data[data.Season == season]
                new_dict[season] = new_runPipeline(new_data, identifier=f'{imp}_{var}_{season}')
        else:
            new_dict = new_runPipeline(data, identifier=f'{imp}_{var}')

        # Persist each dataset's results immediately so a crash later in the
        # sweep does not lose finished work.
        with open(f'/scratch/srebarb1/MLproject_dictionaries/{imp}_{var}.pkl', "wb") as f:
            pickle.dump(new_dict, f)
        dictionary[imp][var] = new_dict

with open(f'/scratch/srebarb1/MLproject_dictionaries/all.pkl', "wb") as f:
    pickle.dump(dictionary, f)
end = time.time()

In [None]:
# Reload the pickled results for the current `imp` / `var` (both must already be
# set by an earlier cell). pickle.load can execute arbitrary code - only open
# files this project wrote itself.
with open(f'/scratch/srebarb1/MLproject_dictionaries/{imp}_{var}.pkl', 'rb') as file:
    practice = pickle.load(file)

In [None]:
# Manually re-save `new_dict` for the current `imp` / `var`. All three names are
# leftovers from the sweep cell above, so this only works after that cell has run;
# it overwrites any existing file of the same name.
with open(f'/scratch/srebarb1/MLproject_dictionaries/{imp}_{var}.pkl', "wb") as f:
    pickle.dump(new_dict, f)

In [None]:
practice['KNN']

In [None]:
# Manually file `new_dict` under the current imp/var; requires `dictionary[imp]`
# to exist already (it is created by the sweep cell above).
dictionary[imp][var] = new_dict

In [None]:
new_dict[0.0]['RandomForest']

In [None]:
dictionary['hybrid']['split-season'].keys()

In [None]:
practice.keys()

In [2]:
# Load one specific results pickle for inspection (overwrites `practice`).
with open('/scratch/srebarb1/MLproject_dictionaries/hybrid_rm-all-but-rain+temp.pkl', 'rb') as file:
    practice = pickle.load(file)

In [9]:
# Read one prepared CSV to eyeball its columns.
# NOTE(review): this path is under NewNewCSVs, while the sweep code reads from
# NewCSVs - confirm which directory is current.
file = "/scratch/srebarb1/NewNewCSVs/reg-class_disc-month-lat-long.csv"
test = pd.read_csv(file)

In [10]:
test

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month,Latitude,Longitude
0,0.516509,0.523629,0.001617,0.063141,0.596345,0.666667,0.294574,0.666667,0.200000,0.153846,...,0.22,0.449587,0.4800,0.508439,0.522073,0.0,No,12,-36.0802,146.9137
1,0.375000,0.565217,0.000000,0.073556,0.795448,0.200000,0.294574,0.800000,0.733333,0.030769,...,0.25,0.497521,0.4912,0.514768,0.570058,0.0,No,12,-36.0802,146.9137
2,0.504717,0.576560,0.000000,0.095832,0.806069,0.733333,0.310078,0.666667,0.733333,0.146154,...,0.30,0.447934,0.5056,0.594937,0.548944,0.0,No,12,-36.0802,146.9137
3,0.417453,0.620038,0.000000,0.076483,0.780690,0.466667,0.139535,0.333333,0.533333,0.084615,...,0.16,0.613223,0.5712,0.533755,0.612284,0.0,No,12,-36.0802,146.9137
4,0.613208,0.701323,0.002695,0.081881,0.553655,0.666667,0.271318,0.400000,1.000000,0.053846,...,0.33,0.500826,0.4624,0.527426,0.673704,0.0,No,12,-36.0802,146.9137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,0.283019,0.502836,0.000000,0.062265,0.796828,0.533333,0.193798,0.866667,0.533333,0.115385,...,0.27,0.730579,0.7056,0.350211,0.504798,0.0,No,6,-25.3444,131.0369
142189,0.266509,0.533081,0.000000,0.066053,0.783379,0.533333,0.193798,0.333333,0.400000,0.100000,...,0.24,0.728926,0.6912,0.364979,0.533589,0.0,No,6,-25.3444,131.0369
142190,0.285377,0.568998,0.000000,0.070554,0.796690,0.800000,0.124031,0.333333,0.266667,0.100000,...,0.21,0.710744,0.6720,0.381857,0.573896,0.0,No,6,-25.3444,131.0369
142191,0.327830,0.599244,0.000000,0.073861,0.820138,0.266667,0.240310,0.333333,0.200000,0.069231,...,0.24,0.669421,0.6352,0.415612,0.604607,0.0,No,6,-25.3444,131.0369


In [3]:
practice

{'RandomForest': [{'accuracy': 0.799008403952319,
   'no_precision': 0.8161844344731549,
   'no_recall': 0.956308919506889,
   'yes_precision': 0.6273676072671047,
   'yes_recall': 0.25458823529411767},
  {'accuracy': 0.8006962270122016,
   'no_precision': 0.817942054842338,
   'no_recall': 0.955853691701038,
   'yes_precision': 0.6332831325301205,
   'yes_recall': 0.263801756587202},
  {'accuracy': 0.7991138928935616,
   'no_precision': 0.8170079106561191,
   'no_recall': 0.9549471966640982,
   'yes_precision': 0.6250471520181063,
   'yes_recall': 0.25988080301129235},
  {'accuracy': 0.8000210985301357,
   'no_precision': 0.8180546923555003,
   'no_recall': 0.9545392738974754,
   'yes_precision': 0.6276911655530809,
   'yes_recall': 0.26525490196078433},
  {'accuracy': 0.8007947113017793,
   'no_precision': 0.818630498989585,
   'no_recall': 0.9547658976567104,
   'yes_precision': 0.631189948263119,
   'yes_recall': 0.267921568627451}],
 'KNN': [{'accuracy': 0.7932065121839728,
   'no

In [4]:
import pickle
# Point `imp` / `var` at a different result set and reload `practice` from it.
# Note this rebinds the global `imp` and `var` used by other scratch cells.
imp = 'remove'
var = 'disc-month-lat-long'
with open(f'/scratch/srebarb1/MLproject_dictionaries/{imp}_{var}.pkl', 'rb') as file:
    practice = pickle.load(file)

In [6]:
practice.keys()

dict_keys(['RandomForest', 'KNN', 'DecisionTree', 'SGD'])

In [3]:
# Load the combined results dictionary.
# WARNING: the name `all` shadows Python's built-in all() for the rest of the
# kernel session; a later cell reads this variable, so rename both together.
with open(f'/scratch/srebarb1/MLproject_dictionaries/all.pkl', 'rb') as file:
    all = pickle.load(file)

In [7]:
# Deletes the combined results file from disk. After this, the cell that loads
# all.pkl fails until the file is regenerated.
! rm /scratch/srebarb1/MLproject_dictionaries/all.pkl

In [6]:
all['hybrid'].keys()

dict_keys(['disc-month+loc'])

In [1]:
import os
import pickle

# Load every '.pkl' file in ./PklDicts into `dictionaries`, keyed by file name
# (extension included), then print one example.
directory = "./PklDicts"

files = os.listdir(directory)
dictionaries = {}

for file in files:
    if file.endswith(".pkl"):  
        file_path = os.path.join(directory, file)
        # pickle.load can execute arbitrary code; only load files this project wrote.
        with open(file_path, 'rb') as f:
            dictionaries[file] = pickle.load(f)

print(f"Loaded dictionaries: {list(dictionaries.keys())}")
print("--------------------")

# Raises KeyError if 'hybrid_rm-1.pkl' is not present in the directory.
example_dict = dictionaries["hybrid_rm-1.pkl"] 
print(example_dict)


Loaded dictionaries: ['reg-class_split-season-(lat-long).pkl', 'hybrid_rm-pres1.pkl', 'mean-mode_month-circ-lat-long-wind-circ.pkl', 'hybrid_split-season-(lat-long).pkl', 'mean-mode_rm-hum1.pkl', 'hybrid_rm-rain.pkl', 'remove_rm-4.pkl', 'reg-class_rm-rain.pkl', 'hybrid_split-loc-(month-circ).pkl', 'reg-class_rm-pres2.pkl', 'reg-class_split-loc-(month-circ).pkl', 'remove_split-loc+season.pkl', 'mean-mode_disc-month+loc.pkl', 'reg-class_rm-4.pkl', 'mean-mode_rm-3.pkl', 'mean-mode_rm-temp2.pkl', 'hybrid_rm-4.pkl', 'reg-class_split-season-(drop-loc).pkl', 'hybrid_disc-month-lat-long.pkl', 'hybrid_rm-1.pkl', 'reg-class_rm-1.pkl', 'hybrid_split-loc-(drop-date).pkl', 'reg-class_month-circ-lat-long-wind-circ.pkl', 'mean-mode_rm-all-but-rain+temp.pkl', 'remove_rm-hum2.pkl', 'hybrid_rm-all-but-rain+temp.pkl', 'remove_rm-all-but-rain.pkl', 'reg-class_split-loc+season.pkl', 'remove_month-circ-lat-long.pkl', 'hybrid_month-circ-lat-long-wind-circ.pkl', 'mean-mode_rm-pres2.pkl', 'hybrid_drop-loc+date

In [2]:
def load_pickled_dictionaries(directory):
    """
    Load every '<imp>_<var>.pkl' file in `directory` into a nested dictionary.

    The file name (without '.pkl') is split at its first underscore: the part
    before it is the imputation name, everything after it the variation name.
    Files without the '.pkl' suffix are ignored.

    Params:
        directory (str): folder containing the pickled result dictionaries
    Returns:
        dict: result[imp][var] -> unpickled object
    Raises:
        ValueError: if a '.pkl' file name contains no underscore
    """
    suffix = ".pkl"
    data_by_imp_var = {}

    for entry in os.listdir(directory):
        if not entry.endswith(suffix):
            continue

        # pickle.load can execute arbitrary code; only load trusted files.
        with open(os.path.join(directory, entry), 'rb') as handle:
            payload = pickle.load(handle)

        imp, separator, var = entry[:-len(suffix)].partition('_')
        if not separator:
            raise ValueError(f"Filename {entry} does not follow expected pattern.")

        data_by_imp_var.setdefault(imp, {})[var] = payload

    return data_by_imp_var

def run_friedman_test(loaded_data, imp, var, model='RandomForest', metric='accuracy'):
    """
    Run a Friedman test across the per-fold metric values of several datasets.

    One group of observations is collected for every (imputation, variation)
    pair that exists in `loaded_data` and has values of `metric` for `model`;
    missing pairs are reported with a printed message and skipped.

    Params:
        loaded_data (dict): loaded_data[imp][var][model] -> list of per-fold dicts
        imp (iterable of str): imputation names to include
        var (iterable of str): variation names to include
        model (str): key of the model whose folds are compared
        metric (str): key of the per-fold value to compare
    Returns:
        (stat, p_value) as returned by scipy.stats.friedmanchisquare
    Raises:
        ValueError: if the collected groups do not all have the same length
    """
    grouped_data = []

    for method in imp:
        if method in loaded_data:
            by_variation = loaded_data[method]
            for variation in var:
                if variation in by_variation:
                    folds = by_variation[variation].get(model, [])
                    metric_values = [fold[metric] for fold in folds if metric in fold]
                    if metric_values:
                        grouped_data.append(metric_values)
                    else:
                        print(f"No metric values found for {method} - {variation} - {model}")
                else:
                    print(f"Warning: '{variation}' is missing for '{method}' in loaded_data.")
        else:
            print(f"Warning: '{method}' is missing in loaded_data.")

    # The test needs the same number of observations (folds) in every group.
    group_lengths = list(map(len, grouped_data))
    distinct_lengths = set(group_lengths)
    if len(distinct_lengths) != 1:
        print(f"Group lengths: {group_lengths}")
        raise ValueError("All groups must have the same number of observations.")

    from scipy.stats import friedmanchisquare
    result = friedmanchisquare(*grouped_data)
    stat, p_value = result
    print(f"Friedman test statistic: {stat}")
    print(f"p-value: {p_value}")

    return stat, p_value



In [3]:
# Load all pickled results and choose which imputations / variations to compare.
directory = "./PklDicts"
imp_methods = ['reg-class', 'mean-mode', 'hybrid']
var_methods = ['drop-loc+date', 'drop-loc-(month-disc)', 'drop-loc-(month-circ)']

loaded_data = load_pickled_dictionaries(directory)

# The Friedman test call itself is currently disabled:
#stat, p_value = run_friedman_test(loaded_data, imp_methods, var_methods)






In [4]:
loaded_data['hybrid']['drop-loc+date'].keys()

dict_keys(['RandomForest', 'KNN', 'DecisionTree', 'SGD'])

In [8]:
# Sanity check: confirm every requested imputation method was actually loaded.
print("Top-level keys in loaded_data:", loaded_data.keys())

for imp in imp_methods:
    if imp in loaded_data:
        print(f"'{imp}' exists in loaded_data.")
    else:
        print(f"Warning: '{imp}' is missing in loaded_data.")

# Prints the same top-level keys a second time under a different label.
print("Loaded file keys:", loaded_data.keys())








Top-level keys in loaded_data: dict_keys(['reg-class', 'hybrid', 'mean-mode', 'remove'])
'reg-class' exists in loaded_data.
'mean-mode' exists in loaded_data.
'hybrid' exists in loaded_data.
Loaded file keys: dict_keys(['reg-class', 'hybrid', 'mean-mode', 'remove'])
