In [1]:
#Importing all the needed packages and subpackages. 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 14 09:19:54 2022

@author: vivianliu
adapted for Jupyter notebook by Alexa Halford
"""
#from dask.distributed import Client

#client = Client(n_workers=2, threads_per_worker=2, memory_limit="1GB")
#client

#import dask.dataframe as dd

import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import copy
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score



In [4]:
# Run configuration: every value that has to be hard-coded for a run of
# this Jupyter notebook is collected in this cell.

# --- Input data and prediction target ---------------------------------
# data: path (string) of the CSV file that is loaded for the analysis.
data = "D:\\data\\VL_sdrag\\combined_data_all_reduced_omni.csv"

# target_variable: name of the column the model is trained to predict.
target_variable = '400kmDensity'

# --- Feature selection ------------------------------------------------
# Stand_features: input columns of the "standard model" that other models
# are compared against.
# Candidates currently left out: "DipLat", "1-M_AE_nT", "3-H_KP*10_".
# TODO: add variations of latitude (magnetic, geographic), solar zenith
# angle, etc.
Stand_features = [
    "DAILY_F10.7_",
    "MagTime",
    "SLat",
    "SYM/H_INDEX_nT",
]

# drop_features: columns that are wanted for plotting (the date parts) but
# must not be used as model inputs. Set to None to drop nothing.
drop_features = ["year", "hour", "minute", "second"]

# --- Train/test split -------------------------------------------------
# TrainUpTo: four-digit year; rows up to and including this year are used
# for training. Set to None to use a random split instead.
TrainUpTo = 2008

# test_portion: float between 0 and 1, the share of the data held out for
# testing when a random split is used (documented default: 0.25).
test_portion = 0.25

# --- Random forest ----------------------------------------------------
# estimators: number of trees in the forest (documented default: 150).
estimators = 150

# rdm_state: integer seed for the random forest regression and the random
# split (documented default: 16).
rdm_state = 16



In [5]:
# Load the CSV named by `data`, down-sample it, and split it into the
# feature frame (`merged_df`) and the scaled target series (`target`).

# Number of rows kept from the full file for this run.
sample_size = 100000

merged_df = pd.read_csv(data)

# Seed the sample with rdm_state so a fresh "Restart & Run All" picks the
# same rows, and cap the size so a file with fewer than sample_size rows
# does not make DataFrame.sample raise.
merged_df = merged_df.sample(min(sample_size, len(merged_df)), random_state = rdm_state)
a = np.array(merged_df.keys())

# Sort by date for easier reading and renumber the rows from zero.
merged_df = merged_df.sort_values(by = "Datetime")
merged_df = merged_df.reset_index(drop = True)

# Get rid of any rows before the expected date range; this was needed
# because the file contained some bad dates and data.
# NOTE(review): this compares the raw column values against a string, so it
# presumably relies on "Datetime" being ISO-formatted text -- confirm.
merged_df = merged_df[~(merged_df["Datetime"] < '2002-05-01')]

# The datetime column is not an input of the random forest.
merged_df = merged_df.drop("Datetime", axis = 1)

# Separate the target from the features. The target is multiplied by 10**12
# here; the plotting cell divides by the same factor again.
target = merged_df[target_variable]
target = target*(10**12)
merged_df = merged_df.drop(target_variable, axis = 1)


In [6]:
# Display the prepared feature frame (the "Datetime" and target columns
# were removed from it in the previous cell).
merged_df

Unnamed: 0,SYM/H_INDEX_nT,1-M_AE_nT,Sec,CLat,SLat,SLon,Height,STime,DipLat,MagLon,...,wavelength (nm),irradiance (W/m^2/nm),uncertainty (unitless),Total mass density,year,month,day,hour,minute,second
0,-2.0,30.0,42.5,36.0,37.10206,-153.72723,481.705,13.7597,36.32243,-89.71306,...,121.45,0.006988,0.048970,2.115867e-10,2002,8,1,0,1,0
1,-2.0,26.0,42.5,36.0,37.10206,-153.72723,481.705,13.7597,36.32243,-89.71306,...,121.45,0.006988,0.048970,2.115340e-10,2002,8,1,0,2,0
2,-2.0,27.0,162.5,30.0,29.42465,-154.04183,481.527,13.7721,28.98418,-88.12000,...,121.45,0.006988,0.048970,2.114817e-10,2002,8,1,0,3,0
3,-2.0,27.0,215.0,27.0,26.06354,-154.18842,481.640,13.7769,25.79167,-87.49657,...,121.45,0.006988,0.048970,2.114271e-10,2002,8,1,0,4,0
4,-2.0,25.0,265.0,24.0,22.86150,-154.33181,481.868,13.7812,22.75066,-86.94135,...,121.45,0.006988,0.048970,2.113748e-10,2002,8,1,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4670936,-3.0,430.0,86135.0,84.0,85.04518,104.21998,448.917,6.8702,79.01713,174.31816,...,121.45,0.006015,0.050021,2.282469e-10,2012,6,30,23,56,0
4670937,-2.0,453.0,86197.5,81.0,81.08478,109.07204,448.599,7.2111,75.37923,178.91205,...,121.45,0.006015,0.050021,2.281787e-10,2012,6,30,23,57,0
4670938,-2.0,457.0,86272.5,75.0,76.29726,110.99326,448.123,7.3600,70.88820,-177.99620,...,121.45,0.006015,0.050021,2.281102e-10,2012,6,30,23,58,0
4670939,-3.0,454.0,86340.0,72.0,71.97806,111.71902,447.619,7.4271,66.75584,-176.33339,...,121.45,0.006015,0.050021,2.280416e-10,2012,6,30,23,59,0


In [None]:
# Build the column set of the 'standard' model and restrict merged_df to it.
# The date/time parts come first (they are needed for the year-based split
# and for plotting), followed by the inputs listed in Stand_features.
STmodel_features = ["year", "month", "day", "hour", "minute", "second"]
STmodel_features.extend(Stand_features)
# Stand_list is deliberately the same list object as STmodel_features.
Stand_list = STmodel_features

# Fail early, naming every absent column, instead of relying on the lookup
# below. (The previous per-column loop tried `del Stand_list[<name>]`, which
# indexes a list with a string and could only ever raise TypeError.)
missing_columns = [name for name in Stand_list if name not in merged_df.columns]
if missing_columns:
    raise KeyError("Columns missing from the data: {}".format(missing_columns))

merged_df = merged_df[Stand_list]
a = merged_df.keys()
for name in a:
    print(name, 'is being kept')


In [None]:
###Set training and testing groups###

if TrainUpTo is None:
    # No cut-off year given: create random testing and training groups.
    train_features, test_features, train_target, test_target = train_test_split(
        merged_df, target, test_size = test_portion, random_state = rdm_state)
else:
    # Year-based split: rows with year <= TrainUpTo train the model, all
    # other rows test it. One boolean mask selects both the features and the
    # targets, so each feature row keeps its own target even if the rows are
    # not perfectly ordered by year (a positional slice of `target` would
    # silently pair rows with the wrong targets in that case).
    in_training_years = merged_df["year"] <= TrainUpTo
    train_features = merged_df[in_training_years]
    test_features = merged_df[~in_training_years]
    train_target = target[in_training_years]
    test_target = target[~in_training_years]
    # Number of training rows, kept under its original name.
    size = len(train_features.index)
    


In [None]:
# Keep handles on the frames and the column list while they still include
# the date/time columns; the plotting cell uses them to place the true and
# predicted values on a date axis.
# Stand_list_u is an independent copy of the list, whereas graph_df and
# test_features_u are additional names for the existing frames.
Stand_list_u = list(Stand_list)
graph_df, test_features_u = merged_df, test_features


In [None]:
# Remove the user-specified columns so they are not inputs of the random
# forest, and remove the same names from Stand_list so it keeps matching
# the training columns.
if drop_features is not None:
    # errors="ignore" and the filtering below make this cell safe to re-run:
    # columns/names that are already gone are skipped instead of raising.
    train_features = train_features.drop(columns = drop_features, errors = "ignore")
    test_features = test_features.drop(columns = drop_features, errors = "ignore")
    # Slice assignment edits the existing list object in place, as the
    # original element-by-element removal did.
    Stand_list[:] = [name for name in Stand_list if name not in drop_features]


In [None]:
# Build the random forest from the configured tree count and seed, then fit
# it on the training features and targets.
forest_settings = {"n_estimators": estimators, "random_state": rdm_state}
rf = RandomForestRegressor(**forest_settings)
rf.fit(X = train_features, y = train_target)


In [None]:

# Predict the test targets and report the error metrics.
predictions = rf.predict(test_features)

# Mean absolute error. The target was multiplied by 10**12 before training,
# so this value is in units of 1e-12 kg/m^3, not kg/m^3; the label states
# that explicitly.
mean_abs_error = mean_absolute_error(test_target, predictions)
print("\nMean Absolute Error: ", mean_abs_error, " x 1e-12 kg/m^3.")

# Mean absolute percentage error. scikit-learn returns this as a fraction
# (1.0 corresponds to 100 %), not as a number between 0 and 100.
mape = mean_absolute_percentage_error(test_target, predictions)
print("Mean Absolute Percentage Error: ", mape)


In [None]:
# R-squared score of the predictions against the true test targets.
score = r2_score(test_target, predictions)

# Pair every name in Stand_list with its importance (rounded to two
# decimals) and order the pairs from most to least important.
importances = list(rf.feature_importances_)
feature_importances = [
    (name, round(weight, 2)) for name, weight in zip(Stand_list, importances)
]
feature_importances.sort(key = lambda pair: pair[1], reverse = True)

# Report the score, then one line per feature, then two blank lines.
print('For the standard model')
print("Score: ", score)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))
print("\n")



In [None]:

def _assemble_datetimes(frame):
    """Return a list of timestamps built from the date/time columns of frame.

    frame must contain the columns "year", "month", "day", "hour", "minute"
    and "second". The values are cast to integers first and then combined
    into one timestamp per row, in row order.
    """
    parts = frame[["year", "month", "day", "hour", "minute", "second"]].astype(int)
    return pd.to_datetime(parts).tolist()


# True values: one timestamp per row of the full frame, paired with the
# target, which is divided by 10**12 to undo the scaling applied earlier.
dates = _assemble_datetimes(graph_df)
true_data = pd.DataFrame(data = {'date': dates, 'actual': target})
true_data = true_data.sort_values(by = "date")
true_data["actual"] = true_data["actual"] / (10**12)

# Predicted values: timestamps of the test rows (taken from the frame that
# still has the date columns), paired with the predictions and rescaled the
# same way.
test_dates = _assemble_datetimes(test_features_u)
prediction_data = pd.DataFrame(data = {"dates": test_dates, "predictions": predictions})
prediction_data = prediction_data.sort_values(by = "dates")
prediction_data["predictions"] = prediction_data["predictions"] / (10**12)

# Plot the true values as a blue line
plt.plot(true_data["date"], true_data["actual"], "b-", label = "actual")
# Plot the predicted values as magenta dots
plt.plot(prediction_data["dates"], prediction_data["predictions"], "mo", label = "predicted", markersize = 3)

# Label the plot
plt.xticks(rotation = 60)
plt.xlabel("Date")
plt.ylabel("400 km Density")
plt.title("Actual and Predicted Values of\nRandom Forest for 400km Density")
plt.legend()