<a href="https://colab.research.google.com/github/isam007/AssetManagementSystem_MVC/blob/master/PM3_Team6_W21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-Up and Data Preparation

In [None]:
import numpy as np # for numeric computation
import pandas as pd # for handling data in table format
import requests # for retrieving web addresses
import io # for storing data
import plotly.express as px # for visualization
import plotly.graph_objects as go #for time series
import matplotlib.pyplot as plt


print("Loading Data ...")
# Source CSVs are hosted remotely; pd.read_csv fetches a URL directly, so no
# separate HTTP request is needed (the earlier requests.get calls discarded
# their responses and only caused every file to be downloaded twice).
p1g_data = 'https://codytischler.com/exportdata/Plant_1_Generation_Data.csv'
p1w_data = 'https://codytischler.com/exportdata/Plant_1_Weather_Sensor_Data.csv'
p2g_data = 'https://codytischler.com/exportdata/Plant_2_Generation_Data.csv'
p2w_data = 'https://codytischler.com/exportdata/Plant_2_Weather_Sensor_Data.csv'


def load_named_csv(url, label):
  """Read the CSV at `url` into a DataFrame and tag it with a readable label.

  Parameters
  ----------
  url : str
      Location of the CSV file (anything pd.read_csv accepts).
  label : str
      Human-readable description, stored as the frame's `name` attribute
      (a plain Python attribute, not a column).

  Returns
  -------
  pandas.DataFrame
      The loaded frame. Any error raised by pd.read_csv propagates.
  """
  frame = pd.read_csv(url)
  frame.name = label
  print(frame.name+' loaded successfully')
  return frame


df_p1g = load_named_csv(p1g_data, 'Plant 1 Power Generation Dataframe')
df_p1w = load_named_csv(p1w_data, 'Plant 1 Weather Dataframe')
df_p2g = load_named_csv(p2g_data, 'Plant 2 Power Generation Dataframe')
df_p2w = load_named_csv(p2w_data, 'Plant 2 Weather Dataframe')

# create a list of the data frames for easy looping
data_frames = [df_p1g, df_p2g, df_p1w, df_p2w]
print("All data has been loaded")

# correct the data types (each frame in data_frames is modified in place)
print("Changing feature data types...")
categorical_types = ['PLANT_ID', 'SOURCE_KEY']
for df in data_frames:
  for c in categorical_types:
    # categories are the values actually present in this frame, in order of appearance
    df[c] = df[c].astype(pd.CategoricalDtype(categories=df[c].unique()))
  # NOTE(review): infer_datetime_format is deprecated in pandas 2.x; it is kept
  # because removing it may change how dates are parsed on older pandas - confirm
  # against the pandas version in use before dropping it.
  df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], infer_datetime_format=True)
print("Feature Data Types are changed")

# add a month-day string ('MM-DD') derived from the timestamp
for df in data_frames:
  df['MONTH_DAY'] = df['DATE_TIME'].dt.strftime('%m-%d')
  # A former `df = df[['MONTH_DAY'] + ...].copy()` line was removed here: it only
  # rebound the loop variable, so the reordered copy never reached data_frames
  # or the df_p* variables. MONTH_DAY therefore stays the last column, as before.

# Clean Plant 1 Data
print("Cleaning Plant 1 Data ...")
# Index by timestamp, keep only the 06:00-19:00 window and drop the columns
# that are not used downstream. set_index returns a new frame, so df_p1g
# itself is left untouched.
df_p1g_clean = (
  df_p1g.set_index('DATE_TIME')
  .between_time('06:00', '19:00')
  .drop(columns=['PLANT_ID', 'AC_POWER', 'TOTAL_YIELD', 'DAILY_YIELD'])
)
# Days of the month excluded from the analysis (matched in every month present).
excluded_days = [26, 2, 3, 7]
df_p1g_clean = df_p1g_clean.loc[~df_p1g_clean.index.day.isin(excluded_days)]
# Drop one specific timestamp. DataFrame.drop returns a new frame, so the result
# must be assigned back; previously it was discarded and the rows were kept.
# Raises KeyError if the timestamp is not present in the index.
df_p1g_clean = df_p1g_clean.drop(pd.Timestamp('2020-05-19 11:30:00'))
print("Plant 1 Data Cleaned")

# Clean Plant 2 Data
print("Cleaning Plant 2 Data ... ")
# Index by timestamp, keep only the 06:00-19:00 window and drop the columns
# that are not used downstream. set_index returns a new frame, so df_p2g
# itself is left untouched.
df_p2g_clean = (
  df_p2g.set_index('DATE_TIME')
  .between_time('06:00', '19:00')
  .drop(columns=['PLANT_ID', 'AC_POWER', 'TOTAL_YIELD', 'DAILY_YIELD'])
)
## Remove 8 of 22 inverters
removed_inverters = [
  '4UPUqMRk7TRMgml',
  'IQ2d7wF4YD8zU1Q',
  'xMbIugepa2P7lBB',
  'mqwcsP2rE7J0TFp',
  'NgDl19wMapZy17u',
  'xoJJ8DcxJEcupym',
  'PeE6FRyGXUgsRhN',
  'Quc1TzYxW2pYoWX',
]
df_p2g_clean = df_p2g_clean[~df_p2g_clean.SOURCE_KEY.isin(removed_inverters)]
# The completion message previously said "Plant 1" by mistake.
print("Plant 2 Data Cleaned")


# Join weather and power data for each plant
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
WEATHER_FEATURES = ["AMBIENT_TEMPERATURE", "MODULE_TEMPERATURE", "IRRADIATION"]


def join_weather_and_power(df_weather, df_power):
  """Stack weather and power rows on a shared time index and fill in the weather.

  Both inputs must be indexed by timestamp. The rows are concatenated and
  sorted by time, then each weather feature is interpolated against the index
  so that power rows receive weather values. Rows that still contain a NaN in
  any column are dropped afterwards.

  Parameters
  ----------
  df_weather : pandas.DataFrame
      Weather readings containing the WEATHER_FEATURES columns.
  df_power : pandas.DataFrame
      Cleaned generation readings.

  Returns
  -------
  pandas.DataFrame
      The combined frame, sorted by time, without NaNs.
  """
  joined = pd.concat([df_weather, df_power], axis=0)
  joined.index = pd.to_datetime(joined.index, format=DATETIME_FORMAT)
  joined = joined.sort_index()
  for column in WEATHER_FEATURES:
    # Assign the result back instead of calling interpolate(inplace=True) on
    # joined[column]: the in-place form operates on a selected column and is
    # not guaranteed to update `joined` itself.
    joined[column] = joined[column].interpolate(method='index')
  return joined.dropna()


print("Joining Plant 1 Data ...")
# NOTE: df_p1w is rebound to its time-indexed form, so this cell expects the
# freshly loaded frame (re-run the loading cell before re-running this one).
df_p1w = df_p1w.set_index('DATE_TIME').drop(columns=['PLANT_ID', 'SOURCE_KEY'])
df_p1wg = join_weather_and_power(df_p1w, df_p1g_clean)
print("Plant 1 Data Joined")

print("Joining Plant 2 Data ...")
# NOTE: df_p2w is rebound in the same way as df_p1w above.
df_p2w = df_p2w.set_index('DATE_TIME').drop(columns=['PLANT_ID', 'SOURCE_KEY'])
df_p2wg = join_weather_and_power(df_p2w, df_p2g_clean)
print("Plant 2 Data Joined")

Loading Data ...
Plant 1 Power Generation Dataframe loaded successfully
Plant 1 Weather Dataframe loaded successfully
Plant 2 Power Generation Dataframe loaded successfully
Plant 2 Weather Dataframe loaded successfully
All data has been loaded
Changing feature data types...
Feature Data Types are changed
Cleaning Plant 1 Data ...
Plant 1 Data Cleaned
Cleaning Plant 2 Data ... 
Plant 1 Data Cleaned
Joining Plant 1 Data ...
Plant 1 Data Joined
Joining Plant 2 Data ...
Plant 2 Data Joined


# Feature Engineering

In [None]:
def engineer_power_features(df_joined):
  """Collapse per-inverter rows into one row per timestamp.

  For every DATE_TIME the DC_POWER of all rows sharing that timestamp is
  averaged (total DC power divided by the number of reporting inverters) and
  stored as DC_POWER_PER_INVERTER. The per-inverter columns (SOURCE_KEY,
  DC_POWER) and MONTH_DAY are then dropped and duplicate rows removed.

  This replaces a row-by-row loop that rescanned the whole frame for each new
  timestamp and grew a lookup table with DataFrame.append, which no longer
  exists in pandas 2.x.

  Parameters
  ----------
  df_joined : pandas.DataFrame
      Joined weather/power frame indexed by DATE_TIME, with SOURCE_KEY,
      DC_POWER and MONTH_DAY columns. It is not modified.
      Assumes DC_POWER contains no NaN (the join step ends with dropna()).

  Returns
  -------
  pandas.DataFrame
      DATE_TIME, the remaining columns and DC_POWER_PER_INVERTER. The index
      holds the row position of the first occurrence of each kept row.
  """
  df_features = df_joined.reset_index()
  # transform() broadcasts the per-timestamp mean back onto every row of the group
  df_features["DC_POWER_PER_INVERTER"] = (
    df_features.groupby("DATE_TIME")["DC_POWER"].transform("mean")
  )
  df_features = df_features.drop(columns=["SOURCE_KEY", "DC_POWER", "MONTH_DAY"])
  # rows of the same timestamp are now identical; keep the first of each
  return df_features.drop_duplicates().copy()


plant = "1"
print ("Calculating Engineered Features for Plant "+plant+" ...")
df_p1 = engineer_power_features(df_p1wg)
print("Finished calculating new features for Plant "+plant)


plant = "2"
print ("Calculating Engineered Features for Plant "+plant+" ...")
df_p2 = engineer_power_features(df_p2wg)
print("Finished calculating new features for Plant "+plant)


Calculating Engineered Features for Plant 1 ...
0% Complete
10% Complete
20% Complete
30% Complete
40% Complete
50% Complete
60% Complete
70% Complete
80% Complete
90% Complete
100% Complete
Finished calculating new features for Plant 1
Calculating Engineered Features for Plant 2 ...
0% Complete
10% Complete
20% Complete
30% Complete
40% Complete
50% Complete
60% Complete
70% Complete
80% Complete
90% Complete
100% Complete
Finished calculating new features for Plant 2


# Univariate and Bivariate Analysis of Features

## Plant 1

In [None]:
# Rich display of the Plant 1 frame held in df_p1.
display(df_p1)

Unnamed: 0,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DC_POWER_PER_INVERTER
0,2020-05-15 06:00:00,24.088446,22.206757,0.005887,44.864229
21,2020-05-15 06:15:00,24.011635,22.353459,0.022282,292.481009
42,2020-05-15 06:30:00,23.976731,22.893282,0.049410,694.143398
64,2020-05-15 06:45:00,24.218990,24.442444,0.095394,1301.208604
86,2020-05-15 07:00:00,24.537398,27.185653,0.141940,1879.359740
...,...,...,...,...,...
33874,2020-06-17 18:00:00,24.130349,25.080925,0.041940,605.090909
33896,2020-06-17 18:15:00,24.038157,24.068250,0.023446,338.664773
33918,2020-06-17 18:30:00,23.840239,22.968658,0.007007,78.116883
33940,2020-06-17 18:45:00,23.583049,22.460372,0.000039,0.000000


In [None]:
# One histogram (with a marginal box plot) per Plant 1 feature; the timestamp is skipped.
feature_names = df_p1.columns.drop('DATE_TIME').tolist()
tight_margin = dict(l=0, r=0, t=0, b=0)
for feature in feature_names:
  fig = px.histogram(df_p1, x=feature,  marginal="box", height=300)
  fig.update_layout(margin=tight_margin)
  fig.show()

In [None]:
# Pairwise scatter matrix of all Plant 1 features except the timestamp.
feature_names = df_p1.columns.drop('DATE_TIME').tolist()
m = len(df_p1)               # number of samples
n = len(df_p1.columns) - 1   # number of plotted features
# 200 px per panel, plus one extra panel's worth of room
side_px = (n + 1) * 200
fig = px.scatter_matrix(df_p1, dimensions=feature_names)
fig.update_layout(width=side_px, height=side_px, margin=dict(l=0, r=0, t=0, b=0))
fig.show()

In [None]:
# Plant 1: per-inverter DC power against ambient temperature, then against irradiation.
for x_feature in ("AMBIENT_TEMPERATURE", "IRRADIATION"):
  fig = px.scatter(df_p1, x=x_feature, y="DC_POWER_PER_INVERTER")
  fig.show()

## Plant 2

In [None]:
# Rich display of the Plant 2 frame held in df_p2.
display(df_p2)

Unnamed: 0,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DC_POWER_PER_INVERTER
0,2020-05-15 06:00:00,24.742787,24.077230,0.012962,15.381905
14,2020-05-15 06:15:00,24.763911,24.356189,0.021038,26.990884
28,2020-05-15 06:30:00,24.786323,24.548886,0.042435,58.639592
42,2020-05-15 06:45:00,25.048245,25.746662,0.103496,158.465238
56,2020-05-15 07:00:00,25.673202,28.674707,0.199450,310.450952
...,...,...,...,...,...
24970,2020-06-17 18:00:00,25.686604,26.540474,0.041514,60.496190
24984,2020-06-17 18:15:00,25.235443,25.656230,0.022150,31.967143
24998,2020-06-17 18:30:00,24.582472,24.786525,0.013393,19.410000
25012,2020-06-17 18:45:00,23.965923,23.283976,0.002635,3.430374


In [None]:
# One histogram (with a marginal box plot) per Plant 2 feature; the timestamp is skipped.
feature_names = df_p2.columns.drop('DATE_TIME').tolist()
tight_margin = dict(l=0, r=0, t=0, b=0)
for feature in feature_names:
  fig = px.histogram(df_p2, x=feature,  marginal="box", height=300)
  fig.update_layout(margin=tight_margin)
  fig.show()

In [None]:
# Pairwise scatter matrix of all Plant 2 features except the timestamp.
feature_names = df_p2.columns.drop('DATE_TIME').tolist()
m = len(df_p2)               # number of samples
n = len(df_p2.columns) - 1   # number of plotted features
# 200 px per panel, plus one extra panel's worth of room
side_px = (n + 1) * 200
fig = px.scatter_matrix(df_p2, dimensions=feature_names)
fig.update_layout(width=side_px, height=side_px, margin=dict(l=0, r=0, t=0, b=0))
fig.show()

# Note to team 6: df_p1, and df_p2 contain the cleaned, aligned, and engineered features to use for PM3

# PM3 Feature Scaling

In [None]:
# Turn the continuous DC power into a categorical label so that a kNN
# classifier can be trained on it.
df = df_p1.copy()
target = df["DC_POWER_PER_INVERTER"]

label_categories=["low", "medium", "high", "very_high", "unknown"]
label_type = pd.CategoricalDtype(categories=label_categories)

# Class boundaries chosen by visual inspection of the DC Power vs Ambient Temp plot
low_value_rows = (target < 4000)
medium_value_rows = (target >= 4000) & (target < 8000)
high_value_rows = (target >= 8000) & (target < 10000)
very_high_value_rows = (target >= 10000)

# The four masks are mutually exclusive; rows matching none of them
# (e.g. a NaN target) stay "unknown".
class_masks = [low_value_rows, medium_value_rows, high_value_rows, very_high_value_rows]
class_names = np.select(class_masks, label_categories[:-1], default="unknown")
df["POWER_CLASS"] = pd.Series(class_names, index=df.index).astype(label_type)


display(df)
fig = px.histogram(df, x="POWER_CLASS",  marginal="box", height=300)
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
fig.show()
# looks good!

# hand the labelled frame back under the shared name
df_p1 = df.copy()

Unnamed: 0,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DC_POWER_PER_INVERTER,POWER_CLASS
0,2020-05-15 06:00:00,24.088446,22.206757,0.005887,44.864229,low
21,2020-05-15 06:15:00,24.011635,22.353459,0.022282,292.481009,low
42,2020-05-15 06:30:00,23.976731,22.893282,0.049410,694.143398,low
64,2020-05-15 06:45:00,24.218990,24.442444,0.095394,1301.208604,low
86,2020-05-15 07:00:00,24.537398,27.185653,0.141940,1879.359740,low
...,...,...,...,...,...,...
33874,2020-06-17 18:00:00,24.130349,25.080925,0.041940,605.090909,low
33896,2020-06-17 18:15:00,24.038157,24.068250,0.023446,338.664773,low
33918,2020-06-17 18:30:00,23.840239,22.968658,0.007007,78.116883,low
33940,2020-06-17 18:45:00,23.583049,22.460372,0.000039,0.000000,low


In [None]:
# Preprocessing
import sklearn.preprocessing

# First make the time numeric: minutes elapsed since midnight of the same day.
df = df_p1.copy()
time_since_midnight = df['DATE_TIME'] - df['DATE_TIME'].dt.floor('d')
# total_seconds() / 60 replaces .astype('timedelta64[m]'), which pandas 2.x rejects.
df.insert(0, 'NUMERIC_TIME', time_since_midnight.dt.total_seconds() / 60.0)
df = df.drop(columns='DATE_TIME')

df_original = df.copy()

# Only the input features are scaled; DC_POWER_PER_INVERTER and POWER_CLASS are left as-is.
features = ["NUMERIC_TIME", "AMBIENT_TEMPERATURE", "MODULE_TEMPERATURE", "IRRADIATION"]

# NOTE(review): both scalers are fitted on the full frame before any
# train/test split, so test rows influence the scaling parameters.

# standardize with a standard scaler (zero mean, unit variance per feature)
df_standardized = df.copy()
scaler = sklearn.preprocessing.StandardScaler()
for feature in features:
  df_standardized[feature] = scaler.fit_transform(df[[feature]].values)

# Normalize with a min-max scaler (each feature mapped onto [0, 1])
df_normalized = df.copy()
scaler = sklearn.preprocessing.MinMaxScaler()
for feature in features:
  df_normalized[feature] = scaler.fit_transform(df[[feature]].values)

# Print mean and population standard deviation (ddof=0) for both scalings.
# Series reductions are used instead of np.mean(DataFrame)[0], which fails
# when the DataFrame reduction returns a scalar.
for feature in features:
  mean1 = round(float(df_normalized[feature].mean()), 3)
  std1 = round(float(df_normalized[feature].std(ddof=0)), 3)
  mean2 = round(float(df_standardized[feature].mean()), 3)
  std2 = round(float(df_standardized[feature].std(ddof=0)), 3)
  print(feature.ljust(20)+" Mean (Norm, Stand) : ("+str(mean1).ljust(5)+" , "+str(mean2).ljust(5)+")"\
        +" Standard Deviation (Norm, Stand) : ("+str(std1).ljust(5)+" , "+str(std2).ljust(5)+")")

# label each frame so later cells can report which scaling they used
df_original.name ='original'
df_normalized.name = 'normalized'
df_standardized.name = 'standardized'



NUMERIC_TIME         Mean (Norm, Stand) : (0.502 , 0.0  ) Standard Deviation (Norm, Stand) : (0.293 , 1.0  )
AMBIENT_TEMPERATURE  Mean (Norm, Stand) : (0.48  , -0.0 ) Standard Deviation (Norm, Stand) : (0.214 , 1.0  )
MODULE_TEMPERATURE   Mean (Norm, Stand) : (0.436 , -0.0 ) Standard Deviation (Norm, Stand) : (0.241 , 1.0  )
IRRADIATION          Mean (Norm, Stand) : (0.333 , 0.0  ) Standard Deviation (Norm, Stand) : (0.242 , 1.0  )


## Task 1 Check Extreme Values

* **Define what an extreme value is for each feature, e.g. using boxplots.**
See the boxplot/histograms in the Univariate and Bivariate section above.
* **Do any of your features contain extreme values? List the features & the number of extreme values for each feature.** Every feature has some high values, but they sit at the end of the tail of its distribution; they are not extreme enough to be treated as outliers.
* **Do the number of extreme values make-up a substantial portion of that feature? (e.g. 5%) Or are they so few that you could consider dropping those samples? Explain.** There are no extreme high values to explain. One extreme case worth discussing is the large number of zero-power measurements recorded at night: they carry no information about the power-producing ability of the plant, yet as a single value they make up a large portion of the data set. We decided to drop these night-time values because they did not provide any useful information. They are still a limiting value, so we have noted them here.

## Task 2 Normalize

**Normalize your data and store this in a variable name that indicates that you’ve normalized the data, e.g. X_train_norm.**

The data was normalized and stored in a dataframe named 'df_normalized'

## Task 3 Standardize
**Standardize your data and store it in a variable name that indicates that you’ve normalized the data, e.g. X_train_test.**

The data was standardized and stored in a dataframe named 'df_standardized'

## Task 4 Visual & Numeric Comparison (1)
**Choose 3 features, and compare their standard
deviations of after standardizing & normalizing. What is the difference between these two approaches on how your features are distributed Hint: look at the mean. Students take-away: data are more concentrated around the mean if we scale data using Max-Min Normalization.** 


See the output of the code cell above this text cell. For the normalized features the mean varies from feature to feature and the standard deviation is much smaller than for the standardized features, where the mean and standard deviation have been set to 0 and 1 respectively.

## Task 5 Visual & Numeric Comparison (2)
**Generate a boxplot for all three datasets, the original, the
normalized, and the standardized dataset. We expect to see a comparison like the one in lab 5-2, where each box-n-whiskers plot corresponds to a different feature.**

See the figures below, where each feature has been plotted for the original, normalized, and standardized cases. Although the values on the y-axis have changed, we can clearly see that the shape of each distribution has remained unchanged throughout the scaling.

In [None]:
from plotly.subplots import make_subplots
feats = ['NUMERIC_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
       'IRRADIATION']

# One figure per feature with three side-by-side box plots:
# original, min-max normalized and standardized values.
boxplot_panels = [
  ("Original", df_original),
  ("Normalized", df_normalized),
  ("Standardized", df_standardized),
]

for feat in feats:
  fig = make_subplots(rows=1, cols=3)
  for panel_col, (panel_name, panel_frame) in enumerate(boxplot_panels, start=1):
    fig.append_trace(go.Box(y=panel_frame[feat], name=panel_name), row=1, col=panel_col)
  fig.update_layout(height=300, width=600, title_text="Boxplots for: "+feat)

  fig.show()

# PM3 Exploring kNN Construction:

## Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from plotly.subplots import make_subplots

In [None]:
# function to make decision boundary plots.
# the example used matplotlib, but I decided to give it a try within the plotly framework.
def getfig(fig_title, df, knn, feat_x,feat_y,feat_z):
  """Plot kNN decision regions over (feat_x, feat_y) at five fixed values of feat_z.

  A regular grid spanning the observed range of feat_x and feat_y is classified
  with `knn` while feat_z is held at mean - 2*std, mean - std, mean, mean + std
  and mean + 2*std; each of those five slices is drawn in its own subplot.

  Parameters
  ----------
  fig_title : str
      Title of the whole figure.
  df : pandas.DataFrame
      Data providing the ranges of feat_x / feat_y and the mean and standard
      deviation of feat_z. It is only read.
  knn : fitted classifier exposing `predict`
      If it has `feature_names_in_`, that column order is used for prediction;
      otherwise NUMERIC_TIME, AMBIENT_TEMPERATURE, IRRADIATION is assumed.
  feat_x, feat_y, feat_z : str
      Feature names; together they must cover the features the classifier expects.

  Returns
  -------
  plotly.graph_objects.Figure
      Figure with one row of five subplots sharing the y axis.
  """
  color_dict ={
      "low": "blue",
      "medium": "skyblue",
      "high": "orange",
      "very_high": "red",
  }
  fallback_color = "gray"  # used for any predicted class missing from color_dict
  n_points = 40  # approximate grid resolution along each axis

  # Column order the classifier expects.
  default_feature_order = ["NUMERIC_TIME", "AMBIENT_TEMPERATURE", "IRRADIATION"]
  fitted_with_names = hasattr(knn, "feature_names_in_")
  feature_order = [str(name) for name in getattr(knn, "feature_names_in_", default_feature_order)]

  z_avg = np.mean(df[feat_z])
  z_std = np.std(df[feat_z])
  print("Feature: "+feat_z+", Mean: "+str(z_avg)+", Std: "+str(z_std))

  x_min = np.min(df[feat_x])
  x_max = np.max(df[feat_x])
  x_step = (x_max-x_min)/n_points

  y_min = np.min(df[feat_y])
  y_max = np.max(df[feat_y])
  y_step = (y_max-y_min)/n_points

  # Build the grid once. indexing="ij" followed by ravel() visits the points
  # with x as the outer loop and y as the inner loop.
  grid_x, grid_y = np.meshgrid(np.arange(x_min, x_max, x_step),
                               np.arange(y_min, y_max, y_step),
                               indexing="ij")
  grid_x = grid_x.ravel()
  grid_y = grid_y.ravel()

  fig = make_subplots(rows=1,cols=5,shared_yaxes=True, subplot_titles=(feat_z+' = \u03BC - 2\u03C3',feat_z+' = \u03BC - \u03C3',feat_z+' = \u03BC', feat_z+' = \u03BC + \u03C3',feat_z+' = \u03BC + 2\u03C3'))
  z_levels = [z_avg - 2*z_std,z_avg - z_std, z_avg, z_avg+z_std,z_avg + 2*z_std]
  for col_i, z in enumerate(z_levels, start=1):
    # feat_x/y/z are not necessarily in the order the classifier expects, so
    # the grid columns are stored by name and then arranged into feature_order.
    vals = {}
    vals[feat_x] = grid_x
    vals[feat_y] = grid_y
    vals[feat_z] = np.full(grid_x.shape, z)
    grid_features = pd.DataFrame({name: vals[name] for name in feature_order})
    if not fitted_with_names:
      grid_features = grid_features.to_numpy()
    # a single predict call per slice instead of one call per grid point
    y_hat = np.asarray(knn.predict(grid_features))

    for cat in pd.unique(y_hat):
      in_class = (y_hat == cat)
      trace = go.Scatter(x=grid_x[in_class], y=grid_y[in_class], name=str(cat), mode='markers',
                         marker=dict(color=color_dict.get(cat, fallback_color),size=5, symbol='square'), )
      fig.add_trace(trace, row = 1, col=col_i)

  # x title on every subplot; y title only on the first (the y axis is shared)
  for axis_number in range(1, 6):
    suffix = '' if axis_number == 1 else str(axis_number)
    fig['layout']['xaxis'+suffix]['title']=feat_x
    fig['layout']['yaxis'+suffix]['title']=feat_y if axis_number == 1 else ''

  fig.update_layout(title=fig_title,height=350, width=1500,margin=dict(l=0, r=0, t=100, b=0))
  return fig


In [None]:
# Sweep kNN classifiers over scaling type, weighting scheme and neighbor count,
# record train/test metrics, and draw decision regions for a few neighbor counts.
result_columns = ['Scaling','Weight','Neighbors','Train_Accuracy','Train_Precision','Train_f1','Train_recall','Test_Accuracy','Test_Precision','Test_f1','Test_recall']
weight_methods = ['uniform', 'distance']
df_original.name ='original'
df_normalized.name = 'normalized'
df_standardized.name = 'standardized'
# The three input features used by the classifier
feat_1 = "NUMERIC_TIME"
feat_2 = "AMBIENT_TEMPERATURE"
feat_3 = "IRRADIATION"

# Fixed seed: the split is reproducible and all three scalings are evaluated
# on the same train/test rows.
RANDOM_STATE = 42
# neighbor counts for which decision-region figures are drawn
plotted_neighbor_counts = (5, 11, 15)

# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
result_rows = []

for df_c in [df_original, df_normalized, df_standardized]:
  df = df_c.copy()
  df.name = df_c.name
  #split the dataset into a training set and a testing set
  df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
  X_train = df_train[[feat_1,feat_2,feat_3]]
  X_test = df_test[[feat_1,feat_2, feat_3]]

  # the categorical power class is the label (y)
  y_train = df_train['POWER_CLASS']
  y_test = df_test['POWER_CLASS']

  for weight_method in weight_methods:
    for n in range(3,200):
      #make predictions
      knn = KNeighborsClassifier(n_neighbors=n, weights=weight_method)
      knn.fit(X_train, y_train)
      yhat_train = knn.predict(X_train)
      yhat_test = knn.predict(X_test)

      #get metrics (precision, f1 and recall are averaged weighted by class support)
      train_acc = accuracy_score(y_train, yhat_train)
      test_acc = accuracy_score(y_test, yhat_test)
      train_prec = precision_score(y_train, yhat_train, average='weighted')
      test_prec = precision_score(y_test, yhat_test, average='weighted')
      train_f1 = f1_score(y_train, yhat_train,average='weighted')
      test_f1 = f1_score(y_test, yhat_test,average='weighted')
      train_recall = recall_score(y_train, yhat_train,average='weighted')
      test_recall = recall_score(y_test, yhat_test,average='weighted')

      result_rows.append([df.name,weight_method, n, train_acc, train_prec, train_f1, train_recall, test_acc, test_prec, test_f1, test_recall ])

      if n in plotted_neighbor_counts:
        fig_title = "Dataset Scaling: "+df.name+" Neighbors: "+str(n)+", Weighting Style: "+weight_method
        fig = getfig(fig_title, df, knn, feat_1, feat_2, feat_3)
        fig.show()
        fig = getfig('', df, knn, feat_1, feat_3, feat_2)
        fig.show()
        fig = getfig('', df, knn, feat_2, feat_3, feat_1)
        fig.show()

df_results = pd.DataFrame(result_rows, columns=result_columns)
display(df_results)

In [None]:
# Sweep kNN classifiers over scaling type, weighting scheme and neighbor count
# and record train/test metrics (no figures are drawn in this cell).
result_columns = ['Scaling','Weight','Neighbors','Train_Accuracy','Train_Precision','Train_f1','Train_recall','Test_Accuracy','Test_Precision','Test_f1','Test_recall']
weight_methods = ['uniform', 'distance']
df_original.name ='original'
df_normalized.name = 'normalized'
df_standardized.name = 'standardized'
# The three input features used by the classifier
feat_1 = "NUMERIC_TIME"
feat_2 = "AMBIENT_TEMPERATURE"
feat_3 = "IRRADIATION"

# Fixed seed: the split is reproducible and all three scalings are evaluated
# on the same train/test rows.
RANDOM_STATE = 42

# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
result_rows = []

for df_c in [df_original, df_normalized, df_standardized]:
  df = df_c.copy()
  df.name = df_c.name
  #split the dataset into a training set and a testing set
  df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
  X_train = df_train[[feat_1,feat_2,feat_3]]
  X_test = df_test[[feat_1,feat_2, feat_3]]

  # the categorical power class is the label (y)
  y_train = df_train['POWER_CLASS']
  y_test = df_test['POWER_CLASS']

  for weight_method in weight_methods:
    for n in range(3,200):
      #make predictions
      knn = KNeighborsClassifier(n_neighbors=n, weights=weight_method)
      knn.fit(X_train, y_train)
      yhat_train = knn.predict(X_train)
      yhat_test = knn.predict(X_test)

      #get metrics (precision, f1 and recall are averaged weighted by class support)
      train_acc = accuracy_score(y_train, yhat_train)
      test_acc = accuracy_score(y_test, yhat_test)
      train_prec = precision_score(y_train, yhat_train, average='weighted')
      test_prec = precision_score(y_test, yhat_test, average='weighted')
      train_f1 = f1_score(y_train, yhat_train,average='weighted')
      test_f1 = f1_score(y_test, yhat_test,average='weighted')
      train_recall = recall_score(y_train, yhat_train,average='weighted')
      test_recall = recall_score(y_test, yhat_test,average='weighted')

      result_rows.append([df.name,weight_method, n, train_acc, train_prec, train_f1, train_recall, test_acc, test_prec, test_f1, test_recall ])

df_results = pd.DataFrame(result_rows, columns=result_columns)
display(df_results)
#correct the datatypes of result dataframe
for c in ['Scaling', 'Weight']:
  df_results[c] = df_results[c].astype(pd.CategoricalDtype(categories=df_results[c].unique()))
df_results["Neighbors"] = pd.to_numeric(df_results["Neighbors"], downcast='integer')

In [None]:
# Rich display of the kNN results table held in df_results.
display(df_results)

Unnamed: 0,Scaling,Weight,Neighbors,Train_Accuracy,Train_Precision,Train_f1,Train_recall,Test_Accuracy,Test_Precision,Test_f1,Test_recall
0,original,uniform,3,0.833066,0.842811,0.836052,0.833066,0.679487,0.697830,0.686348,0.679487
0,original,uniform,4,0.793740,0.800628,0.794296,0.793740,0.698718,0.721820,0.705615,0.698718
0,original,uniform,5,0.769663,0.775534,0.771962,0.769663,0.698718,0.716359,0.706469,0.698718
0,original,uniform,6,0.759230,0.764391,0.760806,0.759230,0.673077,0.693524,0.680760,0.673077
0,original,uniform,7,0.749599,0.758966,0.753364,0.749599,0.666667,0.697703,0.679666,0.666667
...,...,...,...,...,...,...,...,...,...,...,...
0,standardized,distance,195,1.000000,1.000000,1.000000,1.000000,0.878205,0.874769,0.875333,0.878205
0,standardized,distance,196,1.000000,1.000000,1.000000,1.000000,0.878205,0.875887,0.875990,0.878205
0,standardized,distance,197,1.000000,1.000000,1.000000,1.000000,0.871795,0.868523,0.868646,0.871795
0,standardized,distance,198,1.000000,1.000000,1.000000,1.000000,0.878205,0.875887,0.875990,0.878205


In [None]:
# Scaling comparison on test accuracy: each marker pairs (by row order) the score obtained
# on the standardized dataset (x axis) with the score obtained on the original dataset (y axis).
fig = go.Figure()

# runs that used uniform weighting
df_temp = df_results.loc[df_results["Weight"] == 'uniform']
x_data = df_temp.loc[df_temp["Scaling"] == 'standardized', "Test_Accuracy"]
y_data = df_temp.loc[df_temp["Scaling"] == 'original', "Test_Accuracy"]
trace1 = go.Scatter(name="Uniform Weighting", mode='markers', x=x_data, y=y_data)

# runs that used distance weighting
df_temp = df_results.loc[df_results["Weight"] == 'distance']
x_data = df_temp.loc[df_temp["Scaling"] == 'standardized', "Test_Accuracy"]
y_data = df_temp.loc[df_temp["Scaling"] == 'original', "Test_Accuracy"]
trace2 = go.Scatter(name="Distance Weighting", mode='markers', x=x_data, y=y_data)

# y = x diagonal, hidden from the legend
ref = go.Scatter(name="Reference Line", mode='lines', showlegend=False, x=[0,1.0], y=[0,1.0])

for trace in (trace1, trace2, ref):
  fig.add_trace(trace)

fig.update_layout(
    title="Scaling Comparison: Test Accuracy",
    xaxis_title="Standardized Dataset Accuracy",
    yaxis_title="Original Dataset Accuracy",
    xaxis_range=[0.5,1],
    yaxis_range=[0.5,1],
    height=300, width=400,
    margin=dict(l=0, r=0, t=100, b=0))
fig.show()

In [None]:
# Scaling comparison on test accuracy: each marker pairs (by row order) the score obtained
# on the normalized dataset (x axis) with the score obtained on the original dataset (y axis).
fig = go.Figure()

# runs that used uniform weighting
df_temp = df_results.loc[df_results["Weight"] == 'uniform']
x_data = df_temp.loc[df_temp["Scaling"] == 'normalized', "Test_Accuracy"]
y_data = df_temp.loc[df_temp["Scaling"] == 'original', "Test_Accuracy"]
trace1 = go.Scatter(name="Uniform Weighting", mode='markers', x=x_data, y=y_data)

# runs that used distance weighting
df_temp = df_results.loc[df_results["Weight"] == 'distance']
x_data = df_temp.loc[df_temp["Scaling"] == 'normalized', "Test_Accuracy"]
y_data = df_temp.loc[df_temp["Scaling"] == 'original', "Test_Accuracy"]
trace2 = go.Scatter(name="Distance Weighting", mode='markers', x=x_data, y=y_data)

# y = x diagonal, hidden from the legend
ref = go.Scatter(name="Reference Line", mode='lines', showlegend=False, x=[0,1.0], y=[0,1.0])

for trace in (trace1, trace2, ref):
  fig.add_trace(trace)

fig.update_layout(
    title="Scaling Comparison: Test Accuracy",
    xaxis_title="Normalized Dataset",
    yaxis_title="Original Dataset",
    xaxis_range=[0.5,1],
    yaxis_range=[0.5,1],
    height=300, width=375,
    margin=dict(l=0, r=0, t=100, b=0))
fig.show()

In [None]:
# Scaling comparison, one figure per test metric: the score on the normalized dataset (x axis)
# is plotted against the score on the standardized dataset (y axis), paired by row order,
# with one marker trace per weighting scheme.
for metric in ["Test_Accuracy", "Test_Precision", "Test_f1", "Test_recall"]:
  fig = go.Figure()
  for weight_name, legend_label in [('uniform', "Uniform Weighting"), ('distance', "Distance Weighting")]:
    df_temp = df_results[df_results["Weight"] == weight_name]
    x_data = df_temp.loc[df_temp["Scaling"] == 'normalized', metric]
    y_data = df_temp.loc[df_temp["Scaling"] == 'standardized', metric]
    fig.add_trace(go.Scatter(x=x_data, y=y_data, name=legend_label, mode='markers'))
  # y = x diagonal, hidden from the legend
  fig.add_trace(go.Scatter(x=[0,1.0], y=[0,1.0], name="Reference Line", mode='lines', showlegend=False))
  fig.update_layout(
      title="Scaling Comparison: "+metric,
      xaxis_title="Normalized Dataset",
      yaxis_title="Standardized Dataset",
      xaxis_range=[0.8,0.95],
      yaxis_range=[0.8,0.95],
      height=300, width=375,
      margin=dict(l=0, r=0, t=100, b=0))
  fig.show()

In [None]:
# Split the classification results by weighting scheme so each can be inspected on its own.
df_c  = df_results.copy()

# The original lines had an extra '[' (a SyntaxError); select rows with a plain boolean mask.
df_uniform = df_c[df_c["Weight"] == 'uniform']
df_distance = df_c[df_c["Weight"] == 'distance']
# NOTE(review): the original sorted by 'col1', which is not a column of df_results (KeyError).
# Test_Accuracy is assumed to be the intended ranking column -- confirm.
df_uniform.sort_values(by=['Test_Accuracy'])

In [None]:
# Train vs. test accuracy against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
# The dtype fix-ups repeat the ones applied right after df_results was filled.
for c in ('Scaling', 'Weight'):
  seen_values = df_results[c].unique()
  df_results[c] = df_results[c].astype(pd.CategoricalDtype(categories=seen_values))
df_results["Neighbors"] = pd.to_numeric(df_results["Neighbors"], downcast='integer')

for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_Accuracy"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +" Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="Accuracy",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()


In [None]:
# Train vs. test weighted precision against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_Precision"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +" Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="Precision",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()

In [None]:
# Train vs. test weighted F1 score against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_f1"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +" Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="F1 Score",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()

In [None]:
# Train vs. test weighted recall against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_recall"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +" Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="Recall",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()

1. Vary the n_neighbors parameter:
  * Create a kNN QuAM querying 5 neighbors.
  * Create a kNN QuAM querying 11 neighbors.
  * Create a kNN QuAM querying 15 neighbors.
2. Vary the weights parameter:
  * Create a kNN QuAM with weights parameter set to “uniform”.
  * Create a kNN QuAM with weights parameter set to “distance”.

### kNN Classification Evaluation:


We’ve built several QuAMs, but which is best? To determine the performance of our QuAMs we need to evaluate them using our accuracy classification score.

In multilabel classification, our accuracy score computes the subset accuracy: the set of labels
predicted for a sample must exactly match the corresponding set of labels in y_true. By default (`normalize=True`) the accuracy score returns the fraction of correctly classified samples (float);
with `normalize=False` it returns the number of correctly classified samples (int).
3. Plot the decision boundaries for each class for the different n_neighbors and weights. Note: For help plotting the classes’ decision boundaries see the following documentation, https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html

4. Use the sklearn.metrics.accuracy_score method to compare the accuracy of the several values of k, k=5, 11, and 15, and compare your original, your normalized, and your standardized datasets.
Accuracy can be computed by comparing the test set values and the predicted values, e.g.:

```python
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
```

5. Test the performance of each kNN using other evaluation metrics (precision, recall and F1-
measure) and see if there is any difference if you chose different metrics.

6. Imagine that you have another classifier that always predicts the most common label, do a
baseline comparison for your QuAMs.

7. Generate a simple plot of the test and training learning curve for each kNN and check if you
have a low/high bias or low/high variance model. We can use the function learning_curve to
generate the values that are required to plot such a learning curve (number of samples that
have been used, the average scores on the training sets and the average scores on the
validation sets):
`from sklearn.model_selection import learning_curve`

In [None]:
#Comparison with a baseline QuAM which always predicts the most common label.
# The label is taken from the training split instead of being hard-coded, so the
# baseline stays correct if the class balance changes and never looks at the test labels.

for df_c in [df_original, df_normalized, df_standardized]:
  df = df_c.copy()
  df.name = df_c.name
  #split the dataset into a training set and a testing set
  df_train, df_test = train_test_split(df, test_size=0.2)
  y_test = df_test['POWER_CLASS']
  # most frequent class among the training labels (first entry of mode() if there is a tie)
  majority_label = df_train['POWER_CLASS'].mode().iloc[0]
  # yhat is a copy of y_test (same index and dtype) overwritten with the majority label
  baseline_yhat_test = y_test.copy()
  baseline_yhat_test[:] = majority_label
  print((df.name+" Accuracy: ").ljust(30)+str(accuracy_score(y_test, baseline_yhat_test)))
  # classes that are never predicted have undefined precision; zero_division=0 scores them
  # as 0.0 (the value the default already used) without emitting a warning
  print((df.name+" Precision: ").ljust(30)+str(precision_score(y_test, baseline_yhat_test,average='weighted',zero_division=0)))
  print((df.name+" F1 score: ").ljust(30)+str(f1_score(y_test, baseline_yhat_test,average='weighted')))
  print((df.name+" Recall score: ").ljust(30)+str(recall_score(y_test, baseline_yhat_test,average='weighted')))


original Accuracy:            0.41025641025641024
original Precision:           0.16831032215647598
original F1 score:            0.23869463869463872
original Recall score:        0.41025641025641024
normalized Accuracy:          0.375
normalized Precision:         0.140625
normalized F1 score:          0.20454545454545453
normalized Recall score:      0.375
standardized Accuracy:        0.4230769230769231
standardized Precision:       0.17899408284023668
standardized F1 score:        0.2515592515592515
standardized Recall score:    0.4230769230769231



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
#Use Grid Search Cross Validation (5 folds) to find the best n_neighbors for each
#weighting scheme on the standardized and the normalized dataset, using all three features
from sklearn.model_selection import GridSearchCV
#for scorer in ['accuracy', 'precision_weighted','f1_weighted','recall_weighted']:
for scorer in ['accuracy']:
  print("Score Method: "+scorer)
  for weight in ['uniform','distance']:
    print("   Weight: "+weight)    
    for df_c in [df_standardized, df_normalized]:
      print("      Scaling: "+df_c.name)
    
      df = df_c.copy()
      df.name = df_c.name
      #features and label handed to the 5-fold cross validation
      X = df[["NUMERIC_TIME","AMBIENT_TEMPERATURE","IRRADIATION"]]
      y = df["POWER_CLASS"]
      #create new a knn model
      knn2 = KNeighborsClassifier(weights=weight)
      #create a dictionary of all values we want to test for n_neighbors
      param_grid = {'n_neighbors': np.arange(1, 200)}
      #use gridsearch to test all values for n_neighbors
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer)
      #fit model to data
      knn_gscv.fit(X, y)
      #check top performing n_neighbors value
      print("         Best N_neighbors: "+str(knn_gscv.best_params_))
      #check mean score for the top performing value of n_neighbors
      #(label spelled "Value", matching the other grid-search cells)
      print("         Best Score Value: "+str(knn_gscv.best_score_))


Score Method: accuracy
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: 0.9127092093330036
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: 0.9152815565998846
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 11}
         Best Score Vale: 0.9101368620661227
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: 0.9101430455932065


In [None]:
# Grid search with 5-fold cross validation over n_neighbors, using AMBIENT_TEMPERATURE as
# the only feature; repeated for each weighting scheme on the standardized and normalized datasets.
from sklearn.model_selection import GridSearchCV
#candidate n_neighbors values, identical for every search below
param_grid = {'n_neighbors': np.arange(1, 250)}
for scorer in ['accuracy']:
  print("Score Method: "+scorer)
  for weight in ['uniform','distance']:
    print("   Weight: "+weight)
    for df_c in [df_standardized, df_normalized]:
      print("      Scaling: "+df_c.name)

      #work on a named copy of the dataset
      df = df_c.copy()
      df.name = df_c.name
      #feature matrix and label column for the cross validation
      X, y = df[["AMBIENT_TEMPERATURE"]], df["POWER_CLASS"]
      #fresh kNN model searched over every candidate n_neighbors
      knn2 = KNeighborsClassifier(weights=weight)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer)
      knn_gscv.fit(X, y)
      #report the winning n_neighbors and its mean cross-validated score
      print("         Best N_neighbors: ", knn_gscv.best_params_, sep="")
      print("         Best Score Value: ", knn_gscv.best_score_, sep="")

Score Method: accuracy
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 83}
         Best Score Value: 0.449317750845082
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 83}
         Best Score Value: 0.449317750845082
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 89}
         Best Score Value: 0.42302745486025223
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 89}
         Best Score Value: 0.42302745486025223


In [None]:
# Grid search with 5-fold cross validation over n_neighbors, using NUMERIC_TIME and IRRADIATION
# as features; repeated for each weighting scheme on the standardized and normalized datasets.
from sklearn.model_selection import GridSearchCV
#candidate n_neighbors values, identical for every search below
param_grid = {'n_neighbors': np.arange(1, 200)}
for scorer in ['accuracy']:
  print("Score Method: "+scorer)
  for weight in ['uniform','distance']:
    print("   Weight: "+weight)
    for df_c in [df_standardized, df_normalized]:
      print("      Scaling: "+df_c.name)

      #work on a named copy of the dataset
      df = df_c.copy()
      df.name = df_c.name
      #feature matrix and label column for the cross validation
      X, y = df[["NUMERIC_TIME","IRRADIATION"]], df["POWER_CLASS"]
      #fresh kNN model searched over every candidate n_neighbors
      knn2 = KNeighborsClassifier(weights=weight)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer)
      knn_gscv.fit(X, y)
      #report the winning n_neighbors and its mean cross-validated score
      print("         Best N_neighbors: ", knn_gscv.best_params_, sep="")
      print("         Best Score Value: ", knn_gscv.best_score_, sep="")

Score Method: accuracy
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 77}
         Best Score Value: 0.9281061917717865
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 29}
         Best Score Value: 0.9249093082694368
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 145}
         Best Score Value: 0.9210487261934208
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 125}
         Best Score Value: 0.9172005111715723


In [None]:
from sklearn.model_selection import learning_curve

# Learning curves of a 9-neighbor kNN classifier on the normalized dataset, one figure per
# scoring method. Each figure shows the mean score over the 10 CV folds with a +/- one
# standard deviation band, for both the training and the validation folds.
for scorer in ['accuracy', 'precision_weighted','f1_weighted','recall_weighted']:
  df = df_normalized.copy()
  # label the figure with the dataset that is actually used (previously df_standardized.name,
  # which made the title say "standardized" for normalized data)
  df.name = df_normalized.name
  #split the dataset into a training set and a testing set
  df_train, df_test = train_test_split(df, test_size=0.2)
  X_train = df_train[[feat_1,feat_2, feat_3]]
  X_test = df_test[[feat_1,feat_2, feat_3]]

  #Make last column into feature into the output i.e. Label (y)
  y_train = df_train['POWER_CLASS']
  y_test = df_test['POWER_CLASS']

  # only the training split is passed to learning_curve; the test split is not used in this cell
  data_sizes, training_scores, validation_scores = \
  learning_curve(KNeighborsClassifier(n_neighbors=9), X_train, \
                 y_train, cv=10, scoring=scorer, \
                 train_sizes=np.linspace(0.1, 1.0, 100))
  # scores have one row per training size and one column per fold
  training_mean = training_scores.mean(axis=1) 
  training_standard_deviation = training_scores.std(axis=1) 
  validation_mean = validation_scores.mean(axis=1) 
  validation_standard_deviation = validation_scores.std(axis=1)
  fig = go.Figure()

  # the upper-bound trace uses fill='tonexty', so it must directly follow its lower-bound trace
  fig.add_trace(go.Scatter(x=data_sizes,  y=training_mean,mode='lines',name='Training',line=dict(color='red')))
  fig.add_trace(go.Scatter(x=data_sizes,y=training_mean - training_standard_deviation,mode='lines',name='Training lower bound',line=dict(width=0, color='red'),showlegend=False))
  fig.add_trace(go.Scatter(x=data_sizes,y=training_mean + training_standard_deviation,mode='lines',name='Training upper bound',line=dict(width=0, color='red'),fill='tonexty',fillcolor='rgba(255, 0, 0, 0.3)',showlegend=False))

  fig.add_trace(go.Scatter(x=data_sizes, y=validation_mean,mode='lines',name='Validation',line=dict(color='blue')))
  fig.add_trace(go.Scatter(x=data_sizes, y=validation_mean - validation_standard_deviation,mode='lines',name='Validation lower bound',line=dict(width=0, color='blue'),showlegend=False))
  fig.add_trace(go.Scatter(x=data_sizes, y=validation_mean + validation_standard_deviation,mode='lines',name='Validation upper bound',line=dict(width=0, color='blue'),fill='tonexty',fillcolor='rgba(0, 0, 255, 0.3)',showlegend=False))

  fig.update_layout(title="Learning curve for "+df.name+" dataset",xaxis_title='Dataset size',yaxis_title=scorer)
  fig.show()


## kNN Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from plotly.subplots import make_subplots

In [None]:
# kNN regression sweep over dataset scaling, weighting scheme and number of neighbors.
# For every combination a KNeighborsRegressor is fitted on an 80/20 split and its
# train/test RMSE, MAE and R2 are stored as one row of df_results.
result_columns = ['Scaling','Weight','Neighbors','Train_RMSE','Train_MAE','Train_R2','Test_RMSE','Test_MAE','Test_R2']
weight_methods = ['uniform', 'distance']
df_original.name ='original'
df_normalized.name = 'normalized'
df_standardized.name = 'standardized'
#Select the three input features as feat_1, feat_2 and feat_3
feat_1 = "NUMERIC_TIME"
feat_2 = "AMBIENT_TEMPERATURE"
feat_3 = "IRRADIATION"

# Rows are collected in a plain list and converted to a DataFrame once after the loops:
# DataFrame.append was removed in pandas 2.0 and copied the whole table on every call.
result_rows = []

for df_c in [df_original, df_normalized, df_standardized]:
  df = df_c.copy()
  df.name = df_c.name
  #split the dataset into a training set and a testing set
  df_train, df_test = train_test_split(df, test_size=0.2)
  X_train = df_train[[feat_1,feat_2,feat_3]]
  X_test = df_test[[feat_1,feat_2, feat_3]]

  #The regression target (y) is the DC power per inverter
  y_train = df_train['DC_POWER_PER_INVERTER']
  y_test = df_test['DC_POWER_PER_INVERTER']

  # end points of the y = x reference line: it has to span the measured target values,
  # a fixed 0-1 diagonal only makes sense for scores, not for the power target
  diag_min = min(y_train.min(), y_test.min())
  diag_max = max(y_train.max(), y_test.max())

  for weight_method in weight_methods:
    for n in range(3,125):
      #make predictions
      knn = KNeighborsRegressor(n_neighbors=n, weights=weight_method)
      knn.fit(X_train, y_train) 
      yhat_train = knn.predict(X_train)
      yhat_test = knn.predict(X_test)

      #get metrics
      # RMSE = sqrt(MSE); the `squared=False` shortcut is deprecated in recent scikit-learn
      rmse_train = np.sqrt(metrics.mean_squared_error(y_train, yhat_train))
      rmse_test  = np.sqrt(metrics.mean_squared_error(y_test, yhat_test))

      mae_train = metrics.mean_absolute_error(y_train, yhat_train)
      mae_test  = metrics.mean_absolute_error(y_test, yhat_test)

      r2_train = metrics.r2_score(y_train, yhat_train)
      r2_test  = metrics.r2_score(y_test, yhat_test)

      result_rows.append([df.name,weight_method, n, rmse_train, mae_train, r2_train, rmse_test, mae_test, r2_test ])

      if n in (5, 11, 15):
        #plot predicted versus measured values for the test and the training split
        fig = go.Figure()
        trace1 = go.Scatter(x=y_test, y=yhat_test, name="Test Data", mode='markers')
        trace2 = go.Scatter(x=y_train, y=yhat_train, name="Train",  mode='markers')
        ref = go.Scatter(x=[diag_min,diag_max], y=[diag_min,diag_max], name="Reference Line", mode='lines',showlegend=False)
        fig.add_trace(trace1)
        fig.add_trace(trace2)
        fig.add_trace(ref)
        fig['layout']['xaxis']['title']="Measured"
        fig['layout']['yaxis']['title']="Predicted"
        fig.update_layout(title="Neighbors: "+str(n)+" Weight Method: "+weight_method+" Scaling: "+df.name,height=300, width=400,margin=dict(l=0, r=0, t=100, b=0))
        fig.show()

# build the results table in one go, then fix the column dtypes
df_results = pd.DataFrame(result_rows, columns=result_columns)
for c in ['Scaling', 'Weight']:
  df_results[c] = df_results[c].astype(pd.CategoricalDtype(categories=df_results[c].unique()))
df_results["Neighbors"] = pd.to_numeric(df_results["Neighbors"], downcast='integer')
        
display(df_results)

In [None]:
#compare uniform against distance weighting for the normalized and the standardized dataset
def getfig(metric, min_val, max_val):
  """Build a scatter of `metric` under uniform weighting (x) versus distance weighting (y).

  Reads the global df_results and draws one marker trace for the normalized rows and one
  for the standardized rows, followed by a diagonal from 0 to max_val that is hidden from
  the legend. Both axes are limited to [min_val, max_val]. The figure is returned, not shown.
  """
  fig = go.Figure()
  for scaling, legend_label in (('normalized', "Normalized Data"), ('standardized', "Standardized Data")):
    subset = df_results[df_results["Scaling"] == scaling]
    fig.add_trace(go.Scatter(
        x=subset[subset["Weight"] == 'uniform'][metric],
        y=subset[subset["Weight"] == 'distance'][metric],
        name=legend_label,
        mode='markers'))
  fig.add_trace(go.Scatter(x=[0,max_val], y=[0,max_val], name="Reference Line", mode='lines',showlegend=False))
  fig.update_layout(
      title="Weighting Comparison: "+metric,
      xaxis_title="Uniform Weighting",
      yaxis_title="Distance Weighting",
      xaxis_range=[min_val,max_val],
      yaxis_range=[min_val,max_val],
      height=300, width=400,
      margin=dict(l=0, r=0, t=100, b=0))
  return fig

# one figure per test metric, each with its own axis window
for metric_name, axis_min, axis_max in [('Test_RMSE',0,800), ('Test_MAE',0,800), ('Test_R2',0.95,1)]:
  fig = getfig(metric_name, axis_min, axis_max)
  fig.show()


In [None]:
def getfig(metric, min_val, max_val):
  """Build a scatter of `metric` under uniform weighting (x) versus distance weighting (y)
  for the original (unscaled) dataset.

  Reads the global df_results, adds a diagonal from 0 to max_val that is hidden from the
  legend, and limits both axes to [min_val, max_val]. The figure is returned, not shown.
  """
  fig = go.Figure()
  df_temp = df_results[ df_results["Scaling"] == 'original']
  x_data = df_temp[df_temp["Weight"] == 'uniform'][metric]
  y_data = df_temp[df_temp["Weight"] == 'distance'][metric]
  # the trace shows the original dataset; it was previously mislabelled "Normalized Data"
  trace1 = go.Scatter(x=x_data, y=y_data, name="Original Data", mode='markers')
  ref = go.Scatter(x=[0,max_val], y=[0,max_val], name="Reference Line", mode='lines',showlegend=False)
  fig.add_trace(trace1)
  fig.add_trace(ref)
  fig.update_layout(yaxis_range=[min_val,max_val])
  fig.update_layout(xaxis_range=[min_val,max_val])
  fig['layout']['xaxis']['title']="Uniform Weighting"
  fig['layout']['yaxis']['title']="Distance Weighting"
  fig.update_layout(title="Weighting Comparison: "+metric,height=300, width=400,margin=dict(l=0, r=0, t=100, b=0))
  return fig

# one figure per test metric, each with its own axis window
fig = getfig('Test_RMSE',1400,1800)
fig.show()
fig = getfig('Test_MAE',1000,1400)
fig.show()
fig = getfig('Test_R2',0,1)
fig.show()

In [None]:
# metric compared by getfig below; the function reads this global when it is called
metric = 'Test_R2'
def getfig(x_scaling, y_scaling, min_val, max_val):
  """Build a scatter comparing the global `metric` between two dataset scalings.

  Values for x_scaling go on the x axis and values for y_scaling on the y axis, with one
  marker trace per weighting scheme taken from the global df_results, followed by a 0-1
  diagonal that is hidden from the legend. Both axes are limited to [min_val, max_val].
  The figure is returned, not shown.
  """
  fig = go.Figure()
  for weight_name, legend_label in (('uniform', "Uniform Weighting"), ('distance', "Distance Weighting")):
    subset = df_results[df_results["Weight"] == weight_name]
    fig.add_trace(go.Scatter(
        x=subset[subset["Scaling"] == x_scaling][metric],
        y=subset[subset["Scaling"] == y_scaling][metric],
        name=legend_label,
        mode='markers'))
  fig.add_trace(go.Scatter(x=[0,1.0], y=[0,1.0], name="Reference Line", mode='lines',showlegend=False))
  fig.update_layout(
      title="Scaling Comparison: "+metric,
      xaxis_title="Dataset: "+x_scaling,
      yaxis_title="Dataset: "+y_scaling,
      xaxis_range=[min_val,max_val],
      yaxis_range=[min_val,max_val],
      height=300, width=375,
      margin=dict(l=0, r=0, t=100, b=0))
  return fig

# one figure per pair of scalings, each with its own axis window
for scaling_x, scaling_y, axis_min, axis_max in [
    ('normalized','standardized',0.95,1.0),
    ('normalized','original',0.75,1.0),
    ('standardized','original',0.75,1.0)]:
  fig = getfig(scaling_x, scaling_y, axis_min, axis_max)
  fig.show()

In [None]:
# Train vs. test RMSE against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_RMSE"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +", Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="RMSE",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()


In [None]:
# Train vs. test MAE against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_MAE"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +", Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="MAE",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()

In [None]:
# Train vs. test R2 against the number of neighbors, one figure per
# (weighting scheme, dataset scaling) combination.
for weight_method in ["uniform", "distance"]:
  for scaling_type in ["original", "normalized", "standardized"]:
    # rows belonging to this weighting/scaling pair
    by_weight = df_results[df_results["Weight"] == weight_method]
    rows = by_weight[by_weight["Scaling"] == scaling_type]

    fig = go.Figure()
    # train curve in red first, then test curve in blue
    for split_name, colour in (("Train", 'Red'), ("Test", 'Blue')):
      fig.add_trace(go.Scatter(
          x=rows["Neighbors"],
          y=rows[split_name + "_R2"],
          name=split_name,
          mode='lines',
          line=dict(color=colour, width=1)))
    fig.update_layout(
          title="Dataset: "+scaling_type +", Weighting: "+weight_method,
          xaxis_title="Number of Neighbors",
          yaxis_title="R2",
          height=300, width=500,margin=dict(l=0, r=0, t=100, b=0))
    fig.show()

1. Vary the n_neighbors parameter:
  * Create a kNN QuAM querying 5 neighbors.
  * Create a kNN QuAM querying 11 neighbors.
  * Create a kNN QuAM querying 15 neighbors.
2. Vary the weights parameter:
  * Create a kNN QuAM with weights parameter set to “uniform”.
  * Create a kNN QuAM with weights parameter set to “distance”.

### kNN Classification Evaluation:


We’ve built several QuAMs, but which is best? To determine the performance of our QuAMs we need to evaluate them using our accuracy classification score.

In multilabel classification, our accuracy score computes the subset accuracy: the set of labels
predicted for a sample must exactly match the corresponding set of labels in y_true. By default (`normalize=True`) the accuracy score returns the fraction of correctly classified samples (float);
with `normalize=False` it returns the number of correctly classified samples (int).
3. Plot the decision boundaries for each class for the different n_neighbors and weights. Note: For help plotting the classes’ decision boundaries see the following documentation, https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html

4. Use the sklearn.metrics.accuracy_score method to compare the accuracy of the several values of k, k=5, 11, and 15, and compare your original, your normalized, and your standardized datasets.
Accuracy can be computed by comparing the test set values and the predicted values, e.g.:

```python
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
```

5. Test the performance of each kNN using other evaluation metrics (precision, recall and F1-
measure) and see if there is any difference if you chose different metrics.

6. Imagine that you have another classifier that always predicts the most common label, do a
baseline comparison for your QuAMs.

7. Generate a simple plot of the test and training learning curve for each kNN and check if you
have a low/high bias or low/high variance model. We can use the function learning_curve to
generate the values that are required to plot such a learning curve (number of samples that
have been used, the average scores on the training sets and the average scores on the
validation sets):

```python
from sklearn.model_selection import learning_curve
```

In [None]:
# Sanity check: print the scaling label attached to each prepared dataframe.
for df_c in (df_standardized, df_normalized, df_original):
    print(f"      Scaling: {df_c.name}")

      Scaling: standardized
      Scaling: normalized
      Scaling: original


In [None]:
# Grid-search the kNN regressor's n_neighbors with 5-fold cross validation for
# every combination of scoring metric, neighbour weighting and dataset scaling
# (standardized, normalized and original).
from sklearn.model_selection import GridSearchCV

for scorer in ('neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2'):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized, df_original):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and predictors handed to the cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["NUMERIC_TIME", "AMBIENT_TEMPERATURE", "IRRADIATION"]]
      # candidate neighbour counts: 1 through 24
      param_grid = {'n_neighbors': np.arange(1, 25)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")


Score Method: neg_root_mean_squared_error
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 8}
         Best Score Vale: -417.9320989145027
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: -424.284722920471
      Scaling: original
         Best N_neighbors: {'n_neighbors': 2}
         Best Score Vale: -1579.4175676767359
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 8}
         Best Score Vale: -409.5307570128976
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: -415.2736052122822
      Scaling: original
         Best N_neighbors: {'n_neighbors': 4}
         Best Score Vale: -1491.6072329529015
Score Method: neg_mean_absolute_error
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 8}
         Best Score Vale: -291.9709135723669
      Scaling: normalized
         Best 

In [None]:
# Learning curves for a 2-neighbour kNN regressor on the standardized dataset:
# mean training and 5-fold validation score versus training-set size, with a
# +/- one standard deviation band, drawn once per scoring metric.
from sklearn.model_selection import learning_curve

df = df_standardized.copy()
df.name = df_standardized.name
# Split once, before the metric loop, and with a fixed seed: every metric is
# then scored on the same training rows and a re-run reproduces the figures.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
X_train = df_train[[feat_1, feat_2, feat_3]]
X_test = df_test[[feat_1, feat_2, feat_3]]  # held out; not used by the curves below

# DC_POWER_PER_INVERTER is the regression target (y)
y_train = df_train['DC_POWER_PER_INVERTER']
y_test = df_test['DC_POWER_PER_INVERTER']  # held out; not used by the curves below

for scorer in ['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2']:
  data_sizes, training_scores, validation_scores = learning_curve(
      KNeighborsRegressor(n_neighbors=2), X_train, y_train,
      cv=5, scoring=scorer, train_sizes=np.linspace(0.1, 1.0, 500))
  # scores hold one column per CV fold; reduce across folds (axis=1)
  training_mean = training_scores.mean(axis=1)
  training_standard_deviation = training_scores.std(axis=1)
  validation_mean = validation_scores.mean(axis=1)
  validation_standard_deviation = validation_scores.std(axis=1)

  fig = go.Figure()
  # Per curve: the mean line, then an invisible lower bound, then an invisible
  # upper bound filled back to the previous trace ('tonexty') to shade the band.
  for curve_name, curve_colour, band_colour, curve_mean, curve_std in [
      ('Training', 'red', 'rgba(255, 0, 0, 0.3)', training_mean, training_standard_deviation),
      ('Validation', 'blue', 'rgba(0, 0, 255, 0.3)', validation_mean, validation_standard_deviation),
  ]:
    fig.add_trace(go.Scatter(x=data_sizes, y=curve_mean, mode='lines', name=curve_name, line=dict(color=curve_colour)))
    fig.add_trace(go.Scatter(x=data_sizes, y=curve_mean - curve_std, mode='lines', name=curve_name+' lower bound', line=dict(width=0, color=curve_colour), showlegend=False))
    fig.add_trace(go.Scatter(x=data_sizes, y=curve_mean + curve_std, mode='lines', name=curve_name+' upper bound', line=dict(width=0, color=curve_colour), fill='tonexty', fillcolor=band_colour, showlegend=False))

  fig.update_layout(title="Learning curve for "+df.name+" dateset", xaxis_title='Dataset size', yaxis_title=scorer)
  fig.show()


In [None]:
# Learning curves for a distance-weighted 50-neighbour kNN regressor on the
# standardized dataset: mean training and 5-fold validation score versus
# training-set size, with a +/- one standard deviation band, per scoring metric.
from sklearn.model_selection import learning_curve

df = df_standardized.copy()
df.name = df_standardized.name
# Split once, before the metric loop, and with a fixed seed: every metric is
# then scored on the same training rows and a re-run reproduces the figures.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
X_train = df_train[[feat_1, feat_2, feat_3]]
X_test = df_test[[feat_1, feat_2, feat_3]]  # held out; not used by the curves below

# DC_POWER_PER_INVERTER is the regression target (y)
y_train = df_train['DC_POWER_PER_INVERTER']
y_test = df_test['DC_POWER_PER_INVERTER']  # held out; not used by the curves below

for scorer in ['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2']:
  data_sizes, training_scores, validation_scores = learning_curve(
      KNeighborsRegressor(n_neighbors=50, weights='distance'), X_train, y_train,
      cv=5, scoring=scorer, train_sizes=np.linspace(0.1, 1.0, 500))
  # scores hold one column per CV fold; reduce across folds (axis=1)
  training_mean = training_scores.mean(axis=1)
  training_standard_deviation = training_scores.std(axis=1)
  validation_mean = validation_scores.mean(axis=1)
  validation_standard_deviation = validation_scores.std(axis=1)

  # Peek at the first few mean training scores instead of dumping the whole array.
  # NOTE(review): with weights='distance' each training row is presumably matched
  # to itself at distance 0, which would explain near-perfect training scores.
  display(training_mean[:5])

  fig = go.Figure()
  # Per curve: the mean line, then an invisible lower bound, then an invisible
  # upper bound filled back to the previous trace ('tonexty') to shade the band.
  for curve_name, curve_colour, band_colour, curve_mean, curve_std in [
      ('Training', 'red', 'rgba(255, 0, 0, 0.3)', training_mean, training_standard_deviation),
      ('Validation', 'blue', 'rgba(0, 0, 255, 0.3)', validation_mean, validation_standard_deviation),
  ]:
    fig.add_trace(go.Scatter(x=data_sizes, y=curve_mean, mode='lines', name=curve_name, line=dict(color=curve_colour)))
    fig.add_trace(go.Scatter(x=data_sizes, y=curve_mean - curve_std, mode='lines', name=curve_name+' lower bound', line=dict(width=0, color=curve_colour), showlegend=False))
    fig.add_trace(go.Scatter(x=data_sizes, y=curve_mean + curve_std, mode='lines', name=curve_name+' upper bound', line=dict(width=0, color=curve_colour), fill='tonexty', fillcolor=band_colour, showlegend=False))

  fig.update_layout(title="Learning curve for "+df.name+" dateset", xaxis_title='Dataset size', yaxis_title=scorer)
  fig.show()
  


array([-0.00109157, -0.00107795,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

array([-0.00026764, -0.0002621 ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [None]:
# Repeat the n_neighbors grid search with a wider range (1 through 249), scored
# by R^2 only, on the standardized and normalized datasets with all three features.
from sklearn.model_selection import GridSearchCV

for scorer in ('r2',):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and predictors handed to the 5-fold cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["NUMERIC_TIME", "AMBIENT_TEMPERATURE", "IRRADIATION"]]
      # candidate neighbour counts: 1 through 249
      param_grid = {'n_neighbors': np.arange(1, 250)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")

Score Method: r2
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 8}
         Best Score Vale: 0.9880099944368966
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: 0.9876044986302357
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 8}
         Best Score Vale: 0.9884856349591196
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 9}
         Best Score Vale: 0.9881232280356655


In [None]:
# Feature ablation: grid-search n_neighbors (1 through 249) using only
# NUMERIC_TIME and IRRADIATION, scored by RMSE and R^2, on the standardized
# and normalized datasets.
from sklearn.model_selection import GridSearchCV

for scorer in ('neg_root_mean_squared_error', 'r2'):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and predictors handed to the 5-fold cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["NUMERIC_TIME", "IRRADIATION"]]
      # candidate neighbour counts: 1 through 249
      param_grid = {'n_neighbors': np.arange(1, 250)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")

Score Method: neg_root_mean_squared_error
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 12}
         Best Score Vale: -355.4193985949868
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 15}
         Best Score Vale: -364.97030588243365
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 28}
         Best Score Vale: -357.263278187845
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 21}
         Best Score Vale: -368.2881532482276
Score Method: r2
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 14}
         Best Score Vale: 0.9913401357690674
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 15}
         Best Score Vale: 0.9908441229695774
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 28}
         Best Score Vale: 0.99124449783752
      Scaling: normalized
     

In [None]:
# Feature ablation: grid-search n_neighbors (1 through 249) using IRRADIATION
# alone, scored by R^2, on the standardized and normalized datasets.
from sklearn.model_selection import GridSearchCV

for scorer in ('r2',):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and the single predictor handed to the 5-fold cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["IRRADIATION"]]
      # candidate neighbour counts: 1 through 249
      param_grid = {'n_neighbors': np.arange(1, 250)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")

Score Method: r2
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 31}
         Best Score Vale: 0.9913111359397913
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 31}
         Best Score Vale: 0.9913111359397913
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 89}
         Best Score Vale: 0.9897990620028434
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 89}
         Best Score Vale: 0.9897990620028434


In [None]:
# Feature ablation: grid-search n_neighbors (1 through 249) using NUMERIC_TIME
# alone, scored by R^2, on the standardized and normalized datasets.
from sklearn.model_selection import GridSearchCV

for scorer in ('r2',):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and the single predictor handed to the 5-fold cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["NUMERIC_TIME"]]
      # candidate neighbour counts: 1 through 249
      param_grid = {'n_neighbors': np.arange(1, 250)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")

Score Method: r2
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 71}
         Best Score Vale: 0.7760844692005819
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 73}
         Best Score Vale: 0.7761046570513109
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 25}
         Best Score Vale: 0.7733343175775802
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 25}
         Best Score Vale: 0.7733343175775802


In [None]:
# Feature ablation: grid-search n_neighbors (1 through 249) using NUMERIC_TIME
# and AMBIENT_TEMPERATURE, scored by R^2, on the standardized and normalized
# datasets.
from sklearn.model_selection import GridSearchCV

for scorer in ('r2',):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and predictors handed to the 5-fold cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["NUMERIC_TIME", "AMBIENT_TEMPERATURE"]]
      # candidate neighbour counts: 1 through 249
      param_grid = {'n_neighbors': np.arange(1, 250)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")

Score Method: r2
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 30}
         Best Score Vale: 0.8139516821751874
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 34}
         Best Score Vale: 0.8187066981186402
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 50}
         Best Score Vale: 0.8050850075642509
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 55}
         Best Score Vale: 0.807311160210007


In [None]:
# Feature ablation: grid-search n_neighbors (1 through 249) using
# AMBIENT_TEMPERATURE alone, scored by R^2, on the standardized and normalized
# datasets.
# NOTE(review): the recorded run picks n_neighbors=249 for weights='distance',
# i.e. the upper edge of the grid, so the true optimum may lie beyond this range.
from sklearn.model_selection import GridSearchCV

for scorer in ('r2',):
  print(f"Score Method: {scorer}")
  for weight in ('uniform', 'distance'):
    print(f"   Weight: {weight}")
    for df_c in (df_standardized, df_normalized):
      print(f"      Scaling: {df_c.name}")

      # work on a copy that carries the same scaling label as its source
      df = df_c.copy()
      df.name = df_c.name
      # target and the single predictor handed to the 5-fold cross-validated search
      y = df["DC_POWER_PER_INVERTER"]
      X = df[["AMBIENT_TEMPERATURE"]]
      # candidate neighbour counts: 1 through 249
      param_grid = {'n_neighbors': np.arange(1, 250)}
      # base estimator; the search supplies n_neighbors
      knn2 = KNeighborsRegressor(weights=weight)
      # build the search and fit it in one step (fit returns the search object)
      knn_gscv = GridSearchCV(knn2, param_grid, cv=5, scoring=scorer).fit(X, y)
      # report the winning n_neighbors and its mean cross-validated score
      print(f"         Best N_neighbors: {knn_gscv.best_params_!s}")
      print(f"         Best Score Vale: {knn_gscv.best_score_!s}")

Score Method: r2
   Weight: uniform
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 124}
         Best Score Vale: 0.3236006874264138
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 124}
         Best Score Vale: 0.3236006874264138
   Weight: distance
      Scaling: standardized
         Best N_neighbors: {'n_neighbors': 249}
         Best Score Vale: 0.2292173566711532
      Scaling: normalized
         Best N_neighbors: {'n_neighbors': 249}
         Best Score Vale: 0.22921735667113166
