<a href="https://colab.research.google.com/github/kpjaskie/SenSIP-IRES2020/blob/master/SolarData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load Solar Data

In this section, we will load the solar data into Colab and put it into two arrays - one, named X, will contain all of the known data with 10 features, and the other, y, will contain the class numbers for each datapoint in X

In [0]:
#%matplotlib inline
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import numpy as np
import pandas as pd

In [25]:
from google.colab import files
uploaded = files.upload()

Saving asu_solar_dataset.csv to asu_solar_dataset (1).csv


In [0]:
data_path = '/content/asu_solar_dataset.csv'
df = pd.read_csv(data_path, delimiter=',')  #here we're reading our csv file into a Pandas DataFrame

In [27]:
# This is what our dataframe looks like to begin with
df.loc[0:2, :]  # We're only looking at the first three rows

Unnamed: 0,DCArrayOutput_W_,Vmp,Imp,CellTemperature_C_,PlaneOfArrayIrradiance_W_m_2_,Fill Factor,Gamma,Pmp,Voc,Isc,Degraded,Shaded,Soiled,SC,STC
0,4715.409,36.331723,1.369981,16.107,281.111,4.669695,0.177061,49.773762,44.338856,5.24209,0,0,0,0,1
1,9076.926,37.436438,2.559325,26.713,527.216,2.391698,0.181732,95.811997,44.023777,5.205219,0,0,0,0,1
2,11978.364,35.924148,3.51959,36.542,712.905,1.913899,0.177356,126.438287,45.239121,5.349134,0,0,0,0,1


In [28]:
# We're going to start by separating the data from the labels.  Here, 
# we delete the labels to isolate the data into a new dataframe

df_X = df.drop(columns=['Degraded', 'Shaded', 'Soiled', 'SC', 'STC'])
df_X #.loc[0:2,:] #Look at the dataframe to verify the 

Unnamed: 0,DCArrayOutput_W_,Vmp,Imp,CellTemperature_C_,PlaneOfArrayIrradiance_W_m_2_,Fill Factor,Gamma,Pmp,Voc,Isc
0,4715.409,36.331723,1.369981,16.107,281.11100,4.669695,0.177061,49.773762,44.338856,5.242090
1,9076.926,37.436438,2.559325,26.713,527.21600,2.391698,0.181732,95.811997,44.023777,5.205219
2,11978.364,35.924148,3.519590,36.542,712.90500,1.913899,0.177356,126.438287,45.239121,5.349134
3,13606.691,35.296861,4.069092,40.619,820.48600,1.637532,0.175050,143.626183,44.804407,5.249316
4,13953.936,34.555717,4.262436,45.436,861.07700,1.629109,0.171055,147.291547,44.492927,5.393082
...,...,...,...,...,...,...,...,...,...,...
21480,9918.107,32.878335,3.184198,56.338,648.85575,2.231807,0.161347,104.691129,44.903990,5.203332
21481,9498.023,32.857103,3.051301,56.476,622.77825,2.355273,0.160983,100.256909,44.016109,5.364680
21482,8470.261,34.315388,2.605487,46.998,531.26175,2.607679,0.168294,89.408311,44.419803,5.248744
21483,6168.314,34.718963,1.875344,44.375,389.89800,3.547519,0.166992,65.109981,44.582852,5.180891


In [29]:
# Here, we drop the data to obtain the labels.  We put them in their own dataframe.

df_y = df.drop(columns=['DCArrayOutput_W_', 'Vmp', 'Imp', 'CellTemperature_C_', 
                        'PlaneOfArrayIrradiance_W_m_2_', 'Fill Factor', 'Gamma',
                        'Pmp', 'Voc', 'Isc'])
df_y #.loc[:3,:] #Labels in a one-hot encoding

Unnamed: 0,Degraded,Shaded,Soiled,SC,STC
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
21480,0,1,0,0,0
21481,0,1,0,0,0
21482,0,1,0,0,0
21483,0,1,0,0,0


In [30]:
#Now we convert the data into a numpy array - many of the algorithms we're 
#interested in will only take numpy arrays, not dataframes
X = df_X.to_numpy()
X[:3,:] #check to make sure it converted it properly

array([[4.71540900e+03, 3.63317230e+01, 1.36998076e+00, 1.61070000e+01,
        2.81111000e+02, 4.66969485e+00, 1.77060882e-01, 4.97737617e+01,
        4.43388563e+01, 5.24209008e+00],
       [9.07692600e+03, 3.74364378e+01, 2.55932461e+00, 2.67130000e+01,
        5.27216000e+02, 2.39169826e+00, 1.81731959e-01, 9.58119967e+01,
        4.40237770e+01, 5.20521867e+00],
       [1.19783640e+04, 3.59241479e+01, 3.51959042e+00, 3.65420000e+01,
        7.12905000e+02, 1.91389893e+00, 1.77356431e-01, 1.26438287e+02,
        4.52391213e+01, 5.34913355e+00]])

In [31]:
# Data labels are originally stored in a one-hot encoding, meaning that each 
# column represents a fault type and contains either a zero or a 1
y_onehot = df_y.to_numpy()
y_onehot[:3,:]

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1]])

In [32]:
# We will covert the one-hot encoding into integer class numbers.
y = np.argmax(y_onehot, axis=1)

# Here, 0 = degraded
#       1 = shaded
#       2 = soiled
#       3 = short circuit
#       4 = standard test conditions (no faults)

print(y)
print(y.shape)

[4 4 4 ... 1 1 1]
(21485,)


# Visualize the Data

In order to visualize the 10 dimensional data effectively, we're going to perform a dimensionality reduction algorithm called PCA (Principal Component Analysis).  This uses Singular Value Decomposition to identify the eigenvectors in the data, and removes all but the most important.

In [0]:
from sklearn.decomposition import PCA

# Transform the data into a dimensionality reduced dataset in 3 dimensions
pca_model3 = PCA(n_components=3)
X_red3 = pca_model3.fit_transform(X)  #X_red is short for X_reduced3

In [34]:
X_red3[1:3,:]

array([[ 1.80730409e+03, -1.28337250e+02, -5.74421433e+00],
       [ 4.71475626e+03, -1.50077960e+02,  4.17291284e+00]])

In [35]:
# Here, we can plot the reduced data in three dimensions

import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter3d(x=X_red3[:,0], y=X_red3[:,1], z=X_red3[:,2], 
                                   mode='markers', 
                                   marker=dict(
                                       size=5,
                                       color=y,                
                                       opacity=0.8))])

fig.show()

#Notice that 

Hmmm.... It looks like the third dimension doesn't really add that much.  Let's see what happens if we plot it in only two dimensions

In [36]:
# Transform the data into a dimensionality reduced dataset in 2 dimensions
pca_model2 = PCA(n_components=2)
X_red2 = pca_model2.fit_transform(X)  #X_red2 is short for X_reduced2

plt.scatter(X_red2[:, 0], X_red2[:, 1], s=50, c=y)


#Legend:
#   purple = degraded
#   blue = shaded
#   cyan = soiled
#   green = short circuit
#   yellow = STC (no faults or problems)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f6a3bc2d358>

#Classification

First, let's identify which classes we will be comparing.

Let's start with classifying soiled vs STC (no problem data)

In [64]:
X_soiled = X[y == 2]
X_STC = X[y == 4]

y_soiled = np.zeros((X_soiled.shape[0], ))
y_STC = np.ones((X_STC.shape[0], ))

print('X_soiled.shape: ', X_soiled.shape)
print('X_STC.shape: ', X_STC.shape)

X_bin = np.concatenate((X_soiled, X_STC), axis=0)
y_bin = np.concatenate((y_soiled, y_STC), axis=0)

print("\nX_bin.shape: ", X_bin.shape)
print("y_bin.shape: ", y_bin.shape)

X_soiled.shape:  (4297, 10)
X_STC.shape:  (4297, 10)

X_bin.shape:  (8594, 10)
y_bin.shape:  (8594,)


Break the data into training, validation, and test sets

In [65]:
from sklearn.model_selection import train_test_split

x_train, x_test_and_val, y_train, y_test_and_val = train_test_split(X_bin, y_bin, test_size=0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test_and_val, y_test_and_val, test_size=0.5)

print("train_size = ", x_train.shape[0])
print("val_size = ", x_val.shape[0])
print("test_size = ",  x_test.shape[0])

train_size =  6015
val_size =  1290
test_size =  1289


In [66]:
from sklearn.linear_model import LogisticRegression
from scipy import stats
from sklearn.metrics import accuracy_score

#Train and test on validation set
LR_model = LogisticRegression(random_state=0)

sim_model = LR_model.fit(x_train, y_train)
y_hat = sim_model.predict(x_val)

accuracy = accuracy_score(y_val, y_hat)
print('Validation accuracy = ', accuracy)

Validation accuracy =  0.9744186046511628



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [67]:

#Test accuracy
x_train_val = np.concatenate((x_train, x_val), axis=0)
y_train_val = np.concatenate((y_train, y_val), axis=0)
sim_model = LR_model.fit(x_train_val, y_train_val)

y_hat_test = sim_model.predict(x_test)
accuracy = accuracy_score(y_test, y_hat_test)
print('Test accuracy = ', accuracy)



Test accuracy =  0.9728471683475562


Multi-class classification

In [59]:
from sklearn.model_selection import train_test_split

x_train, x_test_and_val, y_train, y_test_and_val = train_test_split(X, y, test_size=0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test_and_val, y_test_and_val, test_size=0.5)

print("train_size = ", x_train.shape[0])
print("val_size = ", x_val.shape[0])
print("test_size = ",  x_test.shape[0])

train_size =  15039
val_size =  3223
test_size =  3223
