## Preamble

### Import libraries

In [1]:
import io, os, sys, types, subprocess, collections

# Import Pandas
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Import numpy
import numpy as np

# Import SciKit decision tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Import Scikit cross-validation function
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

# Import naive bayes
from sklearn.naive_bayes import BernoulliNB

## Step 1:

### Load the CSV of solar-cell photovoltaic performance data into a pandas DataFrame

In [2]:
# Import module to read in secure data
# The loader module lives outside the notebook directory, so its folder is
# appended to the module search path first (relative to the working directory).
sys.path.append('../data/NREL')
import retrieve_data as rd

In [3]:
# Load the data set through the project helper; later cells read from `solar`.
# NOTE(review): presumably a pandas DataFrame (later cells call .iterrows()/.loc on it).
solar = rd.retrieve_dirks_sheet()

## Step 2:

### Clean the data for inconsistencies

In [4]:
# Make the local 'utils' directory importable, then load the data-processing helpers
sys.path.append('utils')
import process_data as prd

In [5]:
# Run the project's cleaning routine on `solar`. The return value is not kept,
# so this presumably modifies the frame in place — TODO confirm in process_data.
prd.clean_data(solar)

In [10]:
# Sanity check: for every degradation mode, the indicator column 'Cause <mode>'
# should agree with membership of the mode in the 'Cause (Cleaned)' list.
# Prints each mode name, followed by the index of every row where they disagree.
#
# NOTE(review): `mode_dictionary` is not defined in any cell shown here; it must
# exist in the kernel (presumably from a removed or out-of-order cell) before
# this cell can run.

# MutableSequence moved to collections.abc (the collections alias was removed
# in Python 3.10); fall back to the old location on Python 2.
try:
    from collections.abc import MutableSequence
except ImportError:
    from collections import MutableSequence

for mode in mode_dictionary.keys():
    print(mode)
    for index, row in solar.iterrows():
        cleaned = row['Cause (Cleaned)']
        # Entries that are not lists are treated as "mode absent".
        # The former nested isinstance(mode in ..., MutableSequence) test was
        # applied to a bool, so it was always False and its any(...) branch was
        # unreachable; a plain membership test is what actually ran.
        in_cleaned_list = isinstance(cleaned, MutableSequence) and mode in cleaned
        flag_set = bool(row['Cause ' + mode])
        if in_cleaned_list != flag_set:
            print(index)

Minor delamination
Encapsulant discoloration
Glass breakage
Internal circuitry failure
Backsheet other
Major delamination
Permanent soiling
Frame deformation
Fractured cells
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
Hot spots
10883
Backsheet insulation compromise
Diode/J-box problem
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
494

In [15]:
# Raw 'Cause' text of the row labelled 5214, fetched with a single .loc lookup
solar.loc[5214, 'Cause']

u'29 interconnect;9 delam;5 burns;4 solder;4 discolor;2 hotspots 1diode'

### Decision trees - Setup data and build models

### Naive Bayes - Setup data and build models

In [10]:
# Climate categories selected for model fitting
climates = [
    'Moderate',
    'Desert',
    'Hot & Humid',
    'Snow',
]

# Mounting categories selected for model fitting
mountings = [
    'rack',
    '1-axis tracker',
    'roof rack',
    'roof',
]

# Degradation modes used as prediction targets (one model is fitted per entry)
causes = [
    'Hot spots',
    'Encapsulant discoloration',
    'Major delamination',
    'Internal circuitry discoloration',
    'Fractured cells',
    'Glass breakage',
    'Permanent soiling',
    'Diode/J-box problem',
]

In [15]:
# Create binary dummy data for each of the categorical variables.
# The raw 'Mounting' / 'Climate3' columns are kept next to their dummies.
naive_df = pd.DataFrame(solar.loc[:, 'Mounting'])
naive_df = naive_df.join(pd.get_dummies(solar['Mounting']))
naive_df = naive_df.join(solar.loc[:, 'Climate3'])
naive_df = naive_df.join(pd.get_dummies(solar['Climate3']))

# Installation year: fill missing values with the column mean.
# The filled Series is assigned back explicitly; calling fillna(inplace=True)
# on a selected column is chained assignment and may not update the frame on
# newer pandas versions.
naive_df = naive_df.join(solar.loc[:, 'Begin.Year'])
naive_df['Begin.Year'] = naive_df['Begin.Year'].fillna(naive_df['Begin.Year'].mean())

# Bin the installation year into a binary flag.
# pd.cut uses right-closed intervals by default, so the bins are
# (0, 2000] -> 0 and (2000, 9999] -> 1: the year 2000 itself maps to 0.
bins = [0, 2000, 9999]
group_names = [0, 1]
naive_df['After 2000'] = pd.cut(naive_df['Begin.Year'], bins, labels=group_names)

# Add the cleaned Cause column for visual reference
naive_df = naive_df.join(solar.loc[:, 'Cause (Cleaned)'])

# Add the dummy variable columns of the degradation modes in one concat
# (same resulting columns and order as appending them one at a time)
naive_df = pd.concat([naive_df, solar[causes]], axis=1)

naive_df.tail()

Unnamed: 0,Mounting,1-axis tracker,2-axis tracker,façade/BIPV,rack,roof,roof rack,single-axis,Climate3,Desert,...,After 2000,Cause (Cleaned),Hot spots,Encapsulant discoloration,Major delamination,Internal circuitry discoloration,Fractured cells,Glass breakage,Permanent soiling,Diode/J-box problem
11041,single-axis,0,0,0,0,0,0,1,Snow,0,...,1,"[extrusion, disc, del, Glass breakage, Hot spots]",1,0,0,0,0,1,0,0
11042,single-axis,0,0,0,0,0,0,1,Snow,0,...,1,"[extrusion, disc, del, Glass breakage, Hot spots]",1,0,0,0,0,1,0,0
11043,single-axis,0,0,0,0,0,0,1,Snow,0,...,1,"[extrusion, disc, del, Glass breakage, Hot spots]",1,0,0,0,0,1,0,0
11044,single-axis,0,0,0,0,0,0,1,Snow,0,...,1,"[extrusion, disc, del, Glass breakage, Hot spots]",1,0,0,0,0,1,0,0
11045,single-axis,0,0,0,0,0,0,1,Snow,0,...,1,"[extrusion, disc, del, Glass breakage, Hot spots]",1,0,0,0,0,1,0,0


In [17]:
# Fitted Bernoulli Naive Bayes classifiers, keyed by degradation mode
nb_dict = {}

# Binary predictor columns: mounting dummies, climate dummies, install-era flag
feature_columns = [
    '1-axis tracker', '2-axis tracker', 'rack', 'roof', 'roof rack', 'single-axis',
    'Desert', 'Hot & Humid', 'Moderate', 'Snow',
    'After 2000',
]
X = naive_df.loc[:, feature_columns]

# Fit one classifier per degradation mode and print its score on the same
# rows it was fitted on (training accuracy, not a held-out estimate).
# `y` and `clf` keep their last values after the loop; later cells read them.
for c in causes:
    y = naive_df[c]
    clf = BernoulliNB().fit(X, y)
    nb_dict[c] = clf
    print('Cause: ' + c)
    print('Score: ' + str(clf.score(X, y)))

Cause: Hot spots
Score: 0.646659424226
Cause: Encapsulant discoloration
Score: 0.84990041644
Cause: Major delamination
Score: 0.769328263625
Cause: Internal circuitry discoloration
Score: 0.75484338222
Cause: Fractured cells
Score: 0.958627557487
Cause: Glass breakage
Score: 0.963154082926
Cause: Permanent soiling
Score: 0.983885569437
Cause: Diode/J-box problem
Score: 0.761271048343


In [41]:
# Score of the stored 'Hot spots' classifier on the feature matrix X
hot_spots_clf = nb_dict['Hot spots']
hot_spots_clf.score(X, naive_df['Hot spots'])

0.64674995473474561

In [42]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,1-axis tracker,2-axis tracker,rack,roof,roof rack,single-axis,Desert,Hot & Humid,Moderate,Snow,After 2000
0,0,0,1,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0


In [43]:
# `clf` and `y` are not defined in this cell: they hold whatever an earlier
# cell last assigned (in a top-to-bottom run, the final iteration of the loop
# over `causes`).
clf.score(X, y)
# For scikit-learn classifiers, score() returns the mean accuracy of
# clf.predict(X) with respect to y.
# TODO: look into how it is calculated

0.95419156255658155

In [44]:
# Posterior class probabilities for one hand-built sample:
# '1-axis tracker' = 1, 'Desert' = 1, 'After 2000' = 0, every other feature 0
# (values follow the column order used to build X for the Bernoulli models)
query_row = [[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]
clf.predict_proba(query_row)

array([[ 0.34092551,  0.65907449]])

In [45]:
# Same hand-built sample ('1-axis tracker' = 1, 'Desert' = 1, 'After 2000' = 0),
# evaluated with the stored 'Hot spots' classifier
query_row = [[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]
nb_dict['Hot spots'].predict_proba(query_row)

array([[ 0.12839453,  0.87160547]])

In [46]:
# Predicted class label for the sample
# '1-axis tracker' = 1, 'Desert' = 1, 'After 2000' = 0
query_row = [[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]
clf.predict(query_row)
# TODO: confusion matrix

array([1])

In [47]:
# Posterior class probabilities for one hand-built sample:
# '1-axis tracker' = 1, 'Desert' = 1, 'After 2000' = 1, every other feature 0
query_row = [[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]]
clf.predict_proba(query_row)

array([[ 0.97942813,  0.02057187]])

### Testing area below

In [48]:
# Experiment: Gaussian Naive Bayes on the same dummies, but with the raw
# (continuous) installation year instead of the binary 'After 2000' flag.
from sklearn.naive_bayes import GaussianNB

# NOTE: this rebinds X, y and clf, replacing the values left by the
# Bernoulli cells above; the cells that follow rely on the new bindings.
gaussian_columns = [
    '1-axis tracker', '2-axis tracker', 'rack', 'roof', 'roof rack', 'single-axis',
    'Desert', 'Hot & Humid', 'Moderate', 'Snow',
    'Begin.Year',
]
X = naive_df.loc[:, gaussian_columns]
y = naive_df['Hot spots']

clf = GaussianNB()
clf.fit(X, y)

GaussianNB()

In [49]:
# Per-row class probabilities from the current `clf` for every row of X
clf.predict_proba(X)

array([[  9.79212906e-01,   2.07870941e-02],
       [  9.79212906e-01,   2.07870941e-02],
       [  1.00000000e+00,   7.15535245e-23],
       ..., 
       [  0.00000000e+00,   1.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00]])

In [50]:
# Predicted class label from the current `clf` for every row of X
clf.predict(X)

array([0, 0, 0, ..., 1, 1, 1])