# Classifier Training

***
## Notes

 - http://www.deeplearningbook.org/contents/guidelines.html
 - Determine goals:
 - Establish end-to-end working pipeline:
 - Instrument  the system to determine bottlenecks in performance. Diagnose which components are performing worse, find over/underfitting, defect in data, etc.
 - Repeatedly make incremental changes (gathering new data, adjusting hyperparameters, changing algorithms) based on findings from instrumentation

## Download exported tables locally for use with TensorFlow

In [1]:
# Authenticate Google Drive access
# following guide: https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html
# Need client_secrets.json file for auth to work -- I put it in 'notebooks' folder, but how can we put it somewhere more appropriate and still have the auth find it?
from pydrive.auth import GoogleAuth

gauth = GoogleAuth()
gauth.LocalWebserverAuth()

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=207798239714-s97urn6kn6eqj2li0l2k8ud9fudlqqbf.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [3]:
from pydrive.drive import GoogleDrive
drive = GoogleDrive(gauth)

# list files in root folder
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()

# Get file id of export folder we're saving tables to
exportFolderID = [file['id'] for file in file_list if file['title']=='GEE_exports'][0] 

# lists files in export folder
file_list = drive.ListFile({'q': "'" + exportFolderID + "' in parents and trashed=false"}).GetList()
for file1 in file_list:
    print('title: %s, id: %s' % (file1['title'], file1['id']))
    
# Save all files locally
for i in range(len(file_list)):
    file_list[i].GetContentFile('../Data/' + file_list[i]['title'])

title: sample_points_test_2013to15.csv, id: 1isO3p6bkSivCO50jNVEaAxSLzxhknqCf
title: sample_points.csv, id: 1v-tsHqTdEMPVgasYd5SzPfxtBBSb3LgO


## Playing with TensorFlow

In [None]:
import numpy as np
import tensorflow as tf

## Random Forest

## Neural Network

## SVM

In [5]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import GridSearchCV

['sample_points.csv', 'sample_points_test_2013to15.csv']


34000

In [25]:
# CSV of sample points with features
import os
files = os.listdir("../Data")
print(files)
points = pd.DataFrame()
for i in range(len(files)):
    new_points = pd.read_csv("../Data/" + files[i])
    points = points.append(new_points, ignore_index=True)

df = points.copy()
dropCols = pd.concat([df.pop(x) for x in ['imageID', 'latitude','longitude']]) # Pop out imageID, lat, lon so we can check NaN and inf. 

# Check presence of infinite or NaN values
print(' Is NaN?:', np.any(np.isnan(df)), "\n",
     'Is inf?:', not np.any(np.isfinite(df)))

# Remove NaN
df = df.dropna()

# Recode landcover categories as dummy variables
# Probably want to remove impervious as well, or collapse some of the landcover types
nlcd_dict = {11:'open_water', # Dict of landcover values
             12:'perm_snowIce',
             21:'dev_openSpace',
             22:'dev_lowInt',
             23:'dev_medInt',
             24:'dev_highInt',
             31:'barren',
             41:'decid_forest',
             42:'evergreen_forest',
             43:'mixed_forest',
             51:'dwarf_scrub',
             52:'shrub',
             71:'grassland',
             72:'sedge',
             73:'lichens',
             74:'moss',
             81:'pasture',
             82:'crops',
             90:'woody_wetlands',
             95:'emergent_wetlands'}

df = df.replace({'landcover':nlcd_dict})

dummies = pd.get_dummies(df['landcover']).rename(columns=lambda x: str(x)) # Convert categorical feature to dummy vars
df = pd.concat([df, dummies], axis=1) # Concat with other features
df = df.drop(['landcover'], axis=1) # Drop categorical landcover feature
# df = df.reset_index()

# Rescale values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
# df_scaled = pd.concat([df_scaled, pd.DataFrame(imageID)]) # Add imageID back. Converting to NaN when I concat so leaving out for now

print(df_scaled)

# Separate features and target
Y = df.pop('flooded') # Target feature, drop from features
X = df # Features

# Split data into training and testing sets (split once, use it )
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

# Remove rows with NaN
X_train.dropna()
X_test.dropna()
y_train.dropna()
y_test.dropna()

print('Training data and target sizes: \n{}, {}'.format(X_train.shape,y_train.shape))
print('Test data and target sizes: \n{}, {}'.format(X_test.shape,y_test.shape))

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Grid of values for tuning hyperparameters
# C is the 

grid = [{'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
         'tol': [0.0001, 0.001, 0.01, 0.1, 1.0]}]

model = LinearSVC()

gridcv = GridSearchCV(estimator=model, param_grid=grid, return_train_score=True)
gridcv.fit(X_train, y_train)
print('Support Vector Machine Best Score: ' + str(gridcv.best_score_))
print('Best C: ' + str(gridcv.best_estimator_.C))
print('Best tol: ' + str(gridcv.best_estimator_.tol))

['sample_points.csv', 'sample_points_test_2013to15.csv']
 Is NaN?: True 
 Is inf?: False
         aspect     curve  elevation  flooded      hand  percent_tree_cover  \
0      0.109698  0.479518   0.165285      0.0  0.000000                0.00   
1      0.860799  0.453012   0.243296      0.0  0.005270                0.00   
2      0.777279  0.479518   0.464164      0.0  0.015810                0.00   
3      0.797801  0.450602   0.315456      0.0  0.152833                0.00   
4      0.412068  0.438554   0.303023      0.0  0.006588                0.00   
5      0.421486  0.462651   0.235739      0.0  0.027668                0.00   
6      0.751797  0.453012   0.209654      0.0  0.007905                0.00   
7      0.110680  0.443373   0.349586      0.0  0.009223                0.00   
8      0.860405  0.443373   0.223549      0.0  0.075099                0.00   
9      0.546616  0.481928   0.400536      0.0  0.188406                0.19   
10     0.402875  0.469880   0.174549      

Support Vector Machine Best Score: 0.8276844941956882
Best C: 0.001
Best tol: 1.0


In [26]:
clf = LinearSVC(C = gridcv.best_estimator_.C, 
                 tol=gridcv.best_estimator_.tol,
                 random_state=0)
clf.fit(X_train, y_train)

LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1.0, verbose=0)

In [27]:
# Performance metrics
from sklearn import metrics
y_pred = clf.predict(X_test)

print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(y_test, y_pred)))

Classification report for classifier LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1.0, verbose=0):
             precision    recall  f1-score   support

        0.0       0.81      0.95      0.88      5183
        1.0       0.93      0.73      0.82      4322

avg / total       0.86      0.85      0.85      9505




In [28]:
# Confusion matrix
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()

print_cm(metrics.confusion_matrix(y_test, y_pred)/len(y_pred)*100, labels=['flooded','notFlooded'])

# Top row = predicted, left col = true

                  flooded notFlooded 
       flooded       52.0        2.5 
    notFlooded       12.1       33.3 


In [None]:
def compute_rmse(predictions, yvalues):
    rmse = math.sqrt((sum((predictions - yvalues)**2))/len(yvalues))
    return rmse

In [None]:
# # Benchmark training time
# start_time1 = time.time()
# time1 = str(round(time.time() - start_time1, 2))
# print(time1 + " seconds")