In [1]:
# Scratch expression: evaluates to 2; nothing later in the notebook reads the result.
1+1

2

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs (presumably so edits to the local helper module are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

import functions_used.functions_used as func



In [4]:
# All three raw CSVs live in the same directory; build each path from it.
DATA_DIR = "~/ds/proj3/tanzania-water-wells/data/raw"

file_train = DATA_DIR + "/training-set-values.csv"      # x_train: training set feature values
file_target_train = DATA_DIR + "/training-labels.csv"   # y_train: training set labels
file_test = DATA_DIR + "/test-set.csv"                  # x_test: test set data

In [5]:
# Load the three raw CSVs into DataFrames.
features = pd.read_csv(file_train)         # training set feature values
targets = pd.read_csv(file_target_train)   # training set labels
# Kept under its own name: `X_test` is rebound by train_test_split further down,
# which would otherwise silently discard the frame read from the test-set file.
X_test_raw = pd.read_csv(file_test)

## Functions 

## Model 1: Decision Tree; Gini criterion

In [6]:
# Columns handed to func.model_preprocessing.
# Note: the target column is included in this list as well (presumably
# 'status_group' - confirm against the helper, which separates the target).

features_list = ['basin', 'region', 'scheme_management', 'scheme_name',
       'extraction_type', 'management', 'payment', 'water_quality', 'quantity',
       'source', 'waterpoint_type','gps_height', 'longitude', 'latitude', 
       'region_code', 'district_code', 'construction_year', 'status_group']   


In [7]:
# One encoder instance shared by the training and test preprocessing calls.
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros
# at transform time instead of raising.
ohe = OneHotEncoder(handle_unknown = 'ignore')

- Perform a train test split of the "training data" given in the problem. 
- Join the training data (X and y) together.


In [8]:
# Hold out part of the labelled data for evaluation (train_test_split's default
# split size is used); random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, targets, random_state=42)
# Index-aligned join of features and labels; the suffixes disambiguate any
# column name that exists in both frames.
joined_train = X_train.join(y_train, lsuffix='_l', rsuffix='_r')

- Perform the pre-processing. Clean the numerical data. Perform One Hot Encoding.

In [9]:
# Clean and one-hot encode the training split with the project helper.
# train=True presumably fits `ohe` on this data - TODO confirm in functions_used.
# Note: y_train is rebound to the target returned by the helper.
joined_train_processed, y_train = func.model_preprocessing(joined_train, features_list, ohe, train = True)

Beginning numerical cleaning...
check: df shape =  (44550, 18)
---Dropping 0 longitudes...
check: df shape =  (43211, 18)
---Replace 0's with average constructor year...
check: df shape =  (43211, 18)
...returning a cleaned dataframe of numerical values.
Completed numerical cleaning.

Removing the target from the cleaned data frame...
---Length of target:  43211
---Shape of dataframe:  (43211, 17)
Reading the remaining columns as independent features

Begining "object" cleaning...
---Replacing NaN with "unknown" bin...
---Check: Number of rows with nulls: 0...

Begin one hot encoding data...
Finish one hot encoding data...

---Shape of ohe_df:  (43211, 2572)
...ending "object" cleaning.
Joining the cleaned numerical and object dataframes together.
Returning the main (independent features, X) and target (y) data frames...


- Train the decision tree with training data.

In [10]:
# Depth-limited decision tree; random_state fixes any tie-breaking randomness.
dtc = DecisionTreeClassifier(random_state=42, max_depth=5) 
# NOTE(review): the recorded run of this cell raised
# "ValueError: Input contains NaN, infinity or a value too large", i.e.
# joined_train_processed still held non-finite values after preprocessing.
# The cause is inside func.model_preprocessing (not visible here) - possibly an
# index misalignment when the numerical and one-hot frames are joined after
# rows were dropped. Confirm and fix in the helper before relying on this fit.
dtc.fit(joined_train_processed, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

- Repeat the pre-processing on the test data.
- Join the testing data (X and y) together.

In [None]:
# Index-aligned join of the hold-out features and labels, mirroring the
# training-side join (same suffixes for overlapping column names).
joined_test = X_test.join(y_test, lsuffix='_l', rsuffix='_r')

- Preprocess the testing data.

In [None]:
# Apply the same cleaning/encoding to the hold-out split, reusing the shared
# `ohe` encoder with train=False. The helper is reached through the `func`
# module alias, exactly as in the training call; the bare name
# `model_preprocessing` is never defined in this notebook and raised NameError.
joined_test_processed, y_test = func.model_preprocessing(joined_test, features_list, ohe, train=False)

- Make predictions with the test data.

In [None]:
# Class predictions of the fitted decision tree for the processed hold-out rows.
predicts = dtc.predict(joined_test_processed)

- Check the score of the model.

In [None]:
# Mean accuracy of the decision tree on the hold-out split.
dtc.score(joined_test_processed, y_test) # processed X_test / y_test from the train_test_split above

In [None]:
# Report accuracy for the decision-tree predictions.
# `calc_accuracy` is not defined or imported anywhere in this notebook, so the
# bare call raised NameError. It is assumed to live in the helper module next
# to model_preprocessing - TODO confirm that functions_used exposes it.
func.calc_accuracy(y_test, predicts)

In [None]:
# Same decision-tree accuracy report as the cell directly above (duplicate call).
# `calc_accuracy` is not defined or imported in this notebook; it is assumed to
# live in the helper module - TODO confirm that functions_used exposes it.
func.calc_accuracy(y_test, predicts)

- Let's visualize the structure of the fitted decision tree.


In [None]:
# Display the column index of the processed training frame (to see feature order).
joined_train_processed.columns


In [None]:
# Build the lookup list used to label tree split features: the numerical
# feature names come first, followed by every category value learned by the
# one-hot encoder. (Assumes the processed frame orders its columns the same
# way - TODO confirm against joined_train_processed.columns.)
#
# The list is bound to `numerical_feature_names` because that is the name the
# later cells index; previously it was created as `feature_names` while the
# loop appended to the undefined `numerical_feature_names`, raising NameError.
numerical_feature_names = ['gps_height', 'longitude', 'latitude',
                           'region_code', 'district_code',
                           'construction_year']
for categories in ohe.categories_:
    # One array of category values per encoded column, appended in encoder order.
    numerical_feature_names.extend(categories)

In [None]:
# Walk the fitted decision tree's low-level arrays and print its structure.
# Traversal adapted from the scikit-learn example:
# https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
n_nodes = dtc.tree_.node_count
children_left, children_right = dtc.tree_.children_left, dtc.tree_.children_right
feature, threshold = dtc.tree_.feature, dtc.tree_.threshold

# Depth-first walk from the root, recording each node's depth and leaf status.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # (node id, depth of its parent); seeded with the root
while stack:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # A node whose left and right child ids coincide is treated as a leaf.
    if children_left[node_id] == children_right[node_id]:
        is_leaves[node_id] = True
        continue

    # Test node: schedule both children one level deeper (left pushed first).
    stack.extend(
        (child, parent_depth + 1)
        for child in (children_left[node_id], children_right[node_id])
    )

print("The binary tree structure has %s nodes and has the following tree structure:" % n_nodes)
for i in range(n_nodes):
    indent = node_depth[i] * "\t"  # one tab per level of depth
    if not is_leaves[i]:
        # Split nodes are labelled with the feature name looked up by index.
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to node %s."
              % (indent, i, children_left[i],
                 numerical_feature_names[int(feature[i])],
                 threshold[i], children_right[i]))
        continue
    print("%snode=%s leaf node." % (indent, i))
print()

In [None]:
# Render the fitted decision tree as an inline PNG via Graphviz.
# The DOT text is written to an in-memory buffer. io.StringIO is used for it
# because sklearn.externals.six (where this notebook's top-level StringIO comes
# from) was deprecated and then removed from scikit-learn.
from io import StringIO

dot_file = StringIO()
export_graphviz(dtc, out_file=dot_file, filled=True, rounded=True)

# Parse the DOT source and display the rendered image as the cell output.
image = pydotplus.graph_from_dot_data(dot_file.getvalue())
Image(image.create_png())

In [None]:
# Spot-check: show the label stored at position 19 of the feature-name list.
numerical_feature_names[19]

## Model 2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: both split criteria crossed with three depth limits.
param_grid = dict(criterion=['gini', 'entropy'], max_depth=[3, 6, 10])

# Small baseline forest wrapped in a 5-fold grid search.
# Note: `gs` is only constructed in this cell; no fit is performed here.
rfc = RandomForestClassifier(bootstrap=True, n_estimators=10, random_state=1)
gs = GridSearchCV(rfc, param_grid, cv=5)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Rebinds `rfc` to a 50-tree forest with max_depth=50; class_weight='balanced'
# weights classes inversely to their frequency in the training labels.
rfc = RandomForestClassifier(n_estimators=50, max_depth = 50, random_state=42, bootstrap=True, class_weight='balanced')

In [None]:
# Train the random forest on the processed training features and target.
rfc.fit(joined_train_processed, y_train)

In [None]:
# Random-forest predictions for the hold-out rows.
# Note: rebinds `predicts`, replacing the decision-tree predictions.
predicts = rfc.predict(joined_test_processed)

In [None]:
# Mean accuracy of the random forest on the hold-out split.
rfc.score(joined_test_processed, y_test)

In [None]:
# Report accuracy for the random-forest predictions.
# `calc_accuracy` is not defined or imported in this notebook, so the bare call
# raised NameError; it is assumed to live in the helper module - TODO confirm
# that functions_used exposes it.
func.calc_accuracy(y_test, predicts)

## Model 3: KNN

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import euclidean as euc
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
# from visualize import generate_moons_df, preprocess, plot_boundaries

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Seed NumPy's global random generator. The estimators and the split above that
# pass random_state explicitly do not draw from this global seed.
np.random.seed(0)

In [None]:
# Redefinition of features_list that adds 'population' to the earlier list.
# NOTE(review): no cell after this point passes features_list to the
# preprocessing helper, so within this notebook the processed train/test
# frames used below are unaffected by this change - confirm whether the
# preprocessing was meant to be re-run with the extra column.
features_list = ['basin', 'region', 'scheme_management', 'scheme_name',
       'extraction_type', 'management', 'payment', 'water_quality', 'quantity',
       'source', 'waterpoint_type','gps_height', 'longitude', 'latitude', 
       'region_code', 'district_code', 'population', 'construction_year', 'status_group']   


In [None]:
# Standardise the features: the scaler's statistics are learned from the
# training matrix only and then applied to both the training and test matrices.
scaler = StandardScaler().fit(joined_train_processed)
X_train_scaled, X_test_scaled = (
    scaler.transform(frame)
    for frame in (joined_train_processed, joined_test_processed)
)

In [None]:
# 3-nearest-neighbour classifier.
# The estimator is only constructed here: it is fitted in the following cell on
# the standardised features. The previous fit on the unscaled matrix was
# discarded by that refit, and kNN distances on unscaled columns would be
# dominated by the large-magnitude features anyway.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit kNN on the standardised training features, then predict the hold-out rows
# (fit returns the estimator itself, so the two calls can be chained).
sk_preds = knn.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# Mean accuracy of the kNN model on the standardised hold-out features.
knn.score(X_test_scaled,y_test)

In [None]:
# Report accuracy for the kNN predictions.
# `calc_accuracy` is not defined or imported in this notebook, so the bare call
# raised NameError; it is assumed to live in the helper module - TODO confirm
# that functions_used exposes it.
func.calc_accuracy(y_test, sk_preds)

## Model 4: Multinomial Logistic Regression

In [None]:
# Multinomial logistic regression solved with newton-cg.
# NOTE(review): max_iter=10 caps the solver at ten iterations, so it may stop
# before converging, and the model is fitted on the unscaled matrix even though
# X_train_scaled exists - confirm both are intended.
logreg = LogisticRegression(solver='newton-cg', max_iter=10, random_state=42, multi_class = 'multinomial')

logreg.fit(joined_train_processed, y_train)

In [None]:
# Confusion matrix of the logistic-regression predictions on the hold-out split
# (rows: true classes, columns: predicted classes).
confusion_matrix(y_test, logreg.predict(joined_test_processed))