## Decision Trees

In [53]:
# Initial imports (standard library first, then third-party)
from collections import Counter

import pandas as pd
from path import Path
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Shared encoder instance; a later cell uses it to encode the "country" column
le = LabelEncoder()

## Loading and Preprocessing the Combined Facility Data

In [54]:
import matplotlib.pyplot as plt
%matplotlib inline
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from os import environ
from sqlalchemy import create_engine
# Create flask to connect database 
app = Flask(__name__)
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
engine = create_engine("postgresql://June:Covid19*@covid19-readiness.cwom5umbekrh.us-west-1.rds.amazonaws.com/SPA_data")
# Create dataframe from PostgreSQL
df_ready = pd.read_sql_table('combined_data_cleaned',con=engine)

In [55]:
# Remove the region identifiers and the bed count in a single in-place call
df_ready.drop(
    columns=["region_name", "region_code", "region", "numberbeds"],
    inplace=True,
)
df_ready.head()

Unnamed: 0,index,facil,country,latnum,longnum,factype,mga,ftype,factype2,ownership,...,eye_protect,preparedness,tbservice,quality,inpatient,total_full,cumulative_cases,cumulative_deaths,preparedness2,prep2
0,0,11135,BAN,25.096353,89.619072,8,1,2,2,1,...,0,0.181818,0.0,0.0,0.0,3.0,377073,5500,0.25,0
1,1,11600,BAN,25.145923,89.935113,12,3,2,2,2,...,1,1.0,0.0,0.0,0.0,22.0,377073,5500,1.0,1
2,2,11599,BAN,25.015608,90.014368,12,3,1,2,2,...,1,1.0,0.0,1.0,1.0,12.0,377073,5500,1.0,1
3,3,11598,BAN,24.71694,90.953117,12,3,2,2,2,...,1,1.0,0.0,0.0,0.0,6.0,377073,5500,1.0,1
4,4,11597,BAN,24.739533,90.527265,12,3,2,2,2,...,0,0.727273,1.0,0.0,0.0,9.0,377073,5500,0.5,0


In [56]:
# Keep only the target ("prep2") and the columns used as model features.
# The selection is taken from df_ready itself so the cell does not depend on a
# variable defined outside this notebook, and .copy() gives an independent
# frame that later cells can modify without touching the loaded data.
MODEL_COLUMNS = [
    "prep2",
    "country",
    "ownership",
    "ftype",
    "quality",
    "inpatient",
    "st_precautions",
    "tbservice",
    "water_source",
]
df_ready = df_ready[MODEL_COLUMNS].copy()
df_ready.head()

Unnamed: 0,prep2,country,ownership,ftype,quality,inpatient,st_precautions,tbservice,water_source
0,0,BAN,1,2,0.0,0.0,0,0.0,0
1,1,BAN,2,2,0.0,0.0,1,0.0,1
2,1,BAN,2,1,1.0,1.0,1,0.0,1
3,1,BAN,2,2,0.0,0.0,1,0.0,1
4,0,BAN,2,2,0.0,0.0,1,1.0,1


In [57]:
# Discard, in place, every row that has a missing value in any column
df_ready.dropna(axis=0, how="any", inplace=True)
df_ready.head()

Unnamed: 0,prep2,country,ownership,ftype,quality,inpatient,st_precautions,tbservice,water_source
0,0,BAN,1,2,0.0,0.0,0,0.0,0
1,1,BAN,2,2,0.0,0.0,1,0.0,1
2,1,BAN,2,1,1.0,1.0,1,0.0,1
3,1,BAN,2,2,0.0,0.0,1,0.0,1
4,0,BAN,2,2,0.0,0.0,1,1.0,1


In [58]:
# Replace the country column with the integer codes produced by the shared LabelEncoder
country_codes = le.fit_transform(df_ready['country'])
df_ready['country'] = country_codes

In [59]:
# Feature matrix: every column except the target "prep2" (drop returns a new frame)
X = df_ready.drop(columns=["prep2"])
X.head()

Unnamed: 0,country,ownership,ftype,quality,inpatient,st_precautions,tbservice,water_source
0,0,1,2,0.0,0.0,0,0.0,0
1,0,2,2,0.0,0.0,1,0.0,1
2,0,2,1,1.0,1.0,1,0.0,1
3,0,2,2,0.0,0.0,1,0.0,1
4,0,2,2,0.0,0.0,1,1.0,1


In [60]:
# Target: the "prep2" column as a single-column 2-D array (one row per sample)
y = df_ready[["prep2"]].to_numpy()
y[:5]

array([[0],
       [1],
       [1],
       [1],
       [0]], dtype=int64)

In [61]:
# Hold out a test split; 0.25 is the library default, written out here for clarity.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [62]:
## SMOTE Oversampling
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched
oversampler = SMOTE(random_state=1)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Class counts after resampling
Counter(y_resampled)

Counter({0: 3075, 1: 3075})

In [63]:
# Shapes of the resampled training data and the held-out test data
for split_part in (X_resampled, X_test, y_resampled, y_test):
    print(split_part.shape)

(6150, 8)
(1091, 8)
(6150,)
(1091, 1)


In [64]:
# X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [65]:
# print(X_train2.shape)
# print(X_test2.shape)
# print(y_train2.shape)
# print(y_test2.shape)

In [66]:
# StandardScaler with its default settings spelled out
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [67]:
# Fit the StandardScaler on the resampled training features only.
# fit() returns the scaler itself, so X_scaler is the same fitted object.
scaler.fit(X_resampled)
X_scaler = scaler

In [68]:
# Apply the fitted scaler to the training features first, then to the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_resampled, X_test)
)


## Fitting the Decision Tree Model

In [69]:
# Creating the decision tree classifier instance.
# random_state is fixed (same seed as the split and SMOTE cells) because the
# tree builder shuffles candidate features at each split; without a seed the
# fitted tree, and therefore the reported metrics, can change between runs.
model = tree.DecisionTreeClassifier(random_state=1)

In [70]:
# Train the tree on the scaled, oversampled training data (fit returns the estimator)
model = model.fit(X=X_train_scaled, y=y_resampled)

## Making Predictions Using the Tree Model

In [71]:
# Predict a class label for every row of the scaled test features
predictions = model.predict(X=X_test_scaled)

## Model Evaluation

In [72]:
# Accuracy on the test split
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix wrapped in a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [73]:
# Show the confusion matrix, then the accuracy and the per-class report
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,851,184
Actual 1,31,25


Accuracy Score : 0.8029330889092575
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.82      0.89      1035
           1       0.12      0.45      0.19        56

    accuracy                           0.80      1091
   macro avg       0.54      0.63      0.54      1091
weighted avg       0.92      0.80      0.85      1091

