*Connecting to SQL Database and Loading data table in as dataframe*

In [1]:
# Import Dependencies
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files
import io
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from collections import Counter

  from IPython.utils import traitlets as _traitlets


In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

In [3]:
# Installing packages to use postgresql
!pip install ipython-sql
!pip install sqlalchemy

Collecting ipython-sql
  Downloading ipython_sql-0.4.1-py3-none-any.whl (21 kB)
Collecting prettytable<1
  Downloading prettytable-0.7.2.zip (28 kB)
Collecting sqlparse
  Downloading sqlparse-0.4.3-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.5 MB/s eta 0:00:01
Building wheels for collected packages: prettytable
  Building wheel for prettytable (setup.py) ... [?25ldone
[?25h  Created wheel for prettytable: filename=prettytable-0.7.2-py3-none-any.whl size=13714 sha256=5e58161bbf1d41b7a9027c34ff7820153d05107fb7a94379496a06ae9d9f26d4
  Stored in directory: /Users/dang/Library/Caches/pip/wheels/b2/7f/f6/f180315b584f00445045ff1699b550fa895d09471337ce21c6
Successfully built prettytable
Installing collected packages: sqlparse, prettytable, ipython-sql
Successfully installed ipython-sql-0.4.1 prettytable-0.7.2 sqlparse-0.4.3


In [4]:
# dependencies necessary for connecting to sql database
import os
from sqlalchemy import create_engine
from config import db_password

ModuleNotFoundError: No module named 'config'

In [None]:
db_string = f"postgresql://root:{db_password}@unc-capstone-db.chbhjul7q0jr.us-east-2.rds.amazonaws.com/cleaning_database_beta"
In [82]:


In [None]:
engine = create_engine(db_string)

In [None]:
!pip install ipython-sql

In [None]:
%load_ext sql

In [None]:
%sql postgresql://root:{db_password}@unc-capstone-db.chbhjul7q0jr.us-east-2.rds.amazonaws.com/cleaning_database_beta

In [None]:
%%sql 
SELECT * FROM updated_animal_data1 LIMIT 5

In [None]:
# Convert sql data table into pandas dataframe
animal_Data_df = pd.read_sql('SELECT * FROM updated_animal_data1', engine)
animal_Data_df.head()

In [None]:
# checking columns and data types
animal_Data_df.dtypes


**Pre-processing data for supervised learning**

In [None]:
# Removing columns that we do not need for model
u1_animalData_df = animal_Data_df.drop(['animal_id', 'state', 'sex', 'animal_type', 'breed_class', 'color'], axis=1)
print(u1_animalData_df.shape[0])
u1_animalData_df.head()

In [None]:
# Remove the `Not Tested` 4Dx status
not_Tested_mask = u1_animalData_df['is_4dx_tested'] != 'Not Tested'
tested_df = u1_animalData_df.loc[not_Tested_mask]

print(tested_df.shape[0])
print(tested_df.columns)
tested_df.head(10)

In [None]:
# Checking the values in the RR column - before converting to numerical
tested_df['resp_rate_bpm'].unique()

In [None]:
# Converting RR column to integer
tested_df['resp_rate_bpm'] = tested_df['resp_rate_bpm'].astype('int')

tested_df.dtypes

In [None]:
#Converting the age column to just numbers - years
# First, splitting the column into two new colummns - one for number and the other for the string (years, months, etc.)
tested_df[['age_num', 'age_str']] = tested_df['age'].apply(lambda x: pd.Series(str(x).split(" ")))

# Setting the original age column equal to the age_num column - i.e. changing the data in the age column to just the numbers
tested_df['age'] = tested_df['age_num']

# Changing the data type of the age column to float
tested_df['age'] = tested_df['age'].astype('float')

In [None]:
 # Checking the column to see what the unique string values are
tested_df["age_str"].unique()

In [None]:
# Using the string column to change the number value - converting all ages to years
tested_df.loc[tested_df.age_str == 'days', ['age']] = tested_df['age'] / 365
tested_df.loc[tested_df.age_str == 'day', ['age']] = tested_df['age'] / 365
tested_df.loc[tested_df.age_str == 'months', ['age']] = tested_df['age'] / 12
tested_df.loc[tested_df.age_str == 'month', ['age']] = tested_df['age'] / 12
tested_df.loc[tested_df.age_str == 'weeks', ['age']] = tested_df['age'] / 52
tested_df.loc[tested_df.age_str == 'week', ['age']] = tested_df['age'] / 52

print(tested_df.shape)
tested_df.head()

In [None]:
# Dropping the extra (created) columns - age_num and age_str
final_animalData_df = tested_df.drop(['age_str', 'age_num'], axis=1)
print(final_animalData_df.columns)
print(final_animalData_df.dtypes)
print(final_animalData_df.shape)
final_animalData_df.head()

*Defining our Target*

In [None]:
# Creating features
X = final_animalData_df.drop(columns='is_4dx_tested')
X = pd.get_dummies(X)

# Target
y = final_animalData_df['is_4dx_tested']

X.head()

In [None]:
X.describe()

In [None]:
# Check the balance of the target variable
y.value_counts()

**Scaling the data and Splitting our data into Training and Testing sets**

In [None]:
# Splitting data into testing and training sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(Counter(y_train))
print(Counter(y_test))

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

**Random Oversampling Model**

In [None]:
# Resampling the data with the RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

In [None]:
# Training the model with resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculating an accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Creating the confusion matrix
Confusion_matrix = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(Confusion_matrix, index=["Actual Negative", "Actual Positive"], columns=["Predicted Negative", "Predicted Positive"])
cm_df

In [None]:
# Displaying the classification report
report_df = pd.DataFrame(classification_report_imbalanced(y_test, y_pred, output_dict=True)).transpose()
report_df

**SMOTE Oversampling Model**

In [None]:
# Resampling data with SMOTE method
X_resampled, y_resampled = SMOTE(random_state=42, sampling_strategy='auto').fit_resample(
    X_train, y_train)

Counter(y_resampled)

In [None]:
# Training the logistic regression model using the SMOTE resampled data
model_SMOTE = LogisticRegression(solver='lbfgs', random_state=42)
model_SMOTE.fit(X_resampled, y_resampled)

In [None]:
# Calculating the accuracy score
y_pred = model_SMOTE.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Creating the confusion matrix
cm_SMOTE = confusion_matrix(y_test, y_pred)
cm_df2 = pd.DataFrame(cm_SMOTE, index=["Actual Negative", "Actual Positive"], columns=["Predicted Negative", "Predicted Positive"])
cm_df2

In [None]:
# Displaying the classification table
report_df2 = pd.DataFrame(classification_report_imbalanced(y_test, y_pred, output_dict=True)).transpose()
report_df2

**Uploading the updated table to SQL**

# Using label encoding to transform data in table that is better formatted for analysis in R
# printing the classes for each variable to show what level will be assigned - the idex in the list is the level assigned to that descriptor
og_final_animalData = final_animalData_df.copy()

le = LabelEncoder()
final_animalData_df['mm'] = le.fit_transform(final_animalData_df['mm'])
print(le.classes_)

final_animalData_df['crt'] = le.fit_transform(final_animalData_df['crt'])
print(le.classes_)

final_animalData_df['mentation'] = le.fit_transform(final_animalData_df['mentation'])
print(le.classes_)

final_animalData_df['diarrhea'] = le.fit_transform(final_animalData_df['diarrhea'])
print(le.classes_)

final_animalData_df['vomiting'] = le.fit_transform(final_animalData_df['vomiting'])
print(le.classes_)

final_animalData_df['inappetence'] = le.fit_transform(final_animalData_df['inappetence'])
print(le.classes_)

final_animalData_df['lethargic'] = le.fit_transform(final_animalData_df['lethargic'])
print(le.classes_)

final_animalData_df['muscle_pain'] = le.fit_transform(final_animalData_df['muscle_pain'])
print(le.classes_)

final_animalData_df['lameness'] = le.fit_transform(final_animalData_df['lameness'])
print(le.classes_)

final_animalData_df['reported_weight_loss'] = le.fit_transform(final_animalData_df['reported_weight_loss'])
print(le.classes_)

final_animalData_df['joint_swelling'] = le.fit_transform(final_animalData_df['joint_swelling'])
print(le.classes_)

final_animalData_df['skin_condition'] = le.fit_transform(final_animalData_df['skin_condition'])
print(le.classes_)

final_animalData_df['is_4dx_tested'] = le.fit_transform(final_animalData_df['is_4dx_tested'])
print(le.classes_)


# Adding code to write the cleaned dataframe to a new table in our PostgreSQL database
final_animalData_df.to_sql(name='ml_cleaned_animalData', con=engine)

final_animalData_df.head(20)