In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import psycopg2
import sqlalchemy

%matplotlib inline

In [30]:
from sqlalchemy import create_engine
# Postgres username, password, and database name
POSTGRES_ADDRESS = 'localhost'
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres'
POSTGRES_PASSWORD = 'Sparky89!'
POSTGRES_DBNAME = 'C964'
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
   .format(username=POSTGRES_USERNAME,
   password=POSTGRES_PASSWORD,
   ipaddress=POSTGRES_ADDRESS,
   port=POSTGRES_PORT,
   dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)

In [31]:
thyroid = pd.read_sql_query('''SELECT * FROM thyroid;''', cnx)

In [32]:
thyroid = pd.DataFrame(thyroid)

## Convert data to numbers

In [33]:
# Covert DX 
thyroid=thyroid.replace({"negative":0, "hypothyroid":1})   

In [34]:
# Drop rows where no result is given for sex
thyroid = thyroid[thyroid.sex != '?']

In [35]:
# Covert Sex 
thyroid=thyroid.replace({"M":0,"F":1})   

In [36]:
# Convert f/t
thyroid=thyroid.replace({"f":0,"t":1})

In [37]:
# Convert y/n
thyroid=thyroid.replace({"n":0,"y":1})

In [38]:
thyroid.dtypes

dx                            int64
age                          object
sex                           int64
on_thyroxine                  int64
query_on_thyroxine            int64
on_antithyroid_medication     int64
thyroid_surgery               int64
query_hypothyroid             int64
query_hyperthyroid            int64
pregnant                      int64
sick                          int64
tumor                         int64
lithium                       int64
goitre                        int64
tsh_measured                  int64
tsh                          object
t3_measured                   int64
t3                           object
tt4_measured                  int64
tt4                          object
t4u_measured                  int64
t4u                          object
fti_measured                  int64
fti                          object
tbg_measured                  int64
tbg                          object
dtype: object

## Drop unneeded columns

In [39]:
# Drop tbg due to too many NAN values

In [40]:
thyroid.drop('tbg', axis=1, inplace=True)
thyroid.drop('tbg_measured', axis=1, inplace=True)

In [41]:
# Drop following columns due to lack of significance
thyroid.drop('query_on_thyroxine', axis=1, inplace=True)
thyroid.drop('query_hypothyroid', axis=1, inplace=True)
thyroid.drop('query_hyperthyroid', axis=1, inplace=True)

## Convert remaining columns to desired datatypes

In [42]:
cols = thyroid.columns[thyroid.dtypes.eq('object')]

In [43]:
thyroid[cols] = thyroid[cols].apply(pd.to_numeric, errors='coerce')

## Filling NA values

In [44]:
# Get mean values of columns with missing data. This will be used if provider does not provide test results for tsh, t3, tt4, t4u, fti
thyroid.mean()

dx                             0.048220
age                           51.220000
sex                            0.706149
on_thyroxine                   0.147573
on_antithyroid_medication      0.013592
thyroid_surgery                0.033010
pregnant                       0.020065
sick                           0.031715
tumor                          0.012945
lithium                        0.000647
goitre                         0.031715
tsh_measured                   0.852427
tsh                            5.938379
t3_measured                    0.778964
t3                             1.931367
tt4_measured                   0.921036
tt4                          108.426528
t4u_measured                   0.921359
t4u                            0.985198
fti_measured                   0.921683
fti                          111.721524
dtype: float64

In [45]:
# Splitting up data
X = thyroid.drop('dx', axis=1)
y = thyroid['dx']

In [46]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill values with mean
num_imputer = SimpleImputer(strategy='mean')

# Define columns
num_features = ['age', 'tsh', 't3', 'tt4', 't4u', 'fti']

# Create imputer
imputer = ColumnTransformer([('num_imputer', num_imputer, num_features)],
                            remainder="passthrough")

#Transform the data
X = imputer.fit_transform(X)
X = pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,72.0,30.0,0.6,15.0,1.48,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1,15.0,145.0,1.7,19.0,1.13,17.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
2,24.0,0.0,0.2,4.0,1.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
3,24.0,430.0,0.4,6.0,1.04,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
4,77.0,7.3,1.2,57.0,1.28,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,58.0,5.8,1.7,86.0,0.91,95.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
3086,29.0,0.8,1.8,99.0,1.01,98.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
3087,77.0,1.2,0.6,71.0,0.68,104.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
3088,74.0,1.3,0.1,65.0,0.48,137.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [47]:
X.dtypes

0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
dtype: object

## Modeling

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [49]:
# Model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

clf.fit(X_train, y_train)

RandomForestClassifier()

In [50]:
# Make a predicition
y_preds = clf.predict(X_test)
y_preds

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [51]:
# Eval model
clf.score(X_test, y_test)

0.982200647249191

In [52]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       596
           1       0.76      0.73      0.74        22

    accuracy                           0.98       618
   macro avg       0.88      0.86      0.87       618
weighted avg       0.98      0.98      0.98       618



In [53]:
confusion_matrix(y_test, y_preds)

array([[591,   5],
       [  6,  16]], dtype=int64)

In [54]:
accuracy_score(y_test, y_preds)

0.982200647249191

In [55]:
X_test.shape

(618, 20)

In [56]:
import pickle

pickle.dump(clf, open("clf_model.pkl", "wb"))