# 0. Import Data

In [91]:
# https://pbpython.com/categorical-encoding.html
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflowjs as tfjs
import seaborn as sns
import numpy as np

In [92]:
df = pd.read_csv('survey lung cancer RAW.csv')

In [93]:
df.dtypes

GENDER                   object
AGE                       int64
SMOKING                   int64
YELLOW_FINGERS            int64
ANXIETY                   int64
PEER_PRESSURE             int64
CHRONIC DISEASE           int64
FATIGUE                   int64
ALLERGY                   int64
WHEEZING                  int64
ALCOHOL CONSUMING         int64
COUGHING                  int64
SHORTNESS OF BREATH       int64
SWALLOWING DIFFICULTY     int64
CHEST PAIN                int64
LUNG_CANCER              object
dtype: object

In [94]:
# obj_df = df.select_dtypes(include=['object']).copy()
obj_df = df.copy()
obj_df[obj_df.isnull().any(axis=1)]
obj_df.head()


Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [95]:
obj_df["LUNG_CANCER"].value_counts()

YES    270
NO     197
Name: LUNG_CANCER, dtype: int64

In [96]:
cleanup_nums = {
                "GENDER":     {"M": 1, "F": 2},
                "LUNG_CANCER": {"YES": 1, "NO": 0 }
                }

obj_df = obj_df.replace(cleanup_nums)
obj_df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,2,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,2,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [97]:

X = pd.get_dummies(obj_df.drop(['LUNG_CANCER'], axis=1))
y = obj_df['LUNG_CANCER']
# y = df['LUNG_CANCER'].apply(lambda x: 1 if x=='YES' else 0)
# X

In [98]:
obj_df.tail()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
462,1,62,1,1,1,1,1,1,1,1,1,1,1,1,1,0
463,1,62,1,1,1,1,1,1,1,1,1,1,1,1,1,0
464,1,62,1,1,1,1,1,1,1,1,1,1,1,1,1,0
465,1,62,1,1,1,1,1,1,1,1,1,1,1,1,1,0
466,1,62,1,1,1,1,1,1,1,1,1,1,1,1,1,0


In [99]:
train_dataset = obj_df.sample(frac=0.8, random_state=0)
test_dataset = obj_df.drop(train_dataset.index)

In [100]:
# sns.pairplot(train_dataset[['tenure', 'Monthly Charges']], diag_kind='kde')
obj_df.dtypes

GENDER                   int64
AGE                      int64
SMOKING                  int64
YELLOW_FINGERS           int64
ANXIETY                  int64
PEER_PRESSURE            int64
CHRONIC DISEASE          int64
FATIGUE                  int64
ALLERGY                  int64
WHEEZING                 int64
ALCOHOL CONSUMING        int64
COUGHING                 int64
SHORTNESS OF BREATH      int64
SWALLOWING DIFFICULTY    int64
CHEST PAIN               int64
LUNG_CANCER              int64
dtype: object

In [101]:
train_dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GENDER,374.0,1.323529,0.468449,1.0,1.0,1.0,2.0,2.0
AGE,374.0,62.280749,6.803548,21.0,60.0,62.0,64.0,81.0
SMOKING,374.0,1.382353,0.486613,1.0,1.0,1.0,2.0,2.0
YELLOW_FINGERS,374.0,1.377005,0.485285,1.0,1.0,1.0,2.0,2.0
ANXIETY,374.0,1.323529,0.468449,1.0,1.0,1.0,2.0,2.0
PEER_PRESSURE,374.0,1.342246,0.475097,1.0,1.0,1.0,2.0,2.0
CHRONIC DISEASE,374.0,1.331551,0.471401,1.0,1.0,1.0,2.0,2.0
FATIGUE,374.0,1.467914,0.499638,1.0,1.0,1.0,2.0,2.0
ALLERGY,374.0,1.360963,0.480923,1.0,1.0,1.0,2.0,2.0
WHEEZING,374.0,1.360963,0.480923,1.0,1.0,1.0,2.0,2.0


In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# 1. Import Dependencies

In [103]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score
import tensorflow as tf

# 2. Build and Compile Model

In [104]:
normalizer = tf.keras.layers.Normalization(axis=-1)

In [105]:
model = Sequential([normalizer])
model.add(Dense(units=32, activation='relu', input_dim=len(X_train.columns)))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [106]:
first = np.array(X_train[:1])
first

array([[ 1, 62,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]],
      dtype=int64)

In [107]:
print('Normalized:', normalizer(first).numpy())

Normalized: [[ 1. 62.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]]


In [108]:
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics='accuracy')

# 3. Fit, Predict and Evaluate

In [109]:
model.fit(X_train, y_train, epochs=200, batch_size=32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x145faab47f0>

In [111]:
# y_hat = model.predict(np.array([1,50,1,1,1,1,1,1,1,1,1,1,2,2,2]))
y_hat = model.predict(X_test)
y_hat = [0 if val < 0.5 else 1 for val in y_hat]




In [114]:
val = np.array([1,62,1,1,1,1,1,1,1,1,1,1,1,1,1])
val.shape
prediction = model.predict(val) 
prediction



array([[0.2342272]], dtype=float32)

In [84]:
y_hat

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [85]:
accuracy_score(y_test, y_hat)

0.8870967741935484

# 4. Saving and Reloading

In [14]:
model.save('tfmodel')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: tfmodel/assets


In [15]:
del model 

In [16]:
model = load_model('tfmodel')

In [16]:
tfjs.converters.save_keras_model(model,'models')