In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score 
from sklearn.metrics import mean_squared_error


In [2]:
#Read DataSet off of computer
df= pd.read_csv('C:/Users/jefft/Desktop/WGU Classes/D209/medical_clean.csv', index_col=0)

In [3]:
# Convert specified columns from string to category
df["TimeZone"] = df["TimeZone"].astype("category")
df["Marital"] = df["Marital"].astype("category")
df["Gender"] = df["Gender"].astype("category")
df["Initial_admin"] = df["Initial_admin"].astype("category")
df["Complication_risk"] = df["Complication_risk"].astype("category")
df["Services"] = df["Services"].astype("category")

# Reformat the 'Income' column from float to integer
df["Income"] = df["Income"].astype(int)

# Reformat currency columns to 3 decimal places
df["TotalCharge"] = df["TotalCharge"].round(3)
df["Additional_charges"] = df["Additional_charges"].round(3)

# Define mapping for boolean conversion
bool_mapping = {"Yes": 1, "No": 0}

# Convert specified columns from string to boolean using the mapping
boolean_columns = ["ReAdmis", "Soft_drink", "HighBlood", "Stroke", "Overweight", 
                   "Arthritis", "Diabetes", "Hyperlipidemia", "BackPain", "Anxiety", 
                   "Allergic_rhinitis", "Reflux_esophagitis", "Asthma"]

for col in boolean_columns:
    df[col] = df[col].map(bool_mapping)

# Define mapping to reverse survey scores
survey_mapping = {1: 8, 2: 7, 3: 6, 4: 5, 5: 4, 6: 3, 7: 2, 8: 1}

# Define ordered categorical datatype for survey responses
survey_scores = CategoricalDtype(categories=["1", "2", "3", "4", "5", "6", "7", "8"], ordered=True)

# Reverse and convert survey score columns to ordered categorical datatype
survey_columns = ["Item1", "Item2", "Item3", "Item4", "Item5", "Item6", "Item7", "Item8"]

for col in survey_columns:
    df[col] = df[col].map(survey_mapping).map(str).astype(survey_scores)

In [7]:
# List of binary columns to create dummies for
binary_columns = ['Gender', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'Allergic_rhinitis', 'Asthma', 'Anxiety']

# Create dummy variables and concatenate them back into the original DataFrame
dummies = [pd.get_dummies(df[col], dtype=int, prefix=col, drop_first=True) for col in binary_columns]

# Concatenate original DataFrame with the new dummy variables
df = pd.concat([df] + dummies, axis=1)

# Drop the original binary columns
df.drop(binary_columns, axis=1, inplace=True)

# Display the updated DataFrame
print(df)

          Customer_id                           Interaction  \
CaseOrder                                                     
1             C412403  8cd49b13-f45a-4b47-a2bd-173ffa932c2f   
2             Z919181  d2450b70-0337-4406-bdbb-bc1037f1734c   
3             F995323  a2057123-abf5-4a2c-abad-8ffe33512562   
4             A879973  1dec528d-eb34-4079-adce-0d7a40e82205   
5             C544523  5885f56b-d6da-43a3-8760-83583af94266   
...               ...                                   ...   
9996          B863060  a25b594d-0328-486f-a9b9-0567eb0f9723   
9997          P712040  70711574-f7b1-4a17-b15f-48c54564b70f   
9998          R778890  1d79569d-8e0f-4180-a207-d67ee4527d26   
9999          E344109  f5a68e69-2a60-409b-a92f-ac0847b27db0   
10000         I569847  bc482c02-f8c9-4423-99de-3db5e62a18d5   

                                        UID          City State        County  \
CaseOrder                                                                       
1          3a83ddb

In [103]:
dummies

[           Gender_Male  Gender_Nonbinary
 CaseOrder                               
 1                    1                 0
 2                    0                 0
 3                    0                 0
 4                    1                 0
 5                    0                 0
 ...                ...               ...
 9996                 1                 0
 9997                 1                 0
 9998                 0                 0
 9999                 1                 0
 10000                0                 0
 
 [10000 rows x 2 columns],
            Arthritis_1
 CaseOrder             
 1                    1
 2                    0
 3                    0
 4                    1
 5                    0
 ...                ...
 9996                 0
 9997                 1
 9998                 0
 9999                 0
 10000                1
 
 [10000 rows x 1 columns],
            Diabetes_1
 CaseOrder            
 1                   1
 2             

In [9]:
#Summary Statistics
df.Initial_days.describe()

count    10000.000000
mean        34.455299
std         26.309341
min          1.001981
25%          7.896215
50%         35.836244
75%         61.161020
max         71.981490
Name: Initial_days, dtype: float64

In [11]:
df.Initial_days.value_counts()

Initial_days
63.544320    2
67.421390    2
70.325420    2
63.334690    1
67.036510    1
            ..
5.977596     1
5.799041     1
6.415853     1
7.328631     1
70.850590    1
Name: count, Length: 9997, dtype: int64

In [19]:
df.Asthma_1.describe()

count    10000.00000
mean         0.28930
std          0.45346
min          0.00000
25%          0.00000
50%          0.00000
75%          1.00000
max          1.00000
Name: Asthma_1, dtype: float64

In [21]:
df.Asthma_1.value_counts

<bound method IndexOpsMixin.value_counts of CaseOrder
1        1
2        0
3        0
4        1
5        0
        ..
9996     0
9997     1
9998     0
9999     0
10000    0
Name: Asthma_1, Length: 10000, dtype: int32>

In [23]:
df. Allergic_rhinitis_1.describe()

count    10000.000000
mean         0.394100
std          0.488681
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Allergic_rhinitis_1, dtype: float64

In [25]:
df.  Allergic_rhinitis_1.value_counts

<bound method IndexOpsMixin.value_counts of CaseOrder
1        1
2        0
3        0
4        0
5        1
        ..
9996     0
9997     0
9998     1
9999     0
10000    1
Name: Allergic_rhinitis_1, Length: 10000, dtype: int32>

In [27]:
df.Anxiety_1.describe()

count    10000.000000
mean         0.321500
std          0.467076
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Anxiety_1, dtype: float64

In [29]:
df.Anxiety_1.value_counts()

Anxiety_1
0    6785
1    3215
Name: count, dtype: int64

In [41]:
df.Hyperlipidemia_1.describe()

count    10000.000000
mean         0.337200
std          0.472777
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Hyperlipidemia_1, dtype: float64

In [43]:
df.Hyperlipidemia_1.value_counts()

Hyperlipidemia_1
0    6628
1    3372
Name: count, dtype: int64

In [45]:
df.Diabetes_1.describe()

count    10000.00000
mean         0.27380
std          0.44593
min          0.00000
25%          0.00000
50%          0.00000
75%          1.00000
max          1.00000
Name: Diabetes_1, dtype: float64

In [47]:
df.Diabetes_1.value_counts()

Diabetes_1
0    7262
1    2738
Name: count, dtype: int64

In [93]:
df.Arthritis_1.describe()

count    10000.000000
mean         0.357400
std          0.479258
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Arthritis_1, dtype: float64

In [95]:
df.Arthritis_1.value_counts()

Arthritis_1
0    6426
1    3574
Name: count, dtype: int64

In [53]:
df.Children.describe()

count    10000.000000
mean         2.097200
std          2.163659
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max         10.000000
Name: Children, dtype: float64

In [55]:
df.Children.value_counts()

Children
0     2548
1     2509
3     1489
2     1475
4      995
7      213
8      209
6      191
5      169
9      108
10      94
Name: count, dtype: int64

In [57]:
df.Age.describe()

count    10000.000000
mean        53.511700
std         20.638538
min         18.000000
25%         36.000000
50%         53.000000
75%         71.000000
max         89.000000
Name: Age, dtype: float64

In [59]:
df.Age.value_counts()

Age
47    161
52    159
74    159
41    157
86    156
     ... 
63    123
51    122
20    120
36    118
80    116
Name: count, Length: 72, dtype: int64

In [65]:
df.Gender_Male.describe()

count    10000.000000
mean         0.476800
std          0.499486
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Gender_Male, dtype: float64

In [67]:
df.Gender_Nonbinary.value_counts()

Gender_Nonbinary
0    9786
1     214
Name: count, dtype: int64

In [69]:
df.VitD_levels.describe()

count    10000.000000
mean        17.964262
std          2.017231
min          9.806483
25%         16.626439
50%         17.951122
75%         19.347963
max         26.394449
Name: VitD_levels, dtype: float64

In [71]:
df.VitD_levels.value_counts()

VitD_levels
18.135431    2
15.939760    2
17.821860    2
20.184170    2
18.741340    2
            ..
18.825293    1
16.849021    1
15.111106    1
20.583694    1
18.388620    1
Name: count, Length: 9976, dtype: int64

In [73]:
df.Doc_visits.describe()

count    10000.000000
mean         5.012200
std          1.045734
min          1.000000
25%          4.000000
50%          5.000000
75%          6.000000
max          9.000000
Name: Doc_visits, dtype: float64

In [75]:
df.Doc_visits.value_counts()

Doc_visits
5    3823
6    2436
4    2385
7     634
3     595
8      61
2      58
1       6
9       2
Name: count, dtype: int64

In [77]:
df.Full_meals_eaten.describe()

count    10000.000000
mean         1.001400
std          1.008117
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          7.000000
Name: Full_meals_eaten, dtype: float64

In [79]:
df.Full_meals_eaten.value_counts()

Full_meals_eaten
0    3715
1    3615
2    1856
3     612
4     169
5      25
6       6
7       2
Name: count, dtype: int64

In [126]:
# Creating data frame of variables that weren't categorical and didn't need dummy variables
features = df[["Children", "Age", "VitD_levels", "Doc_visits","Full_meals_eaten","Initial_days",'Gender_Male','Gender_Nonbinary', 'Arthritis_1', 'Diabetes_1', 'Hyperlipidemia_1', 'Allergic_rhinitis_1', 'Asthma_1']]
# Assuming Arthritis_1 and other variables are defined elsewhere
features = pd.concat([num_features] + dummies, axis=1)
X = features
y = df['Anxiety_1'].copy()
X.to_csv('d209task2_full_clean.csv', index=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
pd.DataFrame(X_train).to_csv("x_train1.csv", sep=',', index=False)
pd.DataFrame(X_test).to_csv("X_test1.csv", sep=',', index=False)
pd.DataFrame(y_train).to_csv("y_train1.csv", sep=',', index=False)
pd.DataFrame(y_test).to_csv("y_test1.csv", sep=',', index=False)


In [None]:
# Hyperparameter tuning for Decision Tree
# Define parameter grid for Decision Tree
parameter_grid = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [0.04, 0.06, 0.08, 0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22]
}

# Initialize Decision Tree
initial_decision_tree = DecisionTreeClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_decision_tree = GridSearchCV(estimator=initial_decision_tree,
                                         param_grid=parameter_grid,
                                         scoring='roc_auc',
                                         cv=5,
                                         n_jobs=-1)

# Fit GridSearchCV to training data
grid_search_decision_tree.fit(X_train, y_train)

# Get the best estimator
best_decision_tree = grid_search_decision_tree.best_estimator_

# Predict labels for test set
initial_predictions = best_decision_tree.predict(X_test)

# Calculate accuracy for the model
accuracy = accuracy_score(y_test, initial_predictions)
print('Test set accuracy of the best Decision Tree: {:.2f}'.format(accuracy)) 

# Predict probabilities of the positive class for the test set
initial_probabilities = best_decision_tree.predict_proba(X_test)[:, 1]

# Calculate ROC AUC score for the test set
roc_auc = roc_auc_score(y_test, initial_probabilities)

# Print ROC AUC score
print('Test set ROC AUC score: {:.3f}'.format(roc_auc))

In [None]:
best_decision_tree

In [None]:
# Hyperparameter tuning for AdaBoost
# Define parameter grid for AdaBoost
ada_parameters = {
    'n_estimators': [160, 180, 200],
    'learning_rate': [0.9, 1.0, 1.1, 1.2, 1.3]
}

# Initialize a Decision Tree with specified hyperparameters
base_tree = DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.08, random_state=42)

# Initialize the AdaBoost classifier using the decision tree
ada_classifier = AdaBoostClassifier(base_estimator=base_tree, random_state=42)

# Set up GridSearchCV for AdaBoost
grid_search_ada = GridSearchCV(estimator=ada_classifier,
                               param_grid=ada_parameters,
                               scoring='roc_auc',
                               cv=5,
                               n_jobs=-1)

# Fit the grid search to the training data
grid_search_ada.fit(X_train, y_train)

# Display the best parameters found by the grid search
print(grid_search_ada.best_params_)

In [None]:
# Initialize a Decision Tree with tuned hyperparameters
final_decision_tree = DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.08, random_state=42)
final_ada_boost = AdaBoostClassifier(base_estimator=final_decision_tree, n_estimators=200, learning_rate=1.3, random_state=42)
final_ada_boost.fit(X_train, y_train)
final_predictions = final_ada_boost.predict(X_test)
final_accuracy = accuracy_score(y_test, final_predictions)
print('Final Decision Tree Model Test Accuracy: {:.2f}'.format(final_accuracy))
final_probabilities = final_ada_boost.predict_proba(X_test)[:,1]
final_roc_auc = roc_auc_score(y_test, final_probabilities)
final_confusion_matrix = confusion_matrix(y_test, final_predictions)
print("\nConfusion Matrix for AdaBoosted Decision Tree:")
print("Predicted No anxiety| Predicted Anxiety")
print(f"                   {final_confusion_matrix[0]}  No Anxiety")
print(f"                   {final_confusion_matrix[1]}  Anxiety\n")

# Calculate mean squared error and root mean squared error
final_mse = mean_squared_error(y_test, final_predictions)
final_root_mse = final_mse ** (1/2)
print(f"Mean Squared Error: {final_mse}")
print(f"Root Mean Squared Error: {round(final_root_mse, 2)}\n\n")

# Generate classification report
print(classification_report(y_test, final_predictions))

# Display the AUC score of the final model
print('\nArea Under the Curve (AUC) Score of Final Decision Tree Model: {:.2f}'.format(final_roc_auc))

In [None]:
# Generate ROC plot and calculate AUC score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, final_predictions)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(false_positive_rate, true_positive_rate)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for AdaBoosted Classification Model')
plt.show()

# Display the AUC score of the final model
print('\nArea Under the Curve (AUC) Score of Final Decision Tree Model: {:.2f}'.format(final_roc_auc))