In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

In [5]:
df = pd.read_csv("diabetes.csv")
train_df, test_df = train_test_split(df, random_state=111)
train_df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
367,40,Female,Yes,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Yes,Yes,No,No,Positive
145,61,Male,Yes,No,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,Positive
258,35,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,Yes,No,No,Positive
238,72,Male,No,No,No,No,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,No,Negative
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 390 entries, 367 to 86
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 390 non-null    int64 
 1   Gender              390 non-null    object
 2   Polyuria            390 non-null    object
 3   Polydipsia          390 non-null    object
 4   sudden weight loss  390 non-null    object
 5   weakness            390 non-null    object
 6   Polyphagia          390 non-null    object
 7   Genital thrush      390 non-null    object
 8   visual blurring     390 non-null    object
 9   Itching             390 non-null    object
 10  Irritability        390 non-null    object
 11  delayed healing     390 non-null    object
 12  partial paresis     390 non-null    object
 13  muscle stiffness    390 non-null    object
 14  Alopecia            390 non-null    object
 15  Obesity             390 non-null    object
 16  class               390 non-nu

In [7]:
numeric_features = ["Age"]
target_column = ["class"]
binary_features = list(
    set(train_df.columns)
    - set(numeric_features)
    - set(target_column)
)

In [8]:
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(drop = "if_binary"), binary_features),
)

In [9]:
preprocessor.fit(train_df);

In [13]:
new_columns = (
    numeric_features
    + binary_features
)
print(new_columns)

['Age', 'Obesity', 'sudden weight loss', 'partial paresis', 'muscle stiffness', 'Polyuria', 'visual blurring', 'Alopecia', 'weakness', 'Genital thrush', 'Irritability', 'Polydipsia', 'Itching', 'Polyphagia', 'Gender', 'delayed healing']


In [14]:
X_train_enc = pd.DataFrame(
    preprocessor.transform(train_df), index=train_df.index, columns = new_columns
)
X_test_enc = pd.DataFrame(
    preprocessor.transform(train_df), index=train_df.index, columns = new_columns
)

In [15]:
X_train_enc.head()

Unnamed: 0,Age,Obesity,sudden weight loss,partial paresis,muscle stiffness,Polyuria,visual blurring,Alopecia,weakness,Genital thrush,Irritability,Polydipsia,Itching,Polyphagia,Gender,delayed healing
367,-0.697284,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
145,1.046992,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
258,-1.112588,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
238,1.96066,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
517,0.797809,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
# Helper function from CPSC330: Applied Machine Learning at UBC, available here:
# https://github.com/UBC-CS/cpsc330-2023W2/blob/main/hw/hw3/hw3.ipynb 

def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)