### No show

In [2]:
import pandas as pd
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Step 1: Download the dataset using kagglehub
path = kagglehub.dataset_download("joniarroba/noshowappointments")
print("Path to dataset files:", path)

# Load the dataset
data = pd.read_csv(f"{path}/KaggleV2-May-2016.csv")  # Adjust filename based on the extracted dataset

# Check for missing values and drop them
print("Missing values per column:\n", data.isnull().sum())
data.dropna(inplace=True)

# Step 2: Feature extraction
# Extract relevant features
features = data[['Gender', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']]
target = data['No-show']

# Step 3: Preprocessing
# Encode categorical variables
encoder = LabelEncoder()
features['Gender'] = encoder.fit_transform(features['Gender'])
target = encoder.fit_transform(target)

# Scale numerical features
scaler = StandardScaler()
features[['Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']] = scaler.fit_transform(
    features[['Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']]
)

# Step 4: Split the data
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 5: Train a Decision Tree Classifier
dt_model = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_model.fit(X_train, y_train)

# Evaluate Decision Tree on validation set
y_val_pred_dt = dt_model.predict(X_val)
print("Decision Tree Validation Accuracy:", accuracy_score(y_val, y_val_pred_dt))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_dt))

# Step 6: Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate Random Forest on validation set
y_val_pred_rf = rf_model.predict(X_val)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, y_val_pred_rf))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_rf))

# Step 7: Test the Random Forest Classifier
y_test_pred_rf = rf_model.predict(X_test)
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_test_pred_rf))
print("Random Forest Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_rf))


  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/joniarroba/noshowappointments?dataset_version_number=5...


100%|██████████| 2.40M/2.40M [00:00<00:00, 8.74MB/s]

Extracting files...





Path to dataset files: C:\Users\justino\.cache\kagglehub\datasets\joniarroba\noshowappointments\versions\5
Missing values per column:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Gender'] = encoder.fit_transform(features['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[['Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']] = scaler.fit_transform(


Decision Tree Validation Accuracy: 0.7946258934226001
Decision Tree Confusion Matrix:
 [[8746   59]
 [2211   37]]
Random Forest Validation Accuracy: 0.7937211616755632
Random Forest Confusion Matrix:
 [[8729   76]
 [2204   44]]
Random Forest Test Accuracy: 0.7994209716818963
Random Forest Test Confusion Matrix:
 [[8787   77]
 [2140   49]]
