In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib


In [13]:
data = pd.read_csv('KaggleV2-May-2016.csv')

In [14]:
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [15]:
data['ScheduledDay'] = pd.to_datetime(data['ScheduledDay'])
data['AppointmentDay'] = pd.to_datetime(data['AppointmentDay'])
data['DaysBetween'] = (data['AppointmentDay'] - data['ScheduledDay']).dt.days
encoder = LabelEncoder()
data['Gender'] = encoder.fit_transform(data['Gender'])
data['No-show'] = data['No-show'].map({'Yes': 1, 'No': 0})

# Prepare data for training, excluding the 'SMS_received' feature
X = data[['Gender', 'DaysBetween', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap']]
y = data['No-show']
# Check class distribution
print("Class distribution before SMOTE:", y.value_counts())

# Balance data using SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
print("Class distribution after SMOTE:", y_smote.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'scaler.joblib')

Class distribution before SMOTE: No-show
0    88208
1    22319
Name: count, dtype: int64
Class distribution after SMOTE: No-show
0    88208
1    88208
Name: count, dtype: int64


['scaler.joblib']

In [16]:
# Initialize the models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Train and evaluate the models
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}\n")

Logistic Regression Accuracy: 0.7948520763593595
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.80      0.99      0.89     17669
           1       0.34      0.02      0.04      4437

    accuracy                           0.79     22106
   macro avg       0.57      0.51      0.46     22106
weighted avg       0.71      0.79      0.72     22106


Decision Tree Accuracy: 0.7727313851443047
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.81      0.93      0.87     17669
           1       0.35      0.16      0.22      4437

    accuracy                           0.77     22106
   macro avg       0.58      0.54      0.54     22106
weighted avg       0.72      0.77      0.74     22106


Random Forest Accuracy: 0.772052836334027
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.82   

In [17]:
# Model training and parameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best model selection
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'model.joblib')

# Evaluate the model
predictions = best_model.predict(X_test_scaled)
print("Accuracy on Test Set:", accuracy_score(y_test, predictions))
print("Classification Report on Test Set:\n", classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

# Print best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Accuracy on Test Set: 0.8000090473174704
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89     17669
           1       0.74      0.01      0.01      4437

    accuracy                           0.80     22106
   macro avg       0.77      0.50      0.45     22106
weighted avg       0.79      0.80      0.71     22106

Confusion Matrix:
 [[17660     9]
 [ 4412    25]]
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
