In [21]:
# Import necessary libraries
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, mean_squared_error, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier # example model
import pandas as pd
import numpy as np

In [22]:
# save train csv file in train data
train_data = pd.read_csv(r"C:\Users\USER\Desktop\Fakson_web\Programs\lessons\Python\Jupyter\dataset\vicsdata\KaggleV2-May-2016.csv")
train_data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [23]:
# import LabelEncoder from sklearn
from sklearn.preprocessing import LabelEncoder

# Drop the original column after splitting (optional)
drop_column = ['AppointmentDay','ScheduledDay','PatientId']
filt_data = train_data.drop(drop_column, axis=1)

# Assuming 'x_train' contains a column with string values
label_encoder = LabelEncoder()

# Apply the label encoder to the specific column (e.g., 'category_column')
categorical_columns = ['Neighbourhood', 'Gender']
label_encoder = LabelEncoder()
for col in categorical_columns:
    filt_data[col] = label_encoder.fit_transform(filt_data[col])

print(train_data.shape)
x = filt_data.drop('No-show',axis=1)
x.head()

(110527, 14)


Unnamed: 0,AppointmentID,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
0,5642903,0,62,39,0,1,0,0,0,0
1,5642503,1,56,39,0,0,0,0,0,0
2,5642549,0,62,45,0,0,0,0,0,0
3,5642828,0,8,54,0,0,0,0,0,0
4,5642494,0,56,39,0,1,1,0,0,0


In [24]:
# Define the target (y) and features (X)
y = filt_data['No-show']  # Target

# Split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1000)

# Check the shapes of x_train and y_train
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
y_train.head()

x_train shape: (88421, 10)
y_train shape: (88421,)


94836    Yes
81426    Yes
1273      No
82250    Yes
90167    Yes
Name: No-show, dtype: object

In [25]:
rl = RandomForestClassifier()
dc = DecisionTreeClassifier()
dc.fit(x_train, y_train) 

In [26]:
show_up_pred = dc.predict(x_train)
print(show_up_pred.shape)

(88421,)


In [27]:
model_accuracy = accuracy_score(show_up_pred, y_train)
print(f'Train model_accuracy: {model_accuracy}')
test_pred = dc.predict(x_test)
test_accuracy = accuracy_score(test_pred, y_test)
print(f'Train model_accuracy: {test_accuracy}')

Train model_accuracy: 1.0
Train model_accuracy: 0.709219216502307


In [28]:
# 'macro' calculates recall for each class and then averages them.
recall = recall_score(y_test, test_pred, average='macro')

# Step 7: Print the recall
print(f"Recall of the model: {recall:.2f}")


Recall of the model: 0.57


In [29]:
# Calculate the F1 score
f1 = f1_score(y_test, test_pred, average='macro')  # Use 'weighted' for multiclass, 'binary' for binary classification

# Print the F1 score
print(f'F1 Score: {f1}')

F1 Score: 0.5642010268166029


In [30]:
# Step 6: Calculate precision
# For multiclass classification, specify the `average` parameter.
# 'macro' calculates precision for each class and then averages them.
precision = precision_score(y_test, test_pred, average='macro')

# Step 7: Print the precision
print(f"Precision of the model: {precision:.2f}")

Precision of the model: 0.56


In [31]:
report = classification_report(y_test, test_pred)
print(f'report{report}')

report              precision    recall  f1-score   support

          No       0.82      0.81      0.82     17580
         Yes       0.30      0.32      0.31      4526

    accuracy                           0.71     22106
   macro avg       0.56      0.57      0.56     22106
weighted avg       0.72      0.71      0.71     22106



In [32]:
predictions_df = pd.DataFrame({'Predicted Age': test_pred, 'data_test':y_test, 'report':report})

# Save to CSV
predictions_df.to_csv(r'C:\Users\USER\Desktop\Fakson_web\Programs\lessons\Python\Jupyter\dataset\predicted_show_ups.csv', index=False)
predictions_df

Unnamed: 0,Predicted Age,data_test,report
48833,Yes,No,precision recall f1-score ...
16531,No,No,precision recall f1-score ...
45267,No,No,precision recall f1-score ...
104711,No,No,precision recall f1-score ...
11493,Yes,No,precision recall f1-score ...
...,...,...,...
20926,Yes,Yes,precision recall f1-score ...
40559,No,No,precision recall f1-score ...
57851,No,No,precision recall f1-score ...
88443,Yes,No,precision recall f1-score ...
