In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Evaluation matrix for classification

In [8]:
# import dataset
heart_disease = pd.read_csv("../../dataset/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [10]:
heart_disease.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [99]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier 

# Create X, Y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create model
clf = RandomForestClassifier()

# Fit model
clf.fit(X_train, y_train)

# Make Prediction
y_pred = clf.predict(X_test)

# Evaluate model using evaluation function
print(f"Accuracy Score:{ accuracy_score(y_test, y_pred)}")
print(f"Precision Score: {precision_score(y_test, y_pred)}")
print(f"Roc Curv Score: {roc_auc_score(y_test, y_pred)}")
print(f"Recall Score: {recall_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(f"Score: {clf.score(X_test, y_test)}")
print(f"Cross Val Score: {cross_val_score(clf, X, y, cv=5)}")
print(f"Cross Val Mean Score: {np.mean(cross_val_score(clf, X, y, cv=5))}")


Accuracy Score:0.8688524590163934
Precision Score: 0.8529411764705882
Roc Curv Score: 0.8669181034482758
Recall Score: 0.90625
F1 Score: 0.8787878787878788
Score: 0.8688524590163934
Cross Val Score: [0.7704918  0.90163934 0.81967213 0.81666667 0.81666667]
Cross Val Mean Score: 0.8248633879781421


## Evaluation model using Scroling Parameter

In [101]:
cv_acc = cross_val_score(clf, X, y, cv=10, scoring="accuracy")
cv_precision = cross_val_score(clf, X, y, cv=10, scoring="precision")
cv_recall = cross_val_score(clf, X, y, cv=10, scoring="recall")
cv_f1_score = cross_val_score(clf, X, y, cv=10, scoring="f1")
print(f"Heart Disease Classifier Cross Validate Accuracy:{np.mean(cv_acc)*100:.2f}%")
print(f"Heart Disease Classifier Cross Validate Precision {np.mean(cv_precision)*100:.2f}%")
print(f"Heart Disease Classifier Cross Validate Recall: {np.mean(cv_recall)*100:.2f}%")
print(f"Heart Disease Classifier Cross Validate f1: {np.mean(cv_f1_score)*100:.2f}%")

Heart Disease Classifier Cross Validate Accuracy:83.46%
Heart Disease Classifier Cross Validate Precision 83.64%
Heart Disease Classifier Cross Validate Recall: 86.58%
Heart Disease Classifier Cross Validate f1: 84.71%


## Evaluation matrix for regression

In [171]:
# import dataset
salary_df = pd.read_csv("../../dataset/Salary_Data_extend.csv")
salary_df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
370,35.0,Female,Bachelor's,Senior Marketing Analyst,8.0,85000.0
371,43.0,Male,Master's,Director of Operations,19.0,170000.0
372,29.0,Female,Bachelor's,Junior Project Manager,2.0,40000.0
373,34.0,Male,Bachelor's,Senior Operations Coordinator,7.0,90000.0


In [177]:
salary_df.dropna(inplace=True)

In [193]:
salary_df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [216]:
from sklearn.metrics import r2_score, max_error, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Create X, y
X = salary_df.drop("Salary", axis=1)
y = salary_df["Salary"]

# # Turn the category into number
categorical_features = ["Gender", "Education Level", "Job Title"]
one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

transformer = ColumnTransformer([("one_hot", one_hot, cat_cols)], remainder="passthrough")
transform_X = transformer.fit_transform(X)



# Split model
X_train, X_test, y_train, y_test = train_test_split(transform_X, y, test_size=0.2, random_state=42)


# Create Model
model = DecisionTreeRegressor()

# Fit model
model.fit(X_train, y_train)

# Make Prediction
y_pred = model.predict(X_test)

# Evaluate model using evaluation function
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")



R2 Score: 0.799798315339898
MAE: 11866.666666666666
MSE: 480000000.0


## Evaluation model using Scroling Parameter

In [220]:
np.random.seed(42)
cv_r2 = cross_val_score(model, transform_X, y, cv=3, scoring=None)
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")

# Mean squared error
cv_mse = cross_val_score(model, transform_X, y, cv=5, scoring="neg_mean_squared_error")
np.mean(cv_mse)

# Mean absolute error
cv_mae = cross_val_score(model, transform_X, y, cv=5, scoring="neg_mean_absolute_error")
np.mean(cv_mae)

-13646.288888888888