In [1]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [31]:
# import dataset of iris
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [38]:
# split the data
X = df.drop('cut' , axis=1)
y = df['cut']

# encode the input variables
le = LabelEncoder()
X['color'] = le.fit_transform(X['color'])
X['clarity'] = le.fit_transform(X['clarity'])
# encode the target variable
y = le.fit_transform(y)

X_train , X_test , y_train , y_test = train_test_split(X, y , test_size=0.8 , random_state=42)


In [39]:
%%time
# train th decision tree classifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train , y_train)
y_pred = dtc.predict(X_test)

# print the matrix
print(f'accuracy_score : {accuracy_score(y_test , y_pred)}')
print(f'precision_score : {precision_score(y_test , y_pred , average="macro")}')
print(f'recall_score : {recall_score(y_test , y_pred , average="macro")}')
print(f'f1_score : {f1_score(y_test , y_pred , average="macro")}')


accuracy_score : 0.680084352984798
precision_score : 0.6824864851661705
recall_score : 0.6764079112398782
f1_score : 0.679315815349331
CPU times: user 291 ms, sys: 976 μs, total: 292 ms
Wall time: 290 ms


In [64]:

# train the random forest classifier
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(X_train , y_train)
y_pred = rfc.predict(X_test)

# print the matrix
print(f'accuracy_score : {accuracy_score(y_test , y_pred)}')
print(f'precision_score : {precision_score(y_test , y_pred , average="macro")}')
print(f'recall_score : {recall_score(y_test , y_pred , average="macro")}')
print(f'f1_score : {f1_score(y_test , y_pred , average="macro")}')

accuracy_score : 0.7626529477196885
precision_score : 0.7743812944168045
recall_score : 0.7384791150083592
f1_score : 0.7523269974270915


In [55]:

# train the xgboost model
xgb = XGBClassifier(n_estimators=300 , max_depth=5 , random_state=42)
xgb.fit(X_train , y_train)
y_pred = xgb.predict(X_test)

# print the matrix
print(f'accuracy_score : {accuracy_score(y_test , y_pred)}')
print(f'precision_score : {precision_score(y_test , y_pred , average="macro")}')
print(f'recall_score : {recall_score(y_test , y_pred , average="macro")}')
print(f'f1_score : {f1_score(y_test , y_pred , average="macro")}')


accuracy_score : 0.7800565443084909
precision_score : 0.7884973816201853
recall_score : 0.7643424099735553
f1_score : 0.7750805043742544
