### ✅ Kütüphaneler Yüklendi

In [66]:
!pip install catboost



In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
import joblib

### ✅ Veriseti Yüklendi

In [68]:
data = pd.read_csv("car_acceptability.txt", sep=",", header=None)
data.columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,?,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,?,unacc


### ✅ EDA İşlemleri

In [69]:
# Verisetimizi tanıyalım:
## buying : Arabanın satınalma fiyatı (v-high, high, med, low)  Örnek: low, med, high
## maint : Arabnın bakım masrafı (v-high, high, med, low)  Örnek: low, med, high
## doors : Arabanın kapı sayısı (2, 3, 4, 5-more)  Örnek: 2, 3, 4
## persons : Kişi kapasitesi (2, 4, more)  Örnek: 2, 4, more
## lug_boot : Bagaj büyüklüğü (small, med, big)  Örnek: small, med, big
## safety : Arabanın güvenlik ölçsü (low, med, high)  Örnek: low, med, high
## class : Arabanın satınalma kabul ölçütü (unacc: Alınması kabul edilmez, acc: Alınabilir, good: Alınırsa iyi olur, v-good: Alınırsa çok iyi olur)  Örnek: unacc, acc, good

In [70]:
# Kopya üzerinde çalışmak her zaman daha güvenli...
dt = data.copy()

In [71]:
# Verisetimizin satır ve sütun sayısını öğrendik
dt.shape

(1729, 7)

In [72]:
dt.info()
# Verisetimiz hakkında genel bilgiler edindik

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1729 entries, 0 to 1728
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1729 non-null   object
 2   doors     1729 non-null   object
 3   persons   1729 non-null   object
 4   lug_boot  1729 non-null   object
 5   safety    1728 non-null   object
 6   class     1729 non-null   object
dtypes: object(7)
memory usage: 94.7+ KB


In [73]:
object_columns = dt.select_dtypes("object").columns
for col in object_columns:
  dt[col] = dt[col].astype('category')

In [74]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1729 entries, 0 to 1728
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   buying    1728 non-null   category
 1   maint     1729 non-null   category
 2   doors     1729 non-null   category
 3   persons   1729 non-null   category
 4   lug_boot  1729 non-null   category
 5   safety    1728 non-null   category
 6   class     1729 non-null   category
dtypes: category(7)
memory usage: 13.7 KB


In [75]:
dt.isna().sum()
# Buying ve Safety özelliklerinde null değerler olduğunu gördük.

buying      1
maint       0
doors       0
persons     0
lug_boot    0
safety      1
class       0
dtype: int64

In [76]:
def valueCounts_of_Data(par):
  if par == 'All':
    for col in dt.columns:
      print(dt[col].name)
      print(dt[col].value_counts())
      print("\n")
  else:
    print(dt[par].name)
    print(dt[par].value_counts())

# Özelliklerin değer sayısını görebileceğimiz bir fonksiyon oluşturduk

In [77]:
valueCounts_of_Data('All')

buying
high     432
med      431
low      430
vhigh    430
?          3
-          1
düsük      1
Name: buying, dtype: int64


maint
high     432
low      432
med      432
vhigh    429
?          4
Name: maint, dtype: int64


doors
3        432
2        430
4        430
5more    430
?          3
-          1
44         1
5+         1
iki        1
Name: doors, dtype: int64


persons
4       576
more    576
2       574
?         3
Name: persons, dtype: int64


lug_boot
big      576
small    576
med      575
?          2
Name: lug_boot, dtype: int64


safety
high    576
med     574
low     573
-         2
?         2
*         1
Name: safety, dtype: int64


class
unacc    1209
acc       384
good       69
vgood      65
?           2
Name: class, dtype: int64




#### ✔ Hatalı Yazımları Düzeltme İşlemleri

In [78]:
valueCounts_of_Data('buying')

buying
high     432
med      431
low      430
vhigh    430
?          3
-          1
düsük      1
Name: buying, dtype: int64


In [79]:
dt["buying"] = dt["buying"].str.replace('düsük',"low")
valueCounts_of_Data('buying')

buying
high     432
med      431
low      431
vhigh    430
?          3
-          1
Name: buying, dtype: int64


In [80]:
valueCounts_of_Data('doors')

doors
3        432
2        430
4        430
5more    430
?          3
-          1
44         1
5+         1
iki        1
Name: doors, dtype: int64


In [81]:
dt["doors"] = dt["doors"].str.replace('iki','2')
dt["doors"] = dt["doors"].str.replace('44','4')
dt["doors"] = dt["doors"].str.replace('+','more')
valueCounts_of_Data('doors')

doors
3        432
2        431
4        431
5more    431
?          3
-          1
Name: doors, dtype: int64


#### ✔ Herhangi bir sınıfı olmayan kayıtların işlemleri

In [82]:
for col in dt.columns:
  dt[col] = dt[col].replace(to_replace = '?', value=np.NAN)
  dt[col] = dt[col].replace(to_replace = '*', value=np.NAN)
  dt[col] = dt[col].replace(to_replace = '-', value=np.NAN)

# Şüpheli tüm değerleri "NAN" yaptık

In [83]:
valueCounts_of_Data('All')

buying
high     432
med      431
low      431
vhigh    430
Name: buying, dtype: int64


maint
high     432
low      432
med      432
vhigh    429
Name: maint, dtype: int64


doors
3        432
2        431
4        431
5more    431
Name: doors, dtype: int64


persons
4       576
more    576
2       574
Name: persons, dtype: int64


lug_boot
big      576
small    576
med      575
Name: lug_boot, dtype: int64


safety
high    576
med     574
low     573
Name: safety, dtype: int64


class
unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64




In [84]:
dt.isna().sum()
# NAN değerleri görelim

buying      5
maint       4
doors       4
persons     3
lug_boot    2
safety      6
class       2
dtype: int64

In [85]:
dt[dt.isnull().any(axis=1)]
# Sadece NaN olan satırları görelim

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
2,vhigh,vhigh,2,,small,high,unacc
4,vhigh,vhigh,2,2,med,,unacc
13,vhigh,,2,4,med,med,unacc
21,,,2,more,,low,unacc
61,,vhigh,,2,big,med,
951,med,,5more,2,big,low,unacc
968,med,vhigh,,more,med,high,acc
975,med,high,2,2,med,,unacc
993,,high,2,more,med,low,unacc
1048,med,high,4,more,med,,acc


In [86]:
dtm = dt.copy()
# Yeni bir kopya üzerinde çalışalım ...

In [87]:
dtm.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2.0,small,low,unacc
1,vhigh,vhigh,2,2.0,small,med,unacc
2,vhigh,vhigh,2,,small,high,unacc
3,vhigh,vhigh,2,2.0,med,low,unacc
4,vhigh,vhigh,2,2.0,med,,unacc


In [88]:
dtm = dtm.dropna()
# NAN değer içeren kayıtları silelim.
# NAN değer içeren kayıt sayımız çok fazla değil. Gözardı edilebilir.
# Ayrıca eksik değer doldurması yaparak çok gerçekçi bir noktaya gelemeyeceğiz
# Sınıflandırma algoritmalarında aykırı değerlerin bir önemi yok denebilir. BU aşamada eksik değerler içinde aynısı düşünülebilir.

### ✅ Model Bölümleme ve Hazırlık

In [89]:
mdl = dtm.copy()
mdl.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
3,vhigh,vhigh,2,2,med,low,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc


In [90]:
# Hedef özelliğimizi verisetinden ayırdık.
X = mdl.drop('class', axis=1)
y = mdl['class']

In [91]:
for col in X.columns:
  X[col] = LabelEncoder().fit_transform(X[col])
# Makine öğrenimi sayıları sever. LabelEncoder ile sınıflarımızı sayısallaştıralım. Böylece öğrenme hızlansın.

In [92]:
scaler = StandardScaler()
X.iloc[:,:] = scaler.fit_transform(X.iloc[:,:])
# Sınıflarımızın değerlerini 0 ile 1 arasına çekelim. Böylece daha işlerlikli bir verisetine olsun.
# CatBoost gibi algoritmalar için bu aşamalar gözardı edilebilir. Çünkü kendi içerisinde yapıyor.

In [93]:
print(X.head());

     buying     maint     doors   persons  lug_boot    safety
0  1.344573  1.348453 -1.345723 -1.227787  1.222783  0.000714
1  1.344573  1.348453 -1.345723 -1.227787  1.222783  1.224210
3  1.344573  1.348453 -1.345723 -1.227787 -0.000714  0.000714
5  1.344573  1.348453 -1.345723 -1.227787 -0.000714 -1.222783
6  1.344573  1.348453 -1.345723 -1.227787 -1.224210  0.000714


In [94]:
# Verisetini bölümledik.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [95]:
print(f"X shape : {X.shape}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("\n")
print(f"y shape : {y.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X shape : (1714, 6)
X_train shape: (1371, 6)
X_test shape: (343, 6)


y shape : (1714,)
y_train shape: (1371,)
y_test shape: (343,)


In [96]:
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1683,-0.44263,-0.444603,0.446483,-0.002145,1.222783,0.000714
748,-1.336231,0.451925,1.342586,1.223497,1.222783,1.22421
1286,0.450971,-0.444603,1.342586,-0.002145,-1.22421,-1.222783
1161,0.450971,0.451925,1.342586,-1.227787,1.222783,0.000714
119,1.344573,-1.34113,-1.345723,-0.002145,1.222783,-1.222783


## ✅ Modeller

In [97]:
def dsModel_metrics(par_test, par_pred, par_model):
  print(f"Accuracy Score : {accuracy_score(par_test, par_pred)}");
  print(f"\nCross Validation Score: {(cross_val_score(par_model, X, y, cv=5).mean())}");
  print("\nConfusion Matrix :");
  print(confusion_matrix(par_test,par_pred));
  print("\nClassification Report :");
  print(classification_report(par_test, par_pred));

### ⚡ Model : Decision Tree

#### ✔ Model Kuralım

In [98]:
dt_model = DecisionTreeClassifier(criterion='entropy', splitter = 'best', max_depth=7, min_samples_split = 3, max_features = 5, random_state=53)

In [99]:
dt_model.fit(X_train, y_train)

In [100]:
dt_model_pred = dt_model.predict(X_test)

In [101]:
dsModel_metrics(y_test, dt_model_pred, dt_model)

Accuracy Score : 0.892128279883382

Cross Validation Score: 0.7765928426508448

Confusion Matrix :
[[ 60   0   2   5]
 [ 15   0   1   0]
 [ 12   0 240   0]
 [  2   0   0   6]]

Classification Report :
              precision    recall  f1-score   support

         acc       0.67      0.90      0.77        67
        good       0.00      0.00      0.00        16
       unacc       0.99      0.95      0.97       252
       vgood       0.55      0.75      0.63         8

    accuracy                           0.89       343
   macro avg       0.55      0.65      0.59       343
weighted avg       0.87      0.89      0.88       343



#### ✔ Tune Edelim

In [102]:
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}

In [103]:
tune_model = DecisionTreeClassifier()

In [104]:
random_search = RandomizedSearchCV(tune_model, param_distributions=param_dist, cv=5, n_iter=5, verbose = 1)

In [105]:
dt_tune = random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [106]:
print("Tuned Parameters: {}".format(dt_tune.best_params_))
print("Best score is {}".format(dt_tune.best_score_))

Tuned Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 3}
Best score is 0.95550099535501


### ⚡ Model : Random Forest Classifier

#### ✔ Model Kuralım

In [107]:
rf_model = RandomForestClassifier (n_estimators = 200,
                                  criterion = 'entropy',
                                  max_depth = 5,
                                  min_samples_split= 5,
                                  max_features = 3,
                                  bootstrap = True,
                                  max_samples = 0.6 )

In [108]:
rf_model.fit(X_train, y_train)

In [109]:
rf_model_pred = rf_model.predict(X_test)

In [110]:
dsModel_metrics(y_test, rf_model_pred, rf_model)

Accuracy Score : 0.8775510204081632

Cross Validation Score: 0.7695974630453686

Confusion Matrix :
[[ 54   0  13   0]
 [ 16   0   0   0]
 [  7   0 245   0]
 [  6   0   0   2]]

Classification Report :
              precision    recall  f1-score   support

         acc       0.65      0.81      0.72        67
        good       0.00      0.00      0.00        16
       unacc       0.95      0.97      0.96       252
       vgood       1.00      0.25      0.40         8

    accuracy                           0.88       343
   macro avg       0.65      0.51      0.52       343
weighted avg       0.85      0.88      0.86       343



#### ✔ Tune edelim

In [111]:
param_dist = {'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80],
              'max_features': ['auto', 'sqrt'],
              'max_depth': [2, 4],
              'min_samples_split': [2, 5],
              'min_samples_leaf': [1, 2],
              'bootstrap': [True, False]
              }

In [112]:
tune_model = RandomForestClassifier()

In [113]:
random_search = RandomizedSearchCV(tune_model, param_distributions=param_dist, cv=5, n_iter=5, verbose = 2, n_jobs = 4)

In [114]:
rf_tune = random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [115]:
print("Tuned Parameters: {}".format(rf_tune.best_params_))
print("Best score is {}".format(rf_tune.best_score_))

Tuned Parameters: {'n_estimators': 41, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 4, 'bootstrap': False}
Best score is 0.8015952222959524


### ⚡ Model : CatBoost Classifier

In [116]:
tune_model = CatBoostClassifier(n_estimators=300, task_type="GPU")

In [117]:
param_dist = {
  "learning_rate": [0.03, 0.1, 0.3],
  "depth": [3, 5, 7],
  "l2_leaf_reg": [1, 3, 5]
}

In [118]:
random_search = RandomizedSearchCV(tune_model, param_distributions=param_dist, cv=3, n_iter=10, verbose = 2)

In [119]:
cb_tune = random_search.fit(X_train, y_train, verbose=False)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..........depth=7, l2_leaf_reg=3, learning_rate=0.1; total time=  10.8s
[CV] END ..........depth=7, l2_leaf_reg=3, learning_rate=0.1; total time=  12.2s
[CV] END ..........depth=7, l2_leaf_reg=3, learning_rate=0.1; total time=   2.9s
[CV] END .........depth=5, l2_leaf_reg=5, learning_rate=0.03; total time=   1.5s
[CV] END .........depth=5, l2_leaf_reg=5, learning_rate=0.03; total time=   1.5s
[CV] END .........depth=5, l2_leaf_reg=5, learning_rate=0.03; total time=   1.5s
[CV] END ..........depth=3, l2_leaf_reg=1, learning_rate=0.3; total time=   1.7s
[CV] END ..........depth=3, l2_leaf_reg=1, learning_rate=0.3; total time=   2.5s
[CV] END ..........depth=3, l2_leaf_reg=1, learning_rate=0.3; total time=   1.3s
[CV] END .........depth=7, l2_leaf_reg=1, learning_rate=0.03; total time=   2.2s
[CV] END .........depth=7, l2_leaf_reg=1, learning_rate=0.03; total time=   2.3s
[CV] END .........depth=7, l2_leaf_reg=1, learni

In [120]:
print("Tuned Parameters: {}".format(cb_tune.best_params_))
print("Best score is {}".format(cb_tune.best_score_))

Tuned Parameters: {'learning_rate': 0.3, 'l2_leaf_reg': 1, 'depth': 3}
Best score is 0.9912472647702407


## ✅ Modeli Dışa Aktarma

In [121]:
joblib.dump(cb_tune, 'Araba.joblib')

['Araba.joblib']