In [53]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from joblib import dump

In [6]:
df = pd.read_csv('./datasets/train_data_us.csv')
df.loc[df['last_price'] > 113000, 'price_class'] = 1
df.loc[df['last_price'] <= 113000, 'price_class'] = 0
features = df.drop(['last_price', 'price_class'], axis=1)
target = df['price_class']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features,target,test_size=0.2, random_state=42)
len(y_train),len(y_test)

(5196, 1299)

In [9]:
model = DecisionTreeClassifier(random_state=42, max_depth=3)
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [15]:
y_pred = model.predict(X_test)

In [16]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.86      0.87      0.87       656
         1.0       0.86      0.86      0.86       643

    accuracy                           0.86      1299
   macro avg       0.86      0.86      0.86      1299
weighted avg       0.86      0.86      0.86      1299



In [38]:
tree_param = {'criterion':['gini','entropy'],'max_depth':[2, 4, 8, 12], 'min_samples_leaf':[4,8], 'max_leaf_nodes':[7, 9]}

In [39]:
clf = GridSearchCV(DecisionTreeClassifier(), tree_param, cv=5, scoring=make_scorer(score_func=accuracy_score, greater_is_better=True))
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                   

In [48]:
y_pred = clf.predict(X_test)

In [49]:
y_pred = model.predict(X_test)

In [51]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_leaf_nodes': 9,
 'min_samples_leaf': 4}

In [58]:
accuracy_score(y_test, y_pred)

0.8891454965357968

In [54]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88       706
         1.0       0.83      0.90      0.86       593

    accuracy                           0.87      1299
   macro avg       0.87      0.87      0.87      1299
weighted avg       0.87      0.87      0.87      1299



In [57]:
rfc = RandomForestClassifier().fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [61]:
params = {'criterion': 'gini',
 'max_depth': 4,
 'max_leaf_nodes': 9,
 'min_samples_leaf': 4}
model = DecisionTreeClassifier(**params, random_state=42)
model.fit(features, target)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [62]:
# save model 
# first argument is model 
# second argument is path to file
import joblib
joblib.dump(model, 'model.joblib')

['model.joblib']