In [43]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import *
from joblib import dump

In [3]:
df = pd.read_csv('/datasets/train_data_us.csv')
df.loc[df['last_price'] > 113000, 'price_class'] = 1
df.loc[df['last_price'] <= 113000, 'price_class'] = 0
features = df.drop(['last_price', 'price_class'], axis=1)
target = df['price_class']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(features,target,test_size=0.2, random_state=42)
len(y_train),len(y_test)

(5196, 1299)

In [12]:
tree_param = {'n_estimators':list(range(1, 11)),'max_depth':[2, 4, 8, 12, 16, 20], 'min_samples_leaf':[4,8, 10, 12], 'max_leaf_nodes':[7, 9]}

In [13]:
clf = GridSearchCV(RandomForestClassifier(), tree_param, cv=5, scoring=make_scorer(score_func=accuracy_score, greater_is_better=True))
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [14]:
clf.best_params_

{'max_depth': 20,
 'max_leaf_nodes': 9,
 'min_samples_leaf': 10,
 'n_estimators': 8}

In [31]:
params = {'max_depth': 12,
 'max_leaf_nodes': 9,
 'min_samples_leaf': 8}
model = RandomForestClassifier(n_estimators=25)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
y_pred = model.predict(X_test)

In [33]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.91      0.88      0.89       677
         1.0       0.88      0.90      0.89       622

    accuracy                           0.89      1299
   macro avg       0.89      0.89      0.89      1299
weighted avg       0.89      0.89      0.89      1299



In [29]:
train_scoreNum, test_scoreNum = validation_curve(
                                RandomForestClassifier(),
                                X = X_train, y = y_train, 
                                param_name = 'n_estimators', 
                                param_range = range(1,25), cv = 3)

In [30]:
train_scoreNum, test_scoreNum

(array([[0.94195784, 0.93677829, 0.93997114],
        [0.93011839, 0.93620092, 0.94372294],
        [0.97458851, 0.97806005, 0.97662338],
        [0.97372221, 0.97199769, 0.97344877],
        [0.98469535, 0.98325635, 0.98556999],
        [0.98700549, 0.98527714, 0.98008658],
        [0.99047069, 0.99076212, 0.98903319],
        [0.99133699, 0.98960739, 0.99047619],
        [0.99335836, 0.99422633, 0.99336219],
        [0.99364713, 0.99393764, 0.995671  ],
        [0.99335836, 0.99624711, 0.99480519],
        [0.99278083, 0.99422633, 0.99307359],
        [0.99624603, 0.9965358 , 0.9965368 ],
        [0.99595726, 0.99393764, 0.9962482 ],
        [0.99480219, 0.9948037 , 0.9962482 ],
        [0.9965348 , 0.99566975, 0.997114  ],
        [0.99682356, 0.99711316, 0.9968254 ],
        [0.9965348 , 0.99769053, 0.9959596 ],
        [0.99711233, 0.99797921, 0.9979798 ],
        [0.99711233, 0.99740185, 0.9979798 ],
        [0.9991337 , 0.99740185, 0.9988456 ],
        [0.99768986, 0.99769053, 0

In [36]:
len(train_scoreNum)

24

In [34]:
import joblib
joblib.dump(model, 'rfclassifier.joblib')

['rfclassifier.joblib']

In [37]:
model = LogisticRegression(random_state=12345)
model.fit(X_train, y_train)
joblib.dump(model, 'logreg.joblib')



['logreg.joblib']

In [39]:
df = pd.read_csv('/datasets/train_data_us.csv')
features = df.drop(['last_price'], axis=1)
target = df['last_price'] / 100000
X_train, X_test, y_train, y_test = train_test_split(features,target,test_size=0.2, random_state=42)
len(y_train),len(y_test)

(5196, 1299)

In [45]:
model = DecisionTreeRegressor().fit(X_train, y_train)
# joblib.dump(model, 'dtr.joblib')
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred)

4.991321516948247

In [46]:
model = RandomForestRegressor(n_estimators=3).fit(X_train, y_train)
# joblib.dump(model, 'rtr.joblib')
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred)

1.7793841580380398

In [47]:
model = LinearRegression().fit(X_train, y_train)
# joblib.dump(model, 'lr.joblib')
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred)

2.017432758829753

For decision tree model, iterate over max_depth values from 1 to 4.
For random forest model, iterate over n_estimators values from 10 to 50. Take only factors of 10 (10, 20, 30, 40, 50). Limit the maximum depth: max_depth=10.

In [51]:
tree_param = {'n_estimators':list(range(10, 51, 10)),'max_depth':[10]}
clf = GridSearchCV(RandomForestRegressor(), tree_param, cv=5, scoring=make_scorer(score_func=mean_squared_error))
clf.fit(features, target)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [10],
            

In [53]:
clf.best_params_

{'max_depth': 10, 'n_estimators': 40}

In [54]:
params = {'max_depth': 10, 'n_estimators': 40}
model = RandomForestRegressor(**params).fit(features, target)

In [56]:
joblib.dump(model, 'rfr.joblib')

['rfr.joblib']

In [55]:
df = pd.read_csv('/datasets/train_data_us.csv')
features = df.drop(['last_price'], axis=1)
target = df['last_price'] / 100000
test_df = pd.read_csv('/datasets/test_data_full_us.csv')
test_features = test_df.drop(['last_price'], axis=1)
test_target = test_df['last_price'] / 100000
train_predictions = model.predict(features)
test_predictions = model.predict(test_features)
rmse_train = mean_squared_error(target, train_predictions)**0.5
rmse_test = mean_squared_error(test_target, test_predictions)**0.5
# assert rmse_test <= 1.5
print("RMSE")
print("Training set:", rmse_train)
print("Test set:", rmse_test)

RMSE
Training set: 0.6518891054365964
Test set: 1.412539358035851
