In [1]:
from pathlib import Path

import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Capture the installed scikit-learn version; it is stored next to the model
# in the checkpoint dicts built later in this notebook.
scikit_learn_version = sklearn.__version__
scikit_learn_version

'0.24.0'

In [9]:
# List the files available in the datasets folder.
# NOTE(review): `dir` is a Windows shell command (the recorded output is a
# Windows listing); on macOS/Linux this cell would need `ls datasets` instead.
!dir datasets

 Volume in drive C is Windows
 Volume Serial Number is 2642-66F6

 Directory of C:\Users\jon_9\Documents\GitHub\PS-flask-demo\datasets

01/11/2021  03:26 PM    <DIR>          .
01/11/2021  03:26 PM    <DIR>          ..
01/11/2021  03:22 PM             6,148 .DS_Store
01/11/2021  03:22 PM            12,401 automobiles_file1.csv
01/11/2021  03:22 PM            12,425 automobiles_file2.csv
01/11/2021  03:22 PM             6,593 automobiles_test.csv
01/11/2021  03:22 PM            26,717 CarPrice_Assignment.csv
01/11/2021  03:22 PM           447,540 sentimental_analysis_data.csv
01/11/2021  03:22 PM            57,685 sentimental_data_evaluation.csv
               7 File(s)        569,509 bytes
               2 Dir(s)  79,311,040,512 bytes free


In [10]:
# Load the first automobiles file, used as the initial training set.
auto_train = pd.read_csv('datasets/automobiles_file1.csv')
auto_train.shape

(82, 52)

In [11]:
# Load the held-out test file; it is used to score both the initial and the
# retrained model.
auto_test = pd.read_csv('datasets/automobiles_test.csv')
auto_test.shape

(41, 52)

In [12]:
# Separate the regression target ('price') from the predictor columns for
# both the training split and the held-out test split.
X_train, y_train = auto_train.drop(columns='price'), auto_train['price']
X_test, y_test = auto_test.drop(columns='price'), auto_test['price']

# Sanity check: features and targets must have matching row counts.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((82, 51), (82,), (41, 51), (41,))

In [14]:
# A deliberately small forest (5 trees). warm_start=True is switched on so the
# same estimator can be refit with a larger n_estimators later in the
# notebook, and random_state pins the seed.
regressor_model = RandomForestRegressor(
    n_estimators=5,
    random_state=42,
    warm_start=True,
)

# fit() returns the fitted estimator (scikit-learn convention), which is kept
# under its own name for the scoring and saving cells.
rfr_model = regressor_model.fit(X_train, y_train)
rfr_model

RandomForestRegressor(n_estimators=5, random_state=42, warm_start=True)

In [15]:
# Score the model on the same data it was fitted on; compare with the
# held-out test score computed afterwards to gauge overfitting.
train_score = rfr_model.score(X_train, y_train)
train_score

0.9600736971765946

In [17]:
# Predict prices for the held-out test features, then measure R^2 against the
# true prices. Keyword arguments make the (truth, prediction) order explicit.
y_pred = rfr_model.predict(X_test)
test_score = r2_score(y_true=y_test, y_pred=y_pred)

test_score

0.8392484060086534

In [18]:
# Bundle the fitted model with the metadata needed to interpret it later:
# the scikit-learn version it was trained under and its test-set R^2.
rfr_model_param = {
    'model': rfr_model,
    'sklearn_version': scikit_learn_version,
    'r2_score': test_score,
}

In [19]:
# Display the checkpoint dict to confirm its contents before saving.
rfr_model_param

{'model': RandomForestRegressor(n_estimators=5, random_state=42, warm_start=True),
 'sklearn_version': '0.24.0',
 'r2_score': 0.8392484060086534}

## Save and retrain model

In [21]:
import joblib

In [22]:
# Persist the checkpoint dict (model + metadata) to disk.
filename = 'models/rfr_model_chkpt.joblib'

# Create the target directory first: on a fresh checkout without a models/
# folder the dump would otherwise fail with FileNotFoundError.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Kept as the last expression so the cell still shows the written file list.
joblib.dump(rfr_model_param, filename)

['models/rfr_model_chkpt.joblib']

In [23]:
# Read the checkpoint back from disk to verify the save/load round trip.
# NOTE(review): joblib files are pickle-based per the joblib docs, so only
# load checkpoints that come from a trusted source.
joblib_model = joblib.load(filename)

In [25]:
# Inspect the restored estimator; its repr should show the same
# hyperparameters as the model that was saved.
joblib_model['model']

RandomForestRegressor(n_estimators=5, random_state=42, warm_start=True)

In [26]:
# scikit-learn version recorded when the checkpoint was written.
# NOTE(review): nothing in this notebook compares it with the running
# sklearn.__version__ before the restored model is reused.
joblib_model['sklearn_version']

'0.24.0'

In [27]:
# Raise the forest size from 5 to 15 trees through the estimator's parameter
# API. The estimator was created with warm_start=True, so per scikit-learn's
# warm_start semantics the next fit() is expected to add trees, not rebuild.
joblib_model['model'].set_params(n_estimators=15)
joblib_model['model']

RandomForestRegressor(n_estimators=15, random_state=42, warm_start=True)

In [28]:
# Load the second automobiles file, used for the incremental retraining step.
auto_retrain = pd.read_csv('datasets/automobiles_file2.csv')
auto_retrain.shape

(82, 52)

In [29]:
# Rebind X_train / y_train to the second data split. From here on these names
# no longer refer to the data the initial model was fitted on.
X_train, y_train = auto_retrain.drop(columns='price'), auto_retrain['price']

In [30]:
# Refit the restored estimator on the second split. warm_start=True and the
# raised n_estimators were set in earlier cells.
# NOTE(review): per scikit-learn's warm_start docs the existing trees should be
# kept and only the additional ones fitted, on this new data only. Confirm
# that is the intended retraining behaviour.
rfr_retrained_model = joblib_model['model'].fit(X_train, y_train)

In [31]:
# Training score of the retrained model; X_train / y_train now hold the
# second split (automobiles_file2.csv), not the original training data.
retrained_train_score = rfr_retrained_model.score(X_train, y_train)
retrained_train_score

0.9696990216465319

In [32]:
# Re-predict on the same held-out test set used for the initial model so the
# two test scores are comparable. This overwrites the earlier y_pred.
y_pred = rfr_retrained_model.predict(X_test)

In [34]:
# R^2 of the retrained model on the held-out test set. Keyword arguments make
# the (truth, prediction) order explicit.
retrained_test_score = r2_score(y_true=y_test, y_pred=y_pred)
retrained_test_score

0.8981476218142272

In [35]:
# Bundle the retrained model with the same metadata fields as the first
# checkpoint: the running scikit-learn version and the new test-set R^2.
retrained_rfr_model_param = {
    'model': rfr_retrained_model,
    'sklearn_version': scikit_learn_version,
    'r2_score': retrained_test_score,
}

In [36]:
# Persist the retrained checkpoint under its own file name so the initial
# checkpoint is not overwritten.
filename = 'models/retrained_rfr_model_chkpt.joblib'

# Create the target directory first: on a fresh checkout without a models/
# folder the dump would otherwise fail with FileNotFoundError.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Kept as the last expression so the cell still shows the written file list.
joblib.dump(retrained_rfr_model_param, filename)

['models/retrained_rfr_model_chkpt.joblib']