In [1]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from azureml.core import Workspace, Dataset
from azureml.core import Run
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Coordinates of the Azure ML workspace that holds the dataset.
subscription_id = 'ac5709b8-dd2b-46e3-98ae-bf53155e0e80'
resource_group = 'nereva_rg'
workspace_name = 'e2r2s14mls0004'

# Bind to the workspace identified by the three values above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Fetch the dataset named 'rx20_all_models' from that workspace and load it
# into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='rx20_all_models')
data_df = dataset.to_pandas_dataframe()


In [2]:
def reduce_feature_values(data_df, column, number_of_feature_values):
    """Collapse the rare values of one column into a single 'other' bucket.

    The ``number_of_feature_values`` most frequent values of ``column`` are
    kept as they are; every other entry is replaced by the string ``'other'``.
    Missing values are not counted by ``value_counts`` and therefore also end
    up as ``'other'``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : label
        Name of the column whose values are reduced.
    number_of_feature_values : int
        How many of the most frequent values to keep. Asking for more values
        than exist keeps all of them.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data_df`` with ``column`` rewritten.
    """
    # value_counts() orders by descending frequency, so the head of its index
    # holds the most common values; slicing past the end is harmless.
    top_values = data_df[column].value_counts().index[:number_of_feature_values]

    # Work on a copy: the caller may pass a filtered slice, and assigning a
    # column on such a slice raises SettingWithCopyWarning and silently alters
    # the caller's data.
    reduced_df = data_df.copy()
    reduced_df[column] = np.where(
        reduced_df[column].isin(top_values), reduced_df[column], 'other'
    )
    return reduced_df


def one_hot_encode_feature(df, feature_name, feature_list):
    """One-hot encode one column and swap it for its indicator columns.

    A fresh ``OneHotEncoder`` is fitted on ``df[[feature_name]]``. One column
    per category, named ``<feature_name>_<category>``, is appended to a new
    frame. In the returned feature list ``feature_name`` and
    ``feature_name + '_encoded'`` are removed and the indicator column names
    are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_name``. It is not modified.
    feature_name : str
        Column to encode. The original column stays in the returned frame.
    feature_list : list of str
        Current model feature names. It is not modified.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The frame with indicator columns added, and the updated feature list.
    """
    ohe = OneHotEncoder()
    transformed = ohe.fit_transform(df[[feature_name]])

    # str() lets non-string categories (e.g. numeric codes) form a column name
    # instead of raising TypeError on string concatenation.
    encoded_columns = [
        feature_name + '_' + str(category) for category in ohe.categories_[0]
    ]
    encoded_df = pd.DataFrame(
        transformed.toarray(), index=df.index, columns=encoded_columns
    )

    # Drop indicator columns left over from an earlier call so a re-run
    # replaces them, then attach all new columns in one concat rather than
    # inserting them one by one into the caller's frame.
    base_df = df.drop(columns=encoded_columns, errors='ignore')
    result_df = pd.concat([base_df, encoded_df], axis=1)

    # Build a new list instead of extending the caller's list in place.
    replaced = (feature_name, feature_name + '_encoded')
    updated_features = [name for name in feature_list if name not in replaced]
    updated_features += [
        name for name in encoded_columns if name not in updated_features
    ]
    return result_df, updated_features

In [3]:
# Reload from the dataset so this cell starts from the unfiltered rows on
# every run, then apply the row filters one after another (same order as
# before). The final .copy() yields an independent frame, so the column
# assignments performed by the helper functions below do not hit a slice of
# the loaded frame (SettingWithCopyWarning).
data_df = (
    dataset.to_pandas_dataframe()
    .loc[lambda frame: frame["CUSTOMER_TYPE"] == "End-Customer"]
    .loc[lambda frame: frame["STATUS"] > 1]
    .loc[lambda frame: frame["SOLD_PRICE"] > 100]
    .copy()
)

# Keep the 10 most frequent SOLD_COUNTRY values; everything else becomes 'other'.
data_df = reduce_feature_values(data_df, column="SOLD_COUNTRY", number_of_feature_values=10)

features = [
    'OPERATING_HOURS',
    'STATUS',
    'AGE',
    'CONSTRUCTION_YEAR',
    "BLACK_FORX_FLAG",
    "SOLD_COUNTRY",
    "LEASING_FLAG",
    "KEY_ACCOUNT_FLAG",
    "SOLD_PACKAGE_SIZE",
    'CAPACITY',
    'WHEEL_QT',
    'MAST_HEIGHT',
    "MAST_TYPE",
    "MATERIAL_NR",
]

# Replace each categorical column in `features` with its indicator columns.
# The original columns stay in data_df (MATERIAL_NR is needed for stratifying).
for categorical_column in ('MAST_TYPE', 'SOLD_COUNTRY', 'SOLD_PACKAGE_SIZE', 'MATERIAL_NR'):
    data_df, features = one_hot_encode_feature(data_df, categorical_column, features)

# Of these names only BLACK_FORX_FLAG is present in the list built above;
# the remaining entries are currently no-ops.
features = [x for x in features if x not in ['SOLD_YEAR', 'TRADER_ID', 'CUSTOMER_TYPE_encoded', 'BLACK_FORX_FLAG']]

# The split keeps every column of data_df on the X side; `features` holds the
# names of the model input columns. The target frame carries SOLD_PRICE and
# STATUS. Rows are stratified on the (STATUS, MATERIAL_NR) combination.
X_train, X_test, y_train, y_test = train_test_split(
    data_df,
    data_df[["SOLD_PRICE", "STATUS"]],
    test_size=0.2,
    random_state=42,
    stratify=data_df[["STATUS", "MATERIAL_NR"]],
)


In [4]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 candidate values for each of 4 parameters,
# i.e. 81 combinations.
parameters = {
    'n_estimators': [100, 150, 200],
    'max_depth': [18, 25, 32],
    'max_features': [0.6, 0.7, 0.8],
    'min_samples_split': [5, 7, 9],
}

rfr = RandomForestRegressor(random_state=42)

# 81 combinations x 5 folds = 405 forest fits. n_jobs=-1 runs them on all
# available cores instead of one after another; with the fixed random_state
# the selected parameters and scores are unaffected.
clf = GridSearchCV(estimator=rfr, param_grid=parameters, cv=5, n_jobs=-1)

# Only the columns listed in `features` are used as inputs; the target is SOLD_PRICE.
clf.fit(X_train[features], y_train["SOLD_PRICE"])

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [18, 25, 32],
                         'max_features': [0.6, 0.7, 0.8],
                         'min_samples_split': [5, 7, 9],
                         'n_estimators': [100, 150, 200]})

In [5]:
# Hyper-parameter combination selected by the grid search.
print(clf.best_params_)

# Score reported by the fitted search object, first on the training rows and
# then on the held-out test rows.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(X_part[features], y_part["SOLD_PRICE"]))

{'max_depth': 25, 'max_features': 0.6, 'min_samples_split': 5, 'n_estimators': 200}
0.9376630856357344
0.7277186480842479
