In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

In [None]:
# Read the training CSV (path is relative to the notebook's working directory)
# and show its first row as a quick sanity check.
train = pd.read_csv(filepath_or_buffer='../data/Train_dataset.csv')
train.head(n=1)

In [None]:
from auto_ml_kinder import pre_processing as pp

In [None]:
# Normalise headers to lower-case snake_case so columns can be referenced as
# attributes (e.g. train.city) in the cells below.
train.columns = [header.lower().replace(' ', '_') for header in train.columns]

In [None]:
# Re-display the first row to confirm the renamed headers.
train.head(n=1)

In [None]:
def _frequency_rank_encoder(frame, column_name):
    """Build a label-encoder config that ranks a column's categories by frequency.

    Categories are taken in the order returned by ``value_counts()`` and given
    consecutive integer codes starting at 1.

    Args:
        frame: DataFrame holding the categorical column.
        column_name: Name of the column to encode.

    Returns:
        A ``pp.PreLabelEncoderConfig`` for ``column_name``.
    """
    ranked_categories = frame[column_name].value_counts().keys()
    return pp.PreLabelEncoderConfig(
        column_name=column_name,
        # A list (not a set) keeps the entries in rank order and does not
        # require LabelEncodingDict to be hashable.
        label_encoding=[
            pp.LabelEncodingDict(key, idx + 1)
            for idx, key in enumerate(ranked_categories)
        ],
    )


# Categorical columns to label-encode. The order is significant: later cells
# index into `label_encode` by position ([0] is 'city', [2] is 'endorsed_by').
label_encoded_columns = [
    'city',
    'restaurant_location',
    'endorsed_by',
    'restaurant_theme',
    'restaurant_type',
    'cuisine',
]
label_encode: list[pp.PreLabelEncoderConfig] = [
    _frequency_rank_encoder(train, column) for column in label_encoded_columns
]

# Columns passed to the preprocessor with data_type=int.
# NOTE(review): 'resturant_tier' is kept exactly as spelt in the original cell,
# presumably matching the dataset header — confirm before "fixing" it.
numeric_columns_as_int = [
    'facebook_popularity_quotient',
    'instagram_popularity_quotient',
    'overall_restaurant_rating',
    'live_music_rating',
    'ambience',
    'resturant_tier',
]
numeric_col_changer: list[pp.PreNumericColDataChangeConfig] = [
    pp.PreNumericColDataChangeConfig(col_name=column, data_type=int)
    for column in numeric_columns_as_int
]

# Preprocessing configuration for the training frame.
model_config = pp.PreProcessingConfig(
    encoding_dummies=[],
    exclude_columns=['opening_day_of_restaurant', 'registration_number'],
    label_encode=label_encode,
    numeric_cols_data_changer=numeric_col_changer,
    target_column='annual_turnover',
)

In [None]:
# Run the project's preprocessing on the training frame using `model_config`.
pre_processed_df = pp.process(train,model_config)

In [None]:
from auto_ml_kinder import model_training_data_prep as dp
from auto_ml_kinder import model_training_helper as mth
from auto_ml_kinder import model_list_helper as mlh

In [None]:
# Baseline dataset: quantile-transformer scaling with a 5-cluster feature;
# PCA, polynomial features and feature selection are all switched off.
# NOTE(review): the meaning of the third positional argument (0.90) is not
# visible in this notebook — confirm against dp.ModelTrainingData.
data = dp.ModelTrainingData(
    pre_processed_df,
    dp.ScalerType.QUANTILE_TRANSFORMER,
    0.90,
    use_pca=False,
    use_polynomials=False,
    use_feature_selection=False,
    create_clustering_feature_and_no_of_clusters=(True, 5),
)

In [None]:
# Create the trainer that the following cells reuse across several dataset variants.
trainer = mth.ModelTrainer(data=data)

In [None]:
# Run the regression sweep, skipping the listed model families, then show the
# row of the performance table with the highest 'score'.
trainer.perform_operation_regression(
    exclude_models=[
        mlh.ModelAndParam.SVR_Regression,
        mlh.ModelAndParam.DecisionTree_Regressor,
        mlh.ModelAndParam.RandomForest_Regressor,
        mlh.ModelAndParam.GradientBoosting_Regressor,
        mlh.ModelAndParam.KNeighbors_Regressor,
    ],
    permutate_n_less_column=0,
)
best_row_label = trainer.performance_df['score'].idxmax()
trainer.performance_df.loc[best_row_label]

In [None]:
# Same scaling and clustering settings as the baseline, now with feature
# selection enabled (PCA and polynomial features still off).
data = dp.ModelTrainingData(
    pre_processed_df,
    dp.ScalerType.QUANTILE_TRANSFORMER,
    0.90,
    use_pca=False,
    use_polynomials=False,
    use_feature_selection=True,
    create_clustering_feature_and_no_of_clusters=(True, 5),
)

In [None]:
# Point the existing trainer at the most recently built `data`, rerun the
# regression sweep, and show the highest-scoring row recorded so far.
trainer.data = data
trainer.perform_operation_regression(
    exclude_models=[
        mlh.ModelAndParam.SVR_Regression,
        mlh.ModelAndParam.DecisionTree_Regressor,
        mlh.ModelAndParam.RandomForest_Regressor,
        mlh.ModelAndParam.GradientBoosting_Regressor,
        mlh.ModelAndParam.KNeighbors_Regressor,
    ],
    permutate_n_less_column=0,
)
best_row_label = trainer.performance_df['score'].idxmax()
trainer.performance_df.loc[best_row_label]

In [None]:
# Full feature pipeline: PCA, polynomial features and feature selection all
# enabled, with the third positional argument still at 0.90.
data = dp.ModelTrainingData(
    pre_processed_df,
    dp.ScalerType.QUANTILE_TRANSFORMER,
    0.90,
    use_pca=True,
    use_polynomials=True,
    use_feature_selection=True,
    create_clustering_feature_and_no_of_clusters=(True, 5),
)

In [None]:
# Swap in the most recently built `data`, rerun the regression sweep, and
# show the highest-scoring row recorded so far.
trainer.data = data
trainer.perform_operation_regression(
    exclude_models=[
        mlh.ModelAndParam.SVR_Regression,
        mlh.ModelAndParam.DecisionTree_Regressor,
        mlh.ModelAndParam.RandomForest_Regressor,
        mlh.ModelAndParam.GradientBoosting_Regressor,
        mlh.ModelAndParam.KNeighbors_Regressor,
    ],
    permutate_n_less_column=0,
)
best_row_label = trainer.performance_df['score'].idxmax()
trainer.performance_df.loc[best_row_label]

In [None]:
# Full feature pipeline again, with the third positional argument raised
# from 0.90 to 0.95.
data = dp.ModelTrainingData(
    pre_processed_df,
    dp.ScalerType.QUANTILE_TRANSFORMER,
    0.95,
    use_pca=True,
    use_polynomials=True,
    use_feature_selection=True,
    create_clustering_feature_and_no_of_clusters=(True, 5),
)

In [None]:
# Swap in the most recently built `data`, rerun the regression sweep, and
# show the highest-scoring row recorded so far.
trainer.data = data
trainer.perform_operation_regression(
    exclude_models=[
        mlh.ModelAndParam.SVR_Regression,
        mlh.ModelAndParam.DecisionTree_Regressor,
        mlh.ModelAndParam.RandomForest_Regressor,
        mlh.ModelAndParam.GradientBoosting_Regressor,
        mlh.ModelAndParam.KNeighbors_Regressor,
    ],
    permutate_n_less_column=0,
)
best_row_label = trainer.performance_df['score'].idxmax()
trainer.performance_df.loc[best_row_label]

In [None]:
# Display the trainer's performance table after the regression sweeps above.
trainer.performance_df

In [None]:
# (min, max) bounds handed to the trainer's neural-network search. A range
# with equal endpoints (batch size, dropout flag) pins that setting to one value.
nn_search_space = mth.NeuralNetwork_BayesianOptimization_Params(
    neurons_min_max=(256, 512),
    batch_size_min_max=(32, 32),
    dropout_rate_min_max=(0.3, 0.7),
    epochs_min_max=(60, 70),
    hidden_layers_min_max=(3, 6),
    learning_rate_min_max=(0.001, 0.1),
    normalization_min_max=(0, 1),
    dropout_min_max=(1, 1),
    activation_min_max=(0, 3),
)
trainer.perform_neural_network_regression(
    totalExperiments=6,
    params=nn_search_space,
)

In [None]:
# Presumably refits the best network found by the search for 500 epochs —
# TODO confirm against mth.ModelTrainer.neural_network_best_model.
trainer.neural_network_best_model(epochs=500)

In [None]:
# Display the performance table again after the neural-network steps.
trainer.performance_df

In [None]:
# Read the held-out test CSV (path is relative to the notebook's working directory).
test = pd.read_csv(filepath_or_buffer='../data/Test_dataset.csv')

In [None]:
# Apply the same header normalisation used for the training frame:
# lower-case, with spaces replaced by underscores.
test.columns = [header.lower().replace(' ', '_') for header in test.columns]

In [None]:
# The test file spells this header 'endoresed_by'; rename it to the column
# name held by the third label-encoder config so both frames agree.
endorsed_by_config = label_encode[2]
new_column_name = endorsed_by_config.column_name
test = test.rename({'endoresed_by': new_column_name}, axis='columns')

In [None]:
# Extend the 'city' encoding so it also covers cities that appear only in the
# test set, WITHOUT changing the codes the model was trained on.
#
# The original cell rebuilt the mapping from test-set frequencies, which gives
# the same city a different integer code at prediction time than at training
# time. Here the training ranking comes first (identical codes to the training
# config), and cities unseen in training are appended with the next free codes.
train_cities = list(train.city.value_counts().keys())
known_cities = set(train_cities)
unseen_test_cities = [
    city for city in test.city.value_counts().keys() if city not in known_cities
]
label_encode[0] = pp.PreLabelEncoderConfig(
    column_name='city',
    # A list (not a set) keeps the entries in code order and does not require
    # LabelEncodingDict to be hashable.
    label_encoding=[
        pp.LabelEncodingDict(key, idx + 1)
        for idx, key in enumerate(train_cities + unseen_test_cities)
    ],
)

In [None]:
# Preprocessing configuration for the test frame: the same encoders, numeric
# casts and excluded columns as the training config, with an empty target column.
model_config_test = pp.PreProcessingConfig(
    target_column='',
    label_encode=label_encode,
    numeric_cols_data_changer=numeric_col_changer,
    encoding_dummies=[],
    exclude_columns=['opening_day_of_restaurant', 'registration_number'],
)

In [None]:
# Preprocess the test frame.
# NOTE(review): this passes `model_config` (target_column='annual_turnover'),
# not `model_config_test`, which is built in the cell before this one and is
# otherwise unused in this notebook. Both configs hold the same `label_encode`
# list object, so the replaced 'city' encoder is seen either way — confirm
# which config pp.process_test is meant to receive.
test_processed = pp.process_test(test,model_config)

In [None]:
# Build a fresh trainer on the most recently built `data` and rerun the
# regression sweep, then show the highest-scoring row.
# NOTE(review): rebinding `trainer` discards the object that held the earlier
# sweep and neural-network results — presumably intentional; confirm.
trainer = mth.ModelTrainer(data=data)
trainer.perform_operation_regression(
    exclude_models=[
        mlh.ModelAndParam.SVR_Regression,
        mlh.ModelAndParam.DecisionTree_Regressor,
        mlh.ModelAndParam.RandomForest_Regressor,
        mlh.ModelAndParam.GradientBoosting_Regressor,
        mlh.ModelAndParam.KNeighbors_Regressor,
    ],
    permutate_n_less_column=0,
)
best_row_label = trainer.performance_df['score'].idxmax()
trainer.performance_df.loc[best_row_label]

In [None]:
# Display the performance table of the freshly created trainer.
trainer.performance_df

In [None]:
# Inspect the trainer's fitted models; the prediction cell below picks one by position.
trainer.models

In [None]:
# Predict on the processed test frame with the second entry of trainer.models.
# NOTE(review): index 1 is hard-coded; which model sits there depends on the
# order the sweep stored its results — confirm it is the intended one.
chosen_model = trainer.models[1].model
predictions = trainer.predict_test_data(test_processed, chosen_model)

In [None]:
# Attach the predictions to the test frame as a new 'predictions' column (mutates `test` in place).
test['predictions'] = predictions

In [None]:
# Display the test frame, now including the 'predictions' column.
test