#Step 1: Import Data



##1.1 Import packages


In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        col_type2 = df[col].dtype.name
        
        if ((col_type != object) and (col_type2 != 'category')):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
import pandas as pd
import numpy as np
import gc #Garbage Collector to free up memory
gc.enable

<function gc.enable>

In [None]:
import pandas as pd
new_features = pd.read_csv('/content/gdrive/My Drive/Dataset/new_features.csv')

In [None]:
orders = reduce_mem_usage(pd.read_csv('/content/gdrive/My Drive/Dataset/orders.csv'))

Memory usage of dataframe is 182.71 MB
Memory usage after optimization is: 45.68 MB
Decreased by 75.0%


In [None]:
order_products__prior = pd.read_csv('/content/gdrive/My Drive/Dataset/order_products__prior.csv')

In [None]:
order_products__train = reduce_mem_usage(pd.read_csv('/content/gdrive/My Drive/Dataset/order_products__train.csv'))

Memory usage of dataframe is 42.26 MB
Memory usage after optimization is: 13.20 MB
Decreased by 68.7%


#3.Split into Train and Test DataFrame

In [None]:
new_features.shape

(13307953, 20)

In [None]:
new_features.head()

Unnamed: 0.1,Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id
0,0,1,196,10,1.0,5.0,59,18,19.0,11,5.363,10,0.695,35791,27791.0,0.7764,77,7,train,1187899
1,1,1,10258,9,1.0,5.0,59,18,19.0,11,5.363,10,0.695,1946,1389.0,0.714,117,19,train,1187899
2,2,1,10326,1,0.1666,0.0,59,18,19.0,11,5.363,10,0.695,5526,3603.0,0.652,24,4,train,1187899
3,3,1,12427,10,1.0,5.0,59,18,19.0,11,5.363,10,0.695,6476,4797.0,0.7407,23,19,train,1187899
4,4,1,13032,3,0.3333,2.0,59,18,19.0,11,5.363,10,0.695,3751,2465.0,0.657,121,14,train,1187899


In [None]:
new_features = new_features.drop(['Unnamed: 0'], axis=1)

In [None]:
new_features.head()

Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id
0,1,196,10,1.0,5.0,59,18,19.0,11,5.363,10,0.695,35791,27791.0,0.7764,77,7,train,1187899
1,1,10258,9,1.0,5.0,59,18,19.0,11,5.363,10,0.695,1946,1389.0,0.714,117,19,train,1187899
2,1,10326,1,0.1666,0.0,59,18,19.0,11,5.363,10,0.695,5526,3603.0,0.652,24,4,train,1187899
3,1,12427,10,1.0,5.0,59,18,19.0,11,5.363,10,0.695,6476,4797.0,0.7407,23,19,train,1187899
4,1,13032,3,0.3333,2.0,59,18,19.0,11,5.363,10,0.695,3751,2465.0,0.657,121,14,train,1187899


##3.1Create Train DataFrame


In [None]:
#Filter eval_set for just train to create the train DataFrame
new_features_train = reduce_mem_usage(new_features[new_features.eval_set=='train'])
new_features_train.head(n=10)

Memory usage of dataframe is 1293.13 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Memory usage after optimization is: 412.19 MB
Decreased by 68.1%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id
0,1,196,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,35791,27791.0,0.776367,77,7,train,1187899
1,1,10258,9,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,1946,1389.0,0.713867,117,19,train,1187899
2,1,10326,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,5526,3603.0,0.651855,24,4,train,1187899
3,1,12427,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6476,4797.0,0.740723,23,19,train,1187899
4,1,13032,3,0.333252,2.0,59,18,19.0,11,5.363281,10,0.694824,3751,2465.0,0.657227,121,14,train,1187899
5,1,13176,2,0.222168,0.0,59,18,19.0,11,5.363281,10,0.694824,379450,315913.0,0.83252,24,4,train,1187899
6,1,14084,1,0.099976,0.0,59,18,19.0,11,5.363281,10,0.694824,15935,12923.0,0.811035,91,16,train,1187899
7,1,17122,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,13880,9377.0,0.675781,24,4,train,1187899
8,1,25133,8,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6196,4586.0,0.740234,21,16,train,1187899
9,1,26088,2,0.199951,0.0,59,18,19.0,11,5.363281,10,0.694824,2523,1360.0,0.539062,23,19,train,1187899


In [None]:
#Get the features from the order_products_train DataFrame
new_features_train = reduce_mem_usage(new_features_train.merge(order_products__train[['product_id','order_id','reordered']], on=['product_id', 'order_id'], how='left'))
new_features_train.head(n=10)


Memory usage of dataframe is 476.84 MB
Memory usage after optimization is: 428.35 MB
Decreased by 10.2%


Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id,reordered
0,1,196,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,35791,27791.0,0.776367,77,7,train,1187899,1.0
1,1,10258,9,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,1946,1389.0,0.713867,117,19,train,1187899,1.0
2,1,10326,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,5526,3603.0,0.651855,24,4,train,1187899,
3,1,12427,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6476,4797.0,0.740723,23,19,train,1187899,
4,1,13032,3,0.333252,2.0,59,18,19.0,11,5.363281,10,0.694824,3751,2465.0,0.657227,121,14,train,1187899,1.0
5,1,13176,2,0.222168,0.0,59,18,19.0,11,5.363281,10,0.694824,379450,315913.0,0.83252,24,4,train,1187899,
6,1,14084,1,0.099976,0.0,59,18,19.0,11,5.363281,10,0.694824,15935,12923.0,0.811035,91,16,train,1187899,
7,1,17122,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,13880,9377.0,0.675781,24,4,train,1187899,
8,1,25133,8,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6196,4586.0,0.740234,21,16,train,1187899,1.0
9,1,26088,2,0.199951,0.0,59,18,19.0,11,5.363281,10,0.694824,2523,1360.0,0.539062,23,19,train,1187899,1.0


In [None]:
#Fill in 'reordered' column where NaN
new_features_train['reordered'] = new_features_train['reordered'].fillna(0)
new_features_train.head(n=10)

Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id,reordered
0,1,196,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,35791,27791.0,0.776367,77,7,train,1187899,1.0
1,1,10258,9,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,1946,1389.0,0.713867,117,19,train,1187899,1.0
2,1,10326,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,5526,3603.0,0.651855,24,4,train,1187899,0.0
3,1,12427,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6476,4797.0,0.740723,23,19,train,1187899,0.0
4,1,13032,3,0.333252,2.0,59,18,19.0,11,5.363281,10,0.694824,3751,2465.0,0.657227,121,14,train,1187899,1.0
5,1,13176,2,0.222168,0.0,59,18,19.0,11,5.363281,10,0.694824,379450,315913.0,0.83252,24,4,train,1187899,0.0
6,1,14084,1,0.099976,0.0,59,18,19.0,11,5.363281,10,0.694824,15935,12923.0,0.811035,91,16,train,1187899,0.0
7,1,17122,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,13880,9377.0,0.675781,24,4,train,1187899,0.0
8,1,25133,8,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6196,4586.0,0.740234,21,16,train,1187899,1.0
9,1,26088,2,0.199951,0.0,59,18,19.0,11,5.363281,10,0.694824,2523,1360.0,0.539062,23,19,train,1187899,1.0


In [None]:
#Create a unique index that describe each row using user_id and product_id
new_features_train = new_features_train.set_index(['user_id', 'product_id'])
new_features_train.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id,reordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,196,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,35791,27791.0,0.776367,77,7,train,1187899,1.0
1,10258,9,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,1946,1389.0,0.713867,117,19,train,1187899,1.0
1,10326,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,5526,3603.0,0.651855,24,4,train,1187899,0.0
1,12427,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6476,4797.0,0.740723,23,19,train,1187899,0.0
1,13032,3,0.333252,2.0,59,18,19.0,11,5.363281,10,0.694824,3751,2465.0,0.657227,121,14,train,1187899,1.0
1,13176,2,0.222168,0.0,59,18,19.0,11,5.363281,10,0.694824,379450,315913.0,0.83252,24,4,train,1187899,0.0
1,14084,1,0.099976,0.0,59,18,19.0,11,5.363281,10,0.694824,15935,12923.0,0.811035,91,16,train,1187899,0.0
1,17122,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,13880,9377.0,0.675781,24,4,train,1187899,0.0
1,25133,8,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6196,4586.0,0.740234,21,16,train,1187899,1.0
1,26088,2,0.199951,0.0,59,18,19.0,11,5.363281,10,0.694824,2523,1360.0,0.539062,23,19,train,1187899,1.0


In [None]:
#Remove features which are not predictors
new_features_train = new_features_train.drop(['eval_set', 'order_id'], axis=1)
new_features_train.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,reordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,196,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,35791,27791.0,0.776367,77,7,1.0
1,10258,9,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,1946,1389.0,0.713867,117,19,1.0
1,10326,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,5526,3603.0,0.651855,24,4,0.0
1,12427,10,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6476,4797.0,0.740723,23,19,0.0
1,13032,3,0.333252,2.0,59,18,19.0,11,5.363281,10,0.694824,3751,2465.0,0.657227,121,14,1.0
1,13176,2,0.222168,0.0,59,18,19.0,11,5.363281,10,0.694824,379450,315913.0,0.83252,24,4,0.0
1,14084,1,0.099976,0.0,59,18,19.0,11,5.363281,10,0.694824,15935,12923.0,0.811035,91,16,0.0
1,17122,1,0.166626,0.0,59,18,19.0,11,5.363281,10,0.694824,13880,9377.0,0.675781,24,4,0.0
1,25133,8,1.0,5.0,59,18,19.0,11,5.363281,10,0.694824,6196,4586.0,0.740234,21,16,1.0
1,26088,2,0.199951,0.0,59,18,19.0,11,5.363281,10,0.694824,2523,1360.0,0.539062,23,19,1.0


In [None]:
new_features_train.shape

(8474661, 16)

##3.2 Create Test DataFrame

In [None]:
new_features_test = reduce_mem_usage(new_features[new_features.eval_set == 'test'])
new_features_test.head()

Memory usage of dataframe is 737.50 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Memory usage after optimization is: 235.08 MB
Decreased by 68.1%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id
120,3,248,1,0.090881,0.0,88,33,12.0,13,6.769531,12,0.625,6371,2550.0,0.400146,117,19,test,2774568
121,3,1005,1,0.333252,1.0,88,33,12.0,13,6.769531,12,0.625,463,204.0,0.440674,94,7,test,2774568
122,3,1819,3,0.333252,0.0,88,33,12.0,13,6.769531,12,0.625,2424,1193.0,0.492188,88,13,test,2774568
123,3,7503,1,0.099976,0.0,88,33,12.0,13,6.769531,12,0.625,12474,6905.0,0.553711,117,19,test,2774568
124,3,8021,1,0.090881,0.0,88,33,12.0,13,6.769531,12,0.625,27864,16472.0,0.591309,54,17,test,2774568


In [None]:
#Create a unique index for the dataframe using user_id and product_id
new_features_test = new_features_test.set_index(['user_id', 'product_id'])
new_features.head()

Unnamed: 0,user_id,product_id,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,eval_set,order_id
0,1,196,10,1.0,5.0,59,18,19.0,11,5.363,10,0.695,35791,27791.0,0.7764,77,7,train,1187899
1,1,10258,9,1.0,5.0,59,18,19.0,11,5.363,10,0.695,1946,1389.0,0.714,117,19,train,1187899
2,1,10326,1,0.1666,0.0,59,18,19.0,11,5.363,10,0.695,5526,3603.0,0.652,24,4,train,1187899
3,1,12427,10,1.0,5.0,59,18,19.0,11,5.363,10,0.695,6476,4797.0,0.7407,23,19,train,1187899
4,1,13032,3,0.3333,2.0,59,18,19.0,11,5.363,10,0.695,3751,2465.0,0.657,121,14,train,1187899


In [None]:
#Remove features which are not predictors
new_features_test = new_features_test.drop(['eval_set', 'order_id'], axis=1)


In [None]:
new_features_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3,248,1,0.090881,0.0,88,33,12.0,13,6.769531,12,0.625,6371,2550.0,0.400146,117,19
3,1005,1,0.333252,1.0,88,33,12.0,13,6.769531,12,0.625,463,204.0,0.440674,94,7
3,1819,3,0.333252,0.0,88,33,12.0,13,6.769531,12,0.625,2424,1193.0,0.492188,88,13
3,7503,1,0.099976,0.0,88,33,12.0,13,6.769531,12,0.625,12474,6905.0,0.553711,117,19
3,8021,1,0.090881,0.0,88,33,12.0,13,6.769531,12,0.625,27864,16472.0,0.591309,54,17


In [None]:
new_features_test.shape


(4833292, 15)

In [None]:
new_features_test.shape
new_features_test.info

<bound method DataFrame.info of                     up_total_bought  up_reorder_ratio  ...  aisle_id  department_id
user_id product_id                                     ...                         
3       248                       1          0.090881  ...       117             19
        1005                      1          0.333252  ...        94              7
        1819                      3          0.333252  ...        88             13
        7503                      1          0.099976  ...       117             19
        8021                      1          0.090881  ...        54             17
...                             ...               ...  ...       ...            ...
206208  48364                     1          0.021271  ...        83              4
        48865                     1          0.125000  ...       100             21
        49247                     1          0.022720  ...       120             16
        49385                     1         

#Split into Train and Test

In [None]:
#Further split training set (new_features_train DataFrame) into train and test set to validate our model
from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:

#Split into smaller DataFrame for ease of working with and faster processing time
new_features_train2 = new_features_train.sample(n=50000)

In [None]:
X_train, X_test = train_test_split(new_features_train2, test_size=0.3, random_state=42)

In [None]:
X_train.shape
X_train.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 35000 entries, (83253, 24830) to (167376, 24830)
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   up_total_bought                   35000 non-null  int8   
 1   up_reorder_ratio                  35000 non-null  float16
 2   number_times_last5                35000 non-null  float16
 3   total_items                       35000 non-null  int16  
 4   total_distinct_items              35000 non-null  int16  
 5   user_average_days_between_orders  35000 non-null  float16
 6   user_number_orders                35000 non-null  int8   
 7   user_avg_basket                   35000 non-null  float16
 8   user_total_orders                 35000 non-null  int8   
 9   user_reordered_ratio              35000 non-null  float16
 10  product_orders_total              35000 non-null  int32  
 11  product_reorder_total             35000 non-

In [None]:
X_test.shape
X_test.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15000 entries, (187175, 47626) to (42724, 16454)
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   up_total_bought                   15000 non-null  int8   
 1   up_reorder_ratio                  15000 non-null  float16
 2   number_times_last5                15000 non-null  float16
 3   total_items                       15000 non-null  int16  
 4   total_distinct_items              15000 non-null  int16  
 5   user_average_days_between_orders  15000 non-null  float16
 6   user_number_orders                15000 non-null  int8   
 7   user_avg_basket                   15000 non-null  float16
 8   user_total_orders                 15000 non-null  int8   
 9   user_reordered_ratio              15000 non-null  float16
 10  product_orders_total              15000 non-null  int32  
 11  product_reorder_total             15000 non-

In [None]:
#Split the target from the features
X_train_features = X_train.drop(['reordered'], axis=1)
Y_train_target = X_train['reordered']

In [None]:
#Prepare test data
#Split feature and target
X_test_features = X_test.drop(['reordered'], axis=1)
Y_test_target = X_test['reordered']

In [None]:
X_train = X_train_features.copy()
y_train = Y_train_target.copy()
X_test = X_test_features.copy()
y_test = Y_test_target.copy()

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(35000, 15)
(35000,)
(15000, 15)
(15000,)


#Random Forest Classifer

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#clf = RandomForestClassifier(random_state=1, n_estimators=100, class_weight='balanced',criterion='gini',max_depth=5,min_samples_leaf=1)
clf_base = RandomForestClassifier(random_state=1,class_weight='balanced',criterion='gini' )

In [None]:
clf_base.fit(X_train_features, Y_train_target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [None]:
y_pred_base = clf_base.predict(X_test_features).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
accuracy       = accuracy_score(Y_test_target, y_pred_base)
f1             = f1_score(Y_test_target, y_pred_base,average='macro')
recall         = recall_score(Y_test_target, y_pred_base,average='macro')
precision      = precision_score(Y_test_target, y_pred_base,average='macro')
#roc_auc        = roc_auc_score(Y_test_target, y_pred,average='average',multi_class='ovr')

df = pd.DataFrame({"Accuracy"  : [accuracy],
                   "Recall"    : [recall],
                   "Precision" : [precision],
                   "F1"        : [f1]})
       #           "AUC"       : [roc_auc],
       #               })
    
print(df)

   Accuracy    Recall  Precision        F1
0  0.908067  0.566413   0.777958  0.591431


In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV

In [None]:
#Define how to train the different models
clf_tune = RandomForestClassifier(random_state=1,class_weight='balanced',criterion='gini' )

params = {'bootstrap': [True], 
          'n_estimators': [100, 200],
          'max_depth': [5, 10]
          }

search = GridSearchCV(clf_tune, params, scoring='f1_macro', cv=4, verbose=1, n_jobs = 5)

In [None]:
#Train the models
search = search.fit(X_train_features, Y_train_target)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  16 out of  16 | elapsed:   23.8s finished


In [None]:
#The model has now the new parameters from GridSearchCV:
search.get_params()

{'cv': 4,
 'error_score': nan,
 'estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=1, verbose=0,
                        warm_start=False),
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': 'balanced',
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimato

In [None]:
y_pred_tune = search.predict(new_features_test).astype(int)

In [None]:
#Save the prediction on a new column in the data_test DataFrame
new_features_test['prediction'] = y_pred_tune
new_features_test.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,up_total_bought,up_reorder_ratio,number_times_last5,total_items,total_distinct_items,user_average_days_between_orders,user_number_orders,user_avg_basket,user_total_orders,user_reordered_ratio,product_orders_total,product_reorder_total,product_reorder_rate,aisle_id,department_id,prediction
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3,248,1,0.090881,0.0,88,33,12.0,13,6.769531,12,0.625,6371,2550.0,0.400146,117,19,0
3,1005,1,0.333252,1.0,88,33,12.0,13,6.769531,12,0.625,463,204.0,0.440674,94,7,0
3,1819,3,0.333252,0.0,88,33,12.0,13,6.769531,12,0.625,2424,1193.0,0.492188,88,13,0
3,7503,1,0.099976,0.0,88,33,12.0,13,6.769531,12,0.625,12474,6905.0,0.553711,117,19,0
3,8021,1,0.090881,0.0,88,33,12.0,13,6.769531,12,0.625,27864,16472.0,0.591309,54,17,0
3,9387,5,0.416748,0.0,88,33,12.0,13,6.769531,12,0.625,36187,23537.0,0.650391,24,4,0
3,12845,1,0.111084,0.0,88,33,12.0,13,6.769531,12,0.625,10027,3639.0,0.363037,117,19,0
3,14992,2,0.285645,0.0,88,33,12.0,13,6.769531,12,0.625,29069,16942.0,0.583008,83,4,0
3,15143,1,0.083313,0.0,88,33,12.0,13,6.769531,12,0.625,3447,1696.0,0.491943,24,4,0
3,16797,3,0.25,1.0,88,33,12.0,13,6.769531,12,0.625,142951,99802.0,0.698242,24,4,0


In [None]:
#Reset the index
final_predict = new_features_test.reset_index()

In [None]:
#Keep only the required columns to create submission final
final_predict = final_predict[['product_id', 'user_id', 'prediction']]
gc.collect()
final_predict.head()

Unnamed: 0,product_id,user_id,prediction
0,248,3,0
1,1005,3,0
2,1819,3,0
3,7503,3,0
4,8021,3,0


In [None]:
#For each user_id in the test DataFrame, we need to get the order numbers and the products that were predicted to be purchased
orders_test = orders.loc[orders.eval_set=='test', ("user_id", "order_id")]
orders_test.head(n=10)

In [None]:
final_predict = final_predict.merge(orders_test, on='user_id', how='left')
final_predict.head()

In [None]:
#Drop unnecessary columns
final_predict = final_predict.drop('user_id', axis=1)

In [None]:
#Change product_id to integer
final_predict['product_id'] = final_predict.product_id.astype(int)

In [None]:
#Remove unecessary DataFrames
del orders
del orders_test
gc.collect()

final_predict.head()

In [None]:
#For the submission file, create a dictionary which will be used to create a map of unique keys to values.
#Index will be the order_id and the values will be the all the products that the order will have
#If none of the products will be purchased, use the string "None."

d = dict()
for row in final_predict.itertuples():
    if row.prediction== 1:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in final_predict.order_id:
    if order not in d:
        d[order] = 'None'
        
gc.collect()

#We now check how the dictionary were populated (open hidden output)
d

In [None]:
#Convert the dictionary into a DataFrame
submission = pd.DataFrame.from_dict(d, orient='index')

#Reset index
submission.reset_index(inplace=True)

#Set column names
submission.columns = ['order_id', 'products']

submission.head(n=10)

In [None]:
#Check if the submission file has all 75,000 predictions
submission.shape[0]

In [None]:
#Export to CSV
submission.to_csv('submission8.csv', index=False)
files.download('submission8.csv')

In [None]:
#Export to CSV
submission.to_csv('submission8.csv', index=False)