### Import the main libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from ast import literal_eval

### Load and prepare the data

In [2]:
# load the train data set
df_train = pd.read_csv('train.csv', converters = {'Product_Holding_B1' : literal_eval, 'Product_Holding_B2' : literal_eval})
df_train.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8]
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3]
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00]
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6]
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]"


In [3]:
df_train.dtypes

Customer_ID           object
Gender                object
Age                    int64
Vintage                int64
Is_Active              int64
City_Category         object
Customer_Category     object
Product_Holding_B1    object
Product_Holding_B2    object
dtype: object

In [4]:
df_train.shape

(37748, 9)

In [5]:
df_train.isnull().sum()

Customer_ID           0
Gender                0
Age                   0
Vintage               0
Is_Active             0
City_Category         0
Customer_Category     0
Product_Holding_B1    0
Product_Holding_B2    0
dtype: int64

In [6]:
df_train['Gender_n'] = df_train['Gender'].astype('category').cat.codes
df_train['City_Category_n'] = df_train['City_Category'].astype('category').cat.codes

In [7]:
df_train = pd.concat((df_train, pd.get_dummies(df_train['Customer_Category'], prefix = 'Customer', drop_first = True)), axis = 1)

In [8]:
product_list = np.unique(df_train['Product_Holding_B1'])
product_list_n = np.unique(sum(product_list, []))
for product_n in product_list_n:
    df_train['B1_' + product_n] = 0

In [9]:
for row in range(df_train.shape[0]):
    for prod in df_train.loc[row, 'Product_Holding_B1']:
        df_train.loc[row, ['B1_' + prod]] = 1

In [10]:
product_list = np.unique(df_train['Product_Holding_B2'])
product_list_n = np.unique(sum(product_list, []))
for product_n in product_list_n:
    df_train['B2_' + product_n] = 0

In [11]:
for row in range(df_train.shape[0]):
    for prod in df_train.loc[row, 'Product_Holding_B2']:
        df_train.loc[row, ['B2_' + prod]] = 1

In [12]:
df_train.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,Gender_n,...,B2_P18,B2_P2,B2_P20,B2_P3,B2_P4,B2_P5,B2_P6,B2_P7,B2_P8,B2_P9
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8],1,...,0,0,0,0,0,0,0,0,1,0
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3],0,...,0,0,0,1,0,0,0,0,0,0
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00],0,...,0,0,0,0,0,0,0,0,0,0
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6],0,...,0,0,0,0,0,0,1,0,0,0
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]",1,...,0,0,0,0,0,0,0,0,1,0


In [13]:
df_train = df_train.drop(columns = ['Gender', 'City_Category', 'Customer_Category', 'Product_Holding_B1', 'Product_Holding_B2'])
df_train.head()

Unnamed: 0,Customer_ID,Age,Vintage,Is_Active,Gender_n,City_Category_n,Customer_S2,Customer_S3,B1_P00,B1_P1,...,B2_P18,B2_P2,B2_P20,B2_P3,B2_P4,B2_P5,B2_P6,B2_P7,B2_P8,B2_P9
0,CC264719,41,14,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,CC209679,47,14,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,CC319633,59,14,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CC231413,32,16,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,CC259633,30,15,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
B2_prods = [col_name for col_name in df_train.columns if col_name.startswith('B2')]

In [16]:
X = df_train.drop(columns = B2_prods + ['Customer_ID'])
Y = df_train[B2_prods]

In [17]:
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.2, random_state = 1)

### Model building

### Random Forest

In [18]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import label_ranking_average_precision_score

In [320]:
clf = RandomForestClassifier(random_state=1, n_estimators = 400, max_depth = 12, max_features = 'log2', min_samples_split = 3,
                            min_samples_leaf = 3)
multilabel_clf = MultiOutputClassifier(clf, n_jobs=-1)

In [None]:
multilabel_clf.fit(x_train, y_train)

In [None]:
y_pred = multilabel_clf.predict(x_valid)

In [318]:
label_ranking_average_precision_score(y_pred, y_valid)

0.8592720487753636

### Prepare the test dataset and make the prediction

In [300]:
# load the test data
df_test = pd.read_csv('test.csv', converters = {'Product_Holding_B1' : literal_eval, 'Product_Holding_B2' : literal_eval})
df_test.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1
0,CC372708,Female,31,31,0,C2,S3,"[P12, P13]"
1,CC216072,Male,28,37,1,C1,S2,"[P12, P13]"
2,CC387629,Male,31,12,0,C2,S3,[P20]
3,CC389228,Female,55,11,0,C2,S2,"[P13, P21]"
4,CC394445,Male,51,49,1,C2,S1,[P13]


In [301]:
df_test.shape

(20327, 8)

In [302]:
df_test.isnull().sum()

Customer_ID           0
Gender                0
Age                   0
Vintage               0
Is_Active             0
City_Category         0
Customer_Category     0
Product_Holding_B1    0
dtype: int64

In [303]:
df_test['Gender_n'] = df_test['Gender'].astype('category').cat.codes
df_test['City_Category_n'] = df_test['City_Category'].astype('category').cat.codes

In [304]:
df_test = pd.concat((df_test, pd.get_dummies(df_test['Customer_Category'], prefix = 'Customer', drop_first = True)), axis = 1)

In [305]:
product_list = np.unique(df_test['Product_Holding_B1'])
product_list_n = np.unique(sum(product_list, []))
for product_n in product_list_n:
    df_test['B1_' + product_n] = 0

In [306]:
for row in range(df_test.shape[0]):
    for prod in df_test.loc[row, 'Product_Holding_B1']:
        df_test.loc[row, ['B1_' + prod]] = 1

In [307]:
df_test.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Gender_n,City_Category_n,...,B1_P2,B1_P20,B1_P21,B1_P3,B1_P4,B1_P5,B1_P6,B1_P7,B1_P8,B1_P9
0,CC372708,Female,31,31,0,C2,S3,"[P12, P13]",0,1,...,0,0,0,0,0,0,0,0,0,0
1,CC216072,Male,28,37,1,C1,S2,"[P12, P13]",1,0,...,0,0,0,0,0,0,0,0,0,0
2,CC387629,Male,31,12,0,C2,S3,[P20],1,1,...,0,1,0,0,0,0,0,0,0,0
3,CC389228,Female,55,11,0,C2,S2,"[P13, P21]",0,1,...,0,0,1,0,0,0,0,0,0,0
4,CC394445,Male,51,49,1,C2,S1,[P13],1,1,...,0,0,0,0,0,0,0,0,0,0


In [308]:
x_test = df_test.drop(columns = ['Customer_ID', 'Gender', 'City_Category', 'Customer_Category', 'Product_Holding_B1'])

In [309]:
y_pred_prob = multilabel_clf.predict_proba(x_test)

In [310]:
for i, col in enumerate(B2_prods):
    df_test[col] = y_pred_prob[i][:, 1]

In [311]:
df_test.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Gender_n,City_Category_n,...,B2_P18,B2_P2,B2_P20,B2_P3,B2_P4,B2_P5,B2_P6,B2_P7,B2_P8,B2_P9
0,CC372708,Female,31,31,0,C2,S3,"[P12, P13]",0,1,...,0.0,0.000899,0.0,0.12465,0.157478,0.04123,0.103707,0.07661,0.524969,0.068544
1,CC216072,Male,28,37,1,C1,S2,"[P12, P13]",1,0,...,0.0,0.001181,0.0,0.150667,0.134841,0.037218,0.081449,0.072999,0.502623,0.055561
2,CC387629,Male,31,12,0,C2,S3,[P20],1,1,...,0.0,0.003589,1.1e-05,0.01114,0.01728,0.008879,0.049178,0.015696,0.115832,0.020109
3,CC389228,Female,55,11,0,C2,S2,"[P13, P21]",0,1,...,0.0,0.001394,3.6e-05,0.050911,0.053713,0.026827,0.089185,0.02641,0.325876,0.0916
4,CC394445,Male,51,49,1,C2,S1,[P13],1,1,...,0.0,0.000261,0.0,0.067911,0.020948,0.007742,0.053603,0.028086,0.247822,0.029088


In [312]:
df_test['Product_Holding_B2'] = [[] for _ in range(len(df_test))]
df_test.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Gender_n,City_Category_n,...,B2_P2,B2_P20,B2_P3,B2_P4,B2_P5,B2_P6,B2_P7,B2_P8,B2_P9,Product_Holding_B2
0,CC372708,Female,31,31,0,C2,S3,"[P12, P13]",0,1,...,0.000899,0.0,0.12465,0.157478,0.04123,0.103707,0.07661,0.524969,0.068544,[]
1,CC216072,Male,28,37,1,C1,S2,"[P12, P13]",1,0,...,0.001181,0.0,0.150667,0.134841,0.037218,0.081449,0.072999,0.502623,0.055561,[]
2,CC387629,Male,31,12,0,C2,S3,[P20],1,1,...,0.003589,1.1e-05,0.01114,0.01728,0.008879,0.049178,0.015696,0.115832,0.020109,[]
3,CC389228,Female,55,11,0,C2,S2,"[P13, P21]",0,1,...,0.001394,3.6e-05,0.050911,0.053713,0.026827,0.089185,0.02641,0.325876,0.0916,[]
4,CC394445,Male,51,49,1,C2,S1,[P13],1,1,...,0.000261,0.0,0.067911,0.020948,0.007742,0.053603,0.028086,0.247822,0.029088,[]


In [313]:
for row in range(df_test.shape[0]):
    indices = df_test.loc[row, B2_prods].sort_values(ascending = False)[0:3].index
    for index in indices:
            s = index.split('_')[-1]
            df_test.loc[row, 'Product_Holding_B2'].append(s)

In [314]:
df_test[['Customer_ID', 'Product_Holding_B2']].to_csv('submission.csv', index=False)