In [None]:
# https://www.kaggle.com/xingobar/random-forest-classifier/code
# Original Version - Private score: 0.0232244; Public score: 0.0230549

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
import time


# 選取需要的欄位
targetcols = ['ncodpers', 'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

dtype_list = {'ind_cco_fin_ult1': 'float16',
              'ind_deme_fin_ult1': 'float16',
              'ind_aval_fin_ult1': 'float16',
              'ind_valo_fin_ult1': 'float16',
              'ind_reca_fin_ult1': 'float16',
              'ind_ctju_fin_ult1': 'float16',
              'ind_cder_fin_ult1': 'float16', 
              'ind_plan_fin_ult1': 'float16',
              'ind_fond_fin_ult1': 'float16', 
              'ind_hip_fin_ult1': 'float16',
              'ind_pres_fin_ult1': 'float16', 
              'ind_nomina_ult1': 'float16', 
              'ind_cno_fin_ult1': 'float16',
              'ncodpers': 'int64',
              'ind_ctpp_fin_ult1': 'float16',
              'ind_ahor_fin_ult1': 'float16',
              'ind_dela_fin_ult1': 'float16',
              'ind_ecue_fin_ult1': 'float16',
              'ind_nom_pens_ult1': 'float16',
              'ind_recibo_ult1': 'float16',
              'ind_deco_fin_ult1': 'float16',
              'ind_tjcr_fin_ult1': 'float16', 
              'ind_ctop_fin_ult1': 'float16',
              'ind_viv_fin_ult1': 'float16',
              'ind_ctma_fin_ult1': 'float16'}       

# 挑選的因子
feature_cols = ["ind_empleado","pais_residencia","sexo",
                "age", "ind_nuevo", "antiguedad", "nomprov",
                "segmento"]

train_file = 'train_ver2.csv'
test_file = 'test_ver2.csv'
train_size = 13647309
nrows = 1000000
start_idx = train_size - nrows

# 資料處理工程（補值等資料清理動作）
for idx,col in enumerate(feature_cols):
    
    start_time = time.time()
    train = pd.read_csv(train_file,usecols=[col])
    test = pd.read_csv(test_file,usecols=[col])
    print(col)
    ### data preprocessing
    if col == 'age':
        train[col] = pd.to_numeric(train[col],errors='coerce')
        test[col] = pd.to_numeric(test[col],errors='coerce')
        
        train.loc[train.age < 18,"age"]  = train.loc[(train.age >= 18) & (train.age <= 30),"age"].mean(skipna=True)
        test.loc[test.age > 100,"age"] = test.loc[(test.age >= 30) & (test.age <= 100),"age"].mean(skipna=True)
        
        train['age'].fillna(train['age'].mean(),inplace=True)
        test['age'].fillna(test['age'].mean(),inplace=True)
        train['age'] = train['age'].astype(int)
        test['age']= test['age'].astype(int)
        
        
    elif col == 'ind_nuevo':
       train.loc[train[col].isnull(),col] = 1
       test.loc[test[col].isnull(),col] = 1
    elif col == 'antiguedad':
        train[col] = pd.to_numeric(train[col],errors='coerce')
        test[col] = pd.to_numeric(test[col],errors = 'coerce')
        train.loc[train[col].isnull(),col] = train[col].min()
        train.loc[train[col] < 0 , col] = 0 
        test.loc[test[col].isnull(),col] = test[col].min()
        test.loc[test[col] <0 ,col] = 0
    elif col =='nomprov':
        train[col].fillna('Unknown',inplace=True)
        test[col].fillna('Unknown',inplace=True)
    elif col =='segmento':
        train[col] = train[col].apply(lambda x:str(x).split('-')[0])
        test[col] = test[col].apply(lambda x:str(x).split('-')[0])
        train.loc[train[col].isnull(),col] = 'Unknown'
        test.loc[test[col].isnull(),col] = 'Unknown'
    else:
        train[col].fillna(-999,inplace=True)
        test[col].fillna(-999,inplace=True)
    ##### 
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].values) + list(test[col].values))
        temp_train = le.transform(list(train[col].values)).reshape(-1,1)[start_idx:,:]
        temp_test = le.transform(list(test[col].values)).reshape(-1,1)
    else:
        temp_train = np.array(train[col]).reshape(-1,1)[start_idx:,:]
        temp_test = np.array(test[col]).reshape(-1,1)
    if idx == 0:
        x_train = temp_train.copy()
        x_test = temp_test.copy()
    else:
        x_train = np.hstack([x_train,temp_train])
        x_test = np.hstack([x_test,temp_test])
    print(x_train.shape,x_test.shape)
    print('Time is %0.2f' %(time.time() - start_time))
    del train
    del test

y_train = pd.read_csv(train_file,usecols = targetcols,dtype=dtype_list)
last_instance = y_train.drop_duplicates(y_train,keep='last')

y_train = np.array(y_train.fillna(0)).astype('int')[start_idx:,1:]
print(x_train.shape,y_train.shape)


#Original Version

print('Running Model...')
clf = RandomForestClassifier(n_estimators=10,
                             max_depth=10,
                             n_jobs=-1,
                             random_state=42)

clf.fit(x_train,y_train)
del x_train
del y_train
print('Predicting....')
## [n_sample , n_class]
y_pred = np.array(clf.predict_proba(x_test))[:,:,1].T ## [n_class,n_sample]
del x_test



ind_empleado
((1000000, 1), (929615, 1))
Time is 31.84
pais_residencia
((1000000, 2), (929615, 2))
Time is 30.62
sexo
((1000000, 3), (929615, 3))
Time is 31.60


  interactivity=interactivity, compiler=compiler, result=result)


age
((1000000, 4), (929615, 4))
Time is 27.44
ind_nuevo
((1000000, 5), (929615, 5))
Time is 21.13


  interactivity=interactivity, compiler=compiler, result=result)


antiguedad
((1000000, 6), (929615, 6))
Time is 27.30
nomprov
((1000000, 7), (929615, 7))
Time is 40.67
segmento
((1000000, 8), (929615, 8))
Time is 40.38
((1000000, 8), (1000000, 24))


*-----------------------------------------------------------*

* Grid Search

In [None]:
#forest = RandomForestClassifier()

#parameters = {'n_estimators': [10, 20, 30], # The number of trees in the forest.
#              'criterion': ['entropy', 'gini'],     
#              'max_depth': [5, 10, 15, 20],        # The maximum depth of the tree. 
#              'min_samples_split': [2, 5, 10, 15, 20]  # The minimum number of samples required to split an internal node.
#             }
#acc_scorer = make_scorer(accuracy_score)
#grid = GridSearchCV(forest, parameters, scoring = acc_scorer)
#grid = grid.fit(x_train, y_train)
#print grid.best_estimator_

In [None]:
#forest = grid.best_estimator_
#forest.fit(x_train, y_train)
#y_pred = np.array(forest.predict_proba(x_test))[:,:,1].T

*-----------------------------------------------------------*

In [3]:
# 挑選產品出來（調整程式的寫法）
prod_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [4]:
print('Getting the last instance dictionary ....')
last_instance.fillna(0,inplace=True) # 25 cols
recommendation_product = {}
#targetcols = np.array(targetcols)  # 25 cols
targetcols = np.array(prod_cols)    # 24 cols

for idx,row_val in last_instance.iterrows():
    
    ids = row_val['ncodpers']
    used_product = set(targetcols[np.array(row_val[1:]) == 1])
    recommendation_product[ids] = used_product
del last_instance


Getting the last instance dictionary ....


In [5]:
# 產出最後推薦的結果
print('Submission ....')
## [n_class , n_sample]
pred = np.argsort(y_pred,axis=1) ## sort probability by axis 1 and return index
#print(pred)
pred = np.fliplr(pred) 
test_ids = np.array(pd.read_csv(test_file,usecols=['ncodpers'])['ncodpers'])
final_preds = []
for idx,predicted in enumerate(pred):
    ids = test_ids[idx]
    top_product = targetcols[predicted]
    used_product = recommendation_product.get(ids,[])
    new_top_product = []
    for product in top_product:
        if product not in used_product:
            new_top_product.append(product)
        if len(new_top_product) == 7:
            break
    final_preds.append(' '.join(new_top_product))
result = pd.DataFrame({'ncodpers':test_ids,'added_products':final_preds})
result.to_csv('submission_rf.csv',index=False)
print('Finish. Time is %0.2f' %(time.time() - start_time))

Submission ....
Finish. Time is 972.46


In [6]:
print y_pred

[[0.00000000e+00 5.05561173e-05 6.78685501e-01 ... 3.19945951e-01
  3.26134827e-01 7.45981937e-01]
 [0.00000000e+00 3.22190898e-06 5.29963456e-01 ... 8.19260053e-02
  8.54169331e-02 2.21892458e-01]
 [0.00000000e+00 0.00000000e+00 8.32436537e-01 ... 7.14754885e-03
  7.76483872e-03 3.48085941e-02]
 ...
 [2.47025573e-05 0.00000000e+00 5.14387493e-01 ... 3.41761417e-02
  3.68284130e-02 9.71346157e-02]
 [5.63063063e-05 0.00000000e+00 4.90819560e-01 ... 4.28741850e-02
  4.42364563e-02 1.08082880e-01]
 [2.67903576e-05 1.19329187e-05 4.96919719e-01 ... 2.22155332e-02
  3.30110823e-02 7.62527653e-02]]


In [7]:
print pred

[[23  2 19 ...  1  5  0]
 [ 2 23  4 ...  9  1  0]
 [ 2 23  4 ... 20  1  0]
 ...
 [ 2 23 12 ...  9  0  1]
 [ 2 23  4 ...  9  0  1]
 [ 2  7 23 ...  9  0  1]]
