網址：https://www.kaggle.com/anokas/collaborative-filtering-btb-lb-0-01691/code

1. Collaborative Filtering based model
    - 利用相似性做推薦
    - 用人還是用物品
    - User-Based
2. LogisticRegression classifier
    - to predict each product from the existing products a user has, essentially expressing each product as a linear combination of the other products. I then use this to generate probabilities for the test set, and select the top 7 most likely products that people haven't already purchased in the training set.

3. This is a very simple model, since it models only on the existing products that a user has, and no other attributes.

4. The score could pretty easily be raised by adding more features

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

In [2]:
usecols = ['ncodpers', 'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
# 顧客id 所有產品       
df_train = pd.read_csv('train_ver2.csv', usecols=usecols)
sample = pd.read_csv('sample_submission.csv')

In [3]:
df_train.shape
# 1千3百多萬位顧客(未distinct)

(13647309, 25)

In [5]:
df_train.head()

Unnamed: 0,ncodpers,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,1375586,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
1,1050611,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
2,1050612,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
3,1050613,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
4,1050614,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0


In [6]:
df_train = df_train.drop_duplicates(['ncodpers'], keep='last')
df_train.shape
# 95萬位顧客(去重複、並取最新一筆)

df_train.fillna(0, inplace=True)
#NaN以0取代
#對於不知道就當作沒有持有產品 補零

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [7]:
#設置待會用到的東西
models = {}
model_preds = {}
id_preds = defaultdict(list)
ids = df_train['ncodpers'].values

In [8]:
for c in df_train.columns:
    
    if c != 'ncodpers':
        print(c)
        print("****** column *******")

        y_train = df_train[c]
        x_train = df_train.drop([c, 'ncodpers'], 1)
        clf = LogisticRegression()
        clf.fit(x_train, y_train)
        
        # 每個row選擇第2欄
        p_train = clf.predict_proba(x_train)[:,1]

        models[c] = clf #產品：模型
        model_preds[c] = p_train #產品:機率

        for id, p in zip(ids, p_train):
            id_preds[id].append(p)
        print(roc_auc_score(y_train, p_train))

ind_ahor_fin_ult1
****** column *******
0.792150558543
ind_aval_fin_ult1
****** column *******
0.979471273608
ind_cco_fin_ult1
****** column *******
0.650168684648
ind_cder_fin_ult1
****** column *******
0.788947355732
ind_cno_fin_ult1
****** column *******
0.971472644205
ind_ctju_fin_ult1
****** column *******
0.865553342852
ind_ctma_fin_ult1
****** column *******
0.661593708983
ind_ctop_fin_ult1
****** column *******
0.612791828474
ind_ctpp_fin_ult1
****** column *******
0.722375922208
ind_deco_fin_ult1
****** column *******
0.888840308649
ind_deme_fin_ult1
****** column *******
0.867520840878
ind_dela_fin_ult1
****** column *******
0.781944317185
ind_ecue_fin_ult1
****** column *******
0.775343267456
ind_fond_fin_ult1
****** column *******
0.856575049545
ind_hip_fin_ult1
****** column *******
0.918093547734
ind_plan_fin_ult1
****** column *******
0.848119443503
ind_pres_fin_ult1
****** column *******
0.709739096492
ind_reca_fin_ult1
****** column *******
0.834413130255
ind_tjcr_fin_

In [13]:
already_active = {}
for row in df_train.values:
    row = list(row)
    id = row.pop(0)
    #移除list中給定位置的項目>>>>移除ncodpers
    
    #選出每個顧客已經擁有的產品
    active = [c[0] for c in zip(df_train.columns[1:], row) if c[1] > 0]
    already_active[id] = active

In [25]:
# id = 1375586
print(already_active[1375586])

['ind_cco_fin_ult1', 'ind_recibo_ult1']


In [31]:
print(id_preds[1375586])

[4.2596560878065235e-05, 2.3572440973085176e-05, 0.92711565085862091, 0.00025642109982110527, 0.0092611805653184821, 1.1748493143428829e-06, 0.016360644691956836, 0.10944882954982063, 0.055201350688515526, 0.00015803611735271158, 0.00053592831449668398, 0.018302861379988978, 0.12154223427738729, 0.0083740260218876251, 0.013146299528762661, 0.0062412048556186785, 0.0013280384515513286, 0.090503270396081401, 0.049401971574750077, 0.023666775372114458, 0.0020094198375437688, 3.7254623456581045e-05, 0.00084906668685975103, 0.074965022270822884]


In [9]:
train_preds = {}
for id, p in id_preds.items():
    # 挑選每位顧客前7名的產品，並排除顧客已經擁有的產品
    preds = [i[0] for i in sorted([i for i in zip(df_train.columns[1:], p) if i[0] not in already_active[id]], key=lambda i:i [1], reverse=True)[:7]]
    train_preds[id] = preds

In [10]:
test_preds = []
for row in sample.values:
    # sample data的id
    id = row[0]
    # 找出train好的顧客，並處理成等等要匯出csv的樣子
    p = train_preds[id]
    test_preds.append(' '.join(p))

In [11]:
sample['added_products'] = test_preds
sample.to_csv('collab_sub.csv', index=False)