In [1]:
import pandas as pd
import numpy as np
import sys
from IPython.core.debugger import Tracer; debug_here = Tracer()


coupon_list_train = pd.read_csv('../input/coupon_list_train.csv')
coupon_list_test = pd.read_csv('../input/coupon_list_test.csv')
user_list = pd.read_csv('../input/user_list.csv')
coupon_purchase_train = pd.read_csv('../input/coupon_detail_train.csv')
user_list = pd.read_csv('../input/user_list.csv')

purchased_coupons_train = coupon_purchase_train.merge(coupon_list_train,
                                                 on='COUPON_ID_hash',
                                                 how='inner')
                                                 
## ユーザーの年齢を年齢層に分ける

generation = [10,20,30,40,50,60,100]
generation_name = ['10代','20代','30代','40代','50代','60以上']

def conv_age2gen(user_list, generation, generation_name): #データフレームは引数としてコピーを渡すので，値を変えたら呼び出し元でも値が変わる
    user_list['GENERATION'] = ' '
    for i in range(len(generation)-1):
        user_list['GENERATION'][(user_list['AGE']>=generation[i]) & (user_list['AGE']<generation[i+1])] = generation_name[i]

conv_age2gen(user_list, generation, generation_name)

purchased_coupons_train = pd.merge(purchased_coupons_train, user_list, on='USER_ID_hash', how='inner')

### filter redundant features
features = ['COUPON_ID_hash', 'USER_ID_hash',
            'SEX_ID','GENERATION','GENRE_NAME', 'DISCOUNT_PRICE', 'PRICE_RATE',
            'USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU',
            'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY',
            'USABLE_DATE_BEFORE_HOLIDAY', 'large_area_name', 'ken_name', 'small_area_name']
purchased_coupons_train = purchased_coupons_train[features]

### create 'dummyuser' records in order to merge training and testing sets in one
coupon_list_test['USER_ID_hash'] = 'dummyuser'
coupon_list_test['SEX_ID'] = 'dummysex'
coupon_list_test['GENERATION'] = 'dummygeneration'

### filter testing set consistently with training set
coupon_list_test = coupon_list_test[features]

### merge set together
combined = pd.concat([purchased_coupons_train, coupon_list_test], axis=0)

### create two new features
combined['DISCOUNT_PRICE'] = 1 / np.log10(combined['DISCOUNT_PRICE'])
combined['PRICE_RATE'] = (combined['PRICE_RATE'] / 100) ** 2


### convert categoricals to OneHotEncoder form
categoricals = ['SEX_ID','GENERATION','GENRE_NAME', 'USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED',
                'USABLE_DATE_THU', 'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN',
                'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY', 'large_area_name', 'ken_name', 'small_area_name']
combined_categoricals = combined[categoricals]
combined_categoricals = pd.get_dummies(combined_categoricals,
                                    dummy_na=False)

### leaving continuous features as is, obtain transformed dataset
continuous = list(set(features) - set(categoricals))
combined = pd.concat([combined[continuous], combined_categoricals], axis=1)

### remove NaN values
NAN_SUBSTITUTION_VALUE = 1
combined = combined.fillna(NAN_SUBSTITUTION_VALUE)

### split back into training and testing sets
train = combined[combined['USER_ID_hash'] != 'dummyuser']
test = combined[combined['USER_ID_hash'] == 'dummyuser']
test.drop('USER_ID_hash', inplace=True, axis=1) 


### アイテムごとの特徴行列を作成
item_train_profiles = train.groupby('COUPON_ID_hash').mean()
item_test_profiles = test.groupby('COUPON_ID_hash').mean()

### アイテム間の類似度を計算
item_similarity_scores = np.dot(item_test_profiles, item_train_profiles.T)

## アイテム間の類似度をDataFrameに直す
index_itemsim = item_test_profiles.index
columns_itemsim = item_train_profiles.index
itemsim_df = pd.DataFrame(index=index_itemsim, columns=columns_itemsim, data=item_similarity_scores)

### アイテム間の類似度 top100 をリストで返す関数
def get_top100_user_hashes_list(row):
    row.sort()

    return row.index[-100:][::-1].tolist()

top100_simitem = itemsim_df.apply(get_top100_user_hashes_list, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
index_top100 = top100_simitem.values[:len(top100_simitem)]##各アイテム類似度 top100 のインデックス

In [17]:
#tem_train_profiles.ix[index_top100[1]].mean()
#len(test_index[0])
#len(index_top100)

In [19]:
customer_feature = ['SEX_ID_f','SEX_ID_m','GENERATION_10代','GENERATION_20代','GENERATION_30代',
                                        'GENERATION_40代','GENERATION_50代','GENERATION_60以上']

In [20]:
temp_df = pd.DataFrame(index=item_test_profiles.index, columns=customer_feature)
for i in range(len(index_top100)):
    temp_df.ix[i] = item_train_profiles.ix[index_top100[i]].mean()

In [82]:
item_test_profiles[customer_feature] = temp_df[customer_feature]

In [33]:
train_dropped_coupons = train.drop('COUPON_ID_hash', axis=1)
user_profiles = train_dropped_coupons.groupby(by='USER_ID_hash').apply(np.mean)

In [39]:
### creating weight matrix for features
### 各特徴の重みを格納した辞書型オブジェクトの生成
FEATURE_WEIGHTS = {
    'GENRE_NAME': 2,
    'DISCOUNT_PRICE': 2,
    'PRICE_RATE': 0,
    'USABLE_DATE_': 0,
    'large_area_name': 0.5,
    'ken_name': 1,
    'small_area_name': 5,
    'SEX_ID': 2,
    'GENERATION': 1
}

# dict lookup helper
# 与えられた特徴の名前に対して，辞書に登録してある重みを返す
def find_appropriate_weight(weights_dict, colname):
    for col, weight in weights_dict.items(): #辞書のitems()メソッドで全ての key(col), value(weight) をたどる
        if col in colname:  #各ダミー特徴には それぞれの元のキー名が先頭に付いているので，その文字列が含まれるかどうかで判断できる
            return weight
    raise ValueError #呼び出し元にエラーを返し処理を任せる

W_values = [find_appropriate_weight(FEATURE_WEIGHTS, colname)
            for colname in user_profiles.columns] # for文で代入することで n×1 行列を作ることが出来る
W = np.diag(W_values)



In [49]:

similarity_scores = np.dot(np.dot(user_profiles, W),
                           item_test_profiles.T)

coupons_ids = test['COUPON_ID_hash']
index = user_profiles.index
columns = [coupons_ids[i] for i in range(0, similarity_scores.shape[1])]
result_df = pd.DataFrame(index=index, columns=columns,
                      data=similarity_scores)

### obtain string of top10 hashes according to similarity scores for every user
def get_top10_coupon_hashes_string(row):
    row.sort()
    return ' '.join(row.index[-10:][::-1].tolist()) #スーペースを区切り文字として，index（ = COUPON_ID）top 10 を結合したものをかえす
                                                    #[-10:]は最後から10個目から最後までを指定
                                                    #[::-1]は取り出した要素を逆順にする
                                                    #.tolist でリスト化

output = result_df.apply(get_top10_coupon_hashes_string, axis=1)

In [59]:
output_df = pd.DataFrame(data={'USER_ID_hash': output.index,
                               'PURCHASED_COUPONS': output.values})
output_df_all_users = pd.merge(user_list, output_df, how='left', on='USER_ID_hash')
output_df_all_users.to_csv('cosine_sim_python_plus_SEXID&GENERATION.csv', header=True,
                           index=False, columns=['USER_ID_hash', 'PURCHASED_COUPONS'])

In [60]:
output_df_all_users.columns

Index(['REG_DATE', 'SEX_ID', 'AGE', 'WITHDRAW_DATE', 'PREF_NAME',
       'USER_ID_hash', 'GENERATION', 'PURCHASED_COUPONS'],
      dtype='object')

In [61]:
top100_simitem

COUPON_ID_hash
007dce05be7f473f362e79bfe7256736    [b083e69497a41124a2f401c5e7079d32, 4ae23f01dd4...
00fcc93438a282f8b915777a209dd0bd    [043836a27b467325380ee4bc08d52dc9, 200d3717644...
01023eac71c13f529f43cfb691d914c9    [5a46b5cc91d6397e61700052a4be5e1e, 60e42979b37...
0147fd072c6ef4e299d428f19a5f0e01    [35062a0e618c270ca61336586d9ae70f, 23ce3cd93d9...
017956f30c352df708358a6d1219fb12    [043836a27b467325380ee4bc08d52dc9, 200d3717644...
01f995ae8c168d5eb7b0fd69cf8ee6b2    [4ae23f01dd4d5e20e4f6347ab3a66ac1, 1a613708a7e...
03876bc74d321132f40d2ad6c8c420f9    [7e02c76a89e4b495d37e5ad83cca8a1c, c5dd8bc3e1c...
059a13d7779c38f345a132280caa05ed    [4ae23f01dd4d5e20e4f6347ab3a66ac1, 1a613708a7e...
05c58bb36b58b4928c6b0a46a0cedfeb    [ea60bdfa51a6b065610ba5a9f5668777, 33b07d89407...
06696e5b4afee54a10cc1c0637a83913    [f2b03a8f46704f38e6903a4e6f474f6d, 23ce3cd93d9...
0855a38feec9156f8849b7b89eaccdec    [23ce3cd93d95d2985d85f656655a100a, 35062a0e618...
0871d5de805bf7f26fca485d24ec65e9    [90