好きな用途に利用できるnotebookです。  
データの可視化や簡単なモデルの構築などにご利用下さい。

## 必要なライブラリのimport

In [2]:
import warnings
import time
import sys
import datetime
import pickle
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Notebook-wide settings: suppress FutureWarning messages and raise the
# pandas column display limit to 500.
warnings.simplefilter('ignore', FutureWarning)
pd.options.display.max_columns = 500

In [3]:
# Memory reduction
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the first of int8, int16, int32, int64 whose limits contain the
    column's min and max (limits inclusive). Float columns are cast to
    float16 or float32 on the same range test, otherwise to float64.

    NOTE: only the value range is checked, not precision, so casting floats
    to float16/float32 can round the stored values.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, with the downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first so the first match wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower matched (out of range, or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## データの読み込み

In [18]:
# Default: local checkout layout. The same relative paths are used on every
# OS, so both names are always defined before the cells that join onto them.
path = '../../../data/processed/'
data_path = '../../../data/elo-merchant-category-recommendation/'

# Kaggle (non-Windows only, detected via KAGGLE_DATA_PROXY_TOKEN): read from
# the mounted input directory instead.
if os.name != 'nt' and 'KAGGLE_DATA_PROXY_TOKEN' in os.environ:
    # NOTE(review): directory name spelled to match the local dataset
    # directory above; confirm it against the actual mount on Kaggle.
    data_path = '/kaggle/input/elo-merchant-category-recommendation/'
    # TODO(review): confirm where the processed20240618_*.csv files live on
    # Kaggle; until then they are looked up in the same input directory.
    path = data_path

# FIXME:
PATHを修正すること！！！

In [19]:
# FIXME: PATH
# Processed train / test CSVs are resolved under `path`.
train_path, test_path = (
    os.path.join(path, csv_name)
    for csv_name in ('processed20240618_train.csv', 'processed20240618_test.csv')
)

# Transaction CSVs are resolved under `data_path`.
new_transactions_path, historical_transactions_path = (
    os.path.join(data_path, csv_name)
    for csv_name in ('new_merchant_transactions.csv', 'historical_transactions.csv')
)



In [10]:
# Load the processed train / test tables with default parsing.
df_train, df_test = (
    pd.read_csv(csv_file) for csv_file in (train_path, test_path)
)

# Load both transaction tables, parsing `purchase_date` as datetimes.
new_transactions, historical_transactions = (
    pd.read_csv(csv_file, parse_dates=['purchase_date'])
    for csv_file in (new_transactions_path, historical_transactions_path)
)

In [16]:
# Column names only (a plain list, not a DataFrame) -- presumably the columns
# of the original, unprocessed train file; verify against the raw data.
df_original_train = [
    'card_id',
    'first_active_month',
    'feature_1',
    'feature_2',
    'feature_3',
]

# Features with the highest feature_importance as of now (20240618, hoji_model).
df_importance_top10 = [
    'new_purchase_date_max',
    'auth_purchase_date_max',
    'auth_month_diff_mean',
    'hist_month_diff_mean',
    'new_purchase_amount_max',
    'new_purchase_date_ptp',
    'new_purchase_month_mean',
    'auth_purchase_date_ptp',
    'new_month_lag_mean',
    'purchase_amount_max_mean',
]

# Open question: which of these is actually helping?
df_train[df_importance_top10].head(n=20)
# The date values come out extremely large -- is that acceptable?
# Also, month_diff should probably be measured against a fixed reference date
# rather than current_time.

Unnamed: 0,new_purchase_date_max,auth_purchase_date_max,auth_month_diff_mean,hist_month_diff_mean,new_purchase_amount_max,new_purchase_date_ptp,new_purchase_month_mean,auth_purchase_date_ptp,new_month_lag_mean,purchase_amount_max_mean
0,1525001000.0,1519551000.0,76.744939,76.846154,-0.2961,4742309.0,3.478261,20977987.0,1.478261,-0.158815
1,1522393000.0,1517438000.0,77.731563,77.545455,-0.7017,4887632.0,2.5,33717687.0,1.5,0.510664
2,1524937000.0,1519759000.0,76.731707,77.0,-0.7,0.0,4.0,35635623.0,2.0,-0.62109
3,1524049000.0,1519818000.0,76.636364,,-0.567,3625505.0,3.714286,13375339.0,1.714286,-0.097344
4,1524941000.0,1519850000.0,76.664062,76.4,0.451,4949682.0,3.555556,9405641.0,1.555556,2.259349
5,1505510000.0,1501343000.0,83.8125,83.0,-9e-05,2717302.0,8.75,17274145.0,1.75,-0.240106
6,1523037000.0,1519402000.0,76.884774,76.882353,-0.6885,2697311.0,3.2,35783408.0,1.2,-0.635468
7,1518986000.0,1513885000.0,78.473684,78.666667,-0.6973,1085016.0,2.0,7251596.0,2.0,-0.347811
8,1521581000.0,1512825000.0,77.0,77.0,-0.5806,284431.0,3.0,9751281.0,1.0,-0.515289
9,1523196000.0,1519837000.0,76.848485,76.928571,-0.666,621055.0,4.0,35859021.0,2.0,-0.555582


In [13]:
# Count missing values per top-importance feature.
df_train[df_importance_top10].isna().sum()
# TODO (later): work out why these are missing. Probably cards with no rows in
# the new_* data end up NULL (every new_* column has the same missing count);
# the hist_* gap is likely cards with no rows in the hist data.


new_purchase_date_max       21931
auth_purchase_date_max          0
auth_month_diff_mean            0
hist_month_diff_mean        30959
new_purchase_amount_max     21931
new_purchase_date_ptp       21931
new_purchase_month_mean     21931
auth_purchase_date_ptp          0
new_month_lag_mean          21931
purchase_amount_max_mean        0
dtype: int64

In [20]:
# Is this really the right MAX for the date data? (to be checked)
# new_transactions

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.569580,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963026,Y,C_ID_1320dee851,142,N,0,A,309,M_ID_7754b67f3b,2,-0.701828,2018-04-06 14:36:52,3.0,19,21
1963027,Y,C_ID_f112aa3381,158,N,0,A,560,M_ID_da063195b7,2,-0.694390,2018-03-07 13:19:18,1.0,15,34
1963028,Y,C_ID_bd97b86450,69,N,1,B,278,M_ID_9a9ccb6544,1,-0.621031,2018-03-05 12:04:56,1.0,9,37
1963029,Y,C_ID_c0513fd84f,130,N,0,A,367,M_ID_40c28d596f,1,-0.656749,2018-03-09 14:47:05,3.0,7,16
