user特徴量  
参考：https://amalog.hateblo.jp/entry/kaggle-feature-management

In [None]:
import os
import gc
from pathlib import Path
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from abc import ABC, abstractmethod
from tqdm import tqdm
import pickle
from collections import defaultdict
from typing import List, Dict, Any, Union

# Show up to 100 columns and 100 rows when a DataFrame is displayed.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100


In [None]:
# Memory reduction (numeric columns only)
def reduce_mem_usage_for_numeric(df):
    """Downcast the int/float columns of ``df`` to smaller dtypes, in place.

    A column is treated as integer when its dtype name contains ``"int"``
    (this also matches unsigned dtypes, which therefore become signed) and as
    float when the name contains ``"float"``. Integer columns are cast to the
    smallest of int8/int16/int32/int64 whose range contains the column's
    min and max; the bounds are inclusive, so a column whose max is exactly
    127 still fits int8. Float columns are cast to float32 when min and max
    lie within the float32 range, otherwise to float64. All other columns are
    left untouched. Memory usage before and after is printed.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        dtype_name = str(df[col].dtype)
        is_int = "int" in dtype_name
        if not is_int and "float" not in dtype_name:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_int:
            # Pick the narrowest signed integer type that holds the full range.
            # If min/max are NaN every comparison is False and the column is
            # left as it is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float16 is deliberately not used (the original note indicates it
            # is not supported by later processing).
            # NOTE: float32 keeps roughly 7 significant digits, so values may
            # be rounded by this cast.
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Avoid a 0/0 (nan plus a runtime warning) when the frame reports no memory.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decreased))

    return df


In [None]:
# Base class: holds the methods that return/save the generated data and the
# abstract method used for feature generation.
class OttoFeatures(ABC):
    """Abstract base for feature generators that produce a single DataFrame.

    Subclasses implement ``create_features`` and store their result in
    ``self.output_df``; ``get`` returns that frame and ``save`` pickles it to
    ``<output_dir>/<ClassName>.pkl``.
    """

    def __init__(self, output_dir):
        """Remember the output location.

        Args:
            output_dir: Directory (str or path-like) the pickle is written to.
        """
        self.output_dir = output_dir
        # The concrete subclass name doubles as the feature-set/file name.
        self.name = self.__class__.__name__
        self.output_path = Path(self.output_dir) / f"{self.name}.pkl"

    @abstractmethod
    def create_features(self, df):
        """Build the features and store them in ``self.output_df``.

        The base implementation simply stores ``df`` unchanged; it is only
        reachable through ``super().create_features(df)``.
        """
        self.output_df = df

    # Return the processed DataFrame
    def get(self) -> pd.DataFrame:
        """Return the frame stored by ``create_features``.

        Raises:
            AttributeError: If ``create_features`` has not stored a result yet.
        """
        return self.output_df

    # Save the processed DataFrame
    def save(self):
        """Pickle ``self.output_df`` to ``self.output_path``.

        The target directory is created first (including parents) so that
        saving into a not-yet-existing output directory does not fail.
        """
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_df.to_pickle(self.output_path)


In [None]:
# Competition folder (root of the project data); fill in before running.
ROOT = ""
OUTPUT_DIR = ""

# Load the preprocessed train and test tables.
train = pd.read_pickle(f"{ROOT}/data/input/processed_data2/train.pkl")
test = pd.read_pickle(f"{ROOT}/data/input/processed_data2/test.pkl")

# Stack train on top of test with a fresh 0..n-1 index.
train_test = pd.concat([train, test], ignore_index=True)




# base

In [None]:
class BaseUserFeatures(OttoFeatures):
    """Per-session aggregates: row count, distinct ``aid`` count, mean ``type``."""

    def create_features(self, df=None):
        """Aggregate ``df`` by ``session`` and store the result in ``self.output_df``.

        The resulting frame is indexed by ``session`` and has three float
        columns: ``user_user_count`` (rows per session), ``user_item_count``
        (distinct ``aid`` values per session) and ``user_buy_ratio`` (mean of
        ``type`` per session).
        NOTE(review): how ``type`` is encoded is not visible here, so the
        exact meaning of ``user_buy_ratio`` should be confirmed.

        Args:
            df: Table with ``session``, ``aid`` and ``type`` columns. When
                omitted, the module-level ``test`` DataFrame is used, which is
                what this method always did before the parameter existed.
        """
        if df is None:
            df = test

        user_features = df.groupby("session").agg(
            {"session": "count", "aid": "nunique", "type": "mean"}
        )
        user_features.columns = [
            "user_user_count",
            "user_item_count",
            "user_buy_ratio",
        ]

        # Cast everything to float first so the downcast below yields a
        # uniform float dtype for all feature columns.
        user_features = user_features.astype(float)
        self.output_df = reduce_mem_usage_for_numeric(user_features)


In [None]:
# Instantiate the base user-feature generator; its pickle goes under OUTPUT_DIR.
buf = BaseUserFeatures(output_dir=OUTPUT_DIR)
# Generate the features
buf.create_features()


In [None]:
# Preview the first rows of the generated feature table.
buf.get().head()


In [None]:
# Pickle the generated features to buf.output_path.
buf.save()
