item特徴量  
参考：https://amalog.hateblo.jp/entry/kaggle-feature-management

In [None]:
import os
import gc
from pathlib import Path
from collections import Counter
from cuml import UMAP, TSNE, PCA, KMeans
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import datetime
from gensim.models import Word2Vec
# from sklearn.manifold import TSNE
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from abc import ABC, abstractmethod
from tqdm import tqdm
import pickle
from collections import defaultdict
from typing import List, Dict, Any, Union

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)


In [None]:
# Memory reduction (numeric columns only)
def reduce_mem_usage_for_numeric(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes.

    Columns whose dtype name contains ``"int"`` are cast to the narrowest of
    int8 / int16 / int32 / int64 whose range contains the column's minimum and
    maximum (bounds inclusive). Columns whose dtype name contains ``"float"``
    are cast to float32 when their range allows it, otherwise to float64.
    All other columns are left untouched.

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed.

    Args:
        df: DataFrame whose numeric columns should be shrunk.

    Returns:
        The same ``df`` object, with its numeric columns downcast.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Ordered narrowest first so the first dtype that fits is the smallest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        dtype_name = str(df[col].dtype)
        is_int = "int" in dtype_name
        if not (is_int or "float" in dtype_name):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_int:
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly [-128, 127]
                # fits in int8 and must not be pushed to the next wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (e.g. an empty column whose min/max are NaN),
            # the column keeps its current dtype.
        else:
            # float16 is intentionally not used; the original note on the
            # disabled float16 branch cites a support limitation.
            # Note that the float32 cast can reduce precision.
            if c_min > float32_info.min and c_max < float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Avoid a 0/0 percentage when the frame reports no memory at all.
    if start_mem > 0:
        decreased = 100 * (start_mem - end_mem) / start_mem
    else:
        decreased = 0.0
    print("Decreased by {:.1f}%".format(decreased))

    return df


In [None]:
# Base class: owns the methods that return/save the data and declares the
# abstract method used to generate the features.
class OttoFeatures(ABC):
    """Abstract base for a feature set that is built once and pickled.

    Subclasses implement ``create_features`` and store their result in
    ``self.output_df``. The output file is named after the concrete class:
    ``<output_dir>/<ClassName>.pkl``.
    """

    def __init__(self, output_dir):
        """Remember the output location.

        Args:
            output_dir: Directory in which the pickle will be written.
        """
        self.output_dir = output_dir
        self.name = self.__class__.__name__
        self.output_path = Path(self.output_dir) / f"{self.name}.pkl"

    @abstractmethod
    def create_features(self, df):
        """Build the features and store them in ``self.output_df``.

        The base implementation simply stores ``df`` unchanged.
        """
        self.output_df = df

    def get(self) -> pd.DataFrame:
        """Return the processed frame stored by ``create_features``."""
        return self.output_df

    def save(self):
        """Pickle the processed frame to ``self.output_path``.

        The parent directory is created first, so saving into a directory
        that does not exist yet no longer fails.
        """
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_df.to_pickle(self.output_path)


In [None]:
ROOT = ""  # competition folder
OUTPUT_DIR = ""

# Training events, read from the "trim_train_3week" pickle.
train = pd.read_pickle(f"{ROOT}/data/input/trim_train_3week.pkl")

# Validation labels and validation sessions, read from their "1week" pickles.
valid_label = pd.read_pickle(f"{ROOT}/data/input/valid_label_1week.pkl")
valid_session = pd.read_pickle(f"{ROOT}/data/input/valid_trimed_session_1week.pkl")

In [None]:
# Stack the training events and the validation sessions into one frame with a
# fresh 0..n-1 index.
train_valid = pd.concat([train, valid_session], ignore_index=True)

# base

In [None]:
class BaseItemFeatures(OttoFeatures):
    """Per-item aggregate features, by default over train + validation events."""

    def create_features(self, df=None):
        """Aggregate events per ``aid`` and store the result in ``self.output_df``.

        Produces one row per ``aid`` (as the index) with three float columns:
        ``item_item_count`` (number of rows), ``item_user_count`` (number of
        distinct ``session`` values) and ``item_buy_ratio`` (mean of the raw
        ``type`` values).

        Args:
            df: Event-level frame with ``aid``, ``session`` and ``type``
                columns. Defaults to the module-level ``train_valid`` frame,
                so the original zero-argument call keeps working while the
                signature stays compatible with ``OttoFeatures.create_features``.
        """
        if df is None:
            df = train_valid

        item_features = df.groupby("aid").agg(
            {"aid": "count", "session": "nunique", "type": "mean"}
        )
        item_features.columns = ["item_item_count", "item_user_count", "item_buy_ratio"]
        # Make every feature column floating point before the downcast.
        item_features = item_features.astype(float)

        item_features = reduce_mem_usage_for_numeric(item_features)
        self.output_df = item_features


In [None]:
# Instantiate the feature builder; its pickle path is derived from OUTPUT_DIR.
bif = BaseItemFeatures(output_dir=OUTPUT_DIR)
# Generate the features
bif.create_features()



In [None]:
# Preview the first rows of the generated features.
bif.get().head()

In [None]:
# Print the column dtypes and memory summary of the generated frame.
bif.get().info()

In [None]:
# Write the feature frame to bif.output_path as a pickle.
bif.save()

# type count

In [None]:
class ItemCountFeatures(OttoFeatures):
    """Per-item event counts by ``type`` value, by default over train + validation."""

    def create_features(self, df=None):
        """Count, per ``aid``, how many rows have ``type`` 0, 1 and 2.

        The three counts are stored as float columns ``item_click_count``
        (type 0), ``item_cart_count`` (type 1) and ``item_order_count``
        (type 2), indexed by ``aid``, in ``self.output_df``.

        Args:
            df: Event-level frame with ``aid`` and ``type`` columns. Defaults
                to the module-level ``train_valid`` frame, so the original
                zero-argument call keeps working while the signature stays
                compatible with ``OttoFeatures.create_features``.
        """
        if df is None:
            df = train_valid

        # Counter returns 0 for a type value that never occurs in the group,
        # so every item gets all three columns.
        item_features = df.groupby("aid").agg(
            {
                "type": [
                    lambda x: float(Counter(x)[0]),
                    lambda x: float(Counter(x)[1]),
                    lambda x: float(Counter(x)[2]),
                ]
            }
        )
        item_features.columns = [
            "item_click_count",
            "item_cart_count",
            "item_order_count",
        ]

        item_features = reduce_mem_usage_for_numeric(item_features)
        self.output_df = item_features


In [None]:
# Instantiate the feature builder; its pickle path is derived from OUTPUT_DIR.
icf = ItemCountFeatures(output_dir=OUTPUT_DIR)
# Generate the features
icf.create_features()


In [None]:
# Preview the first rows of the generated features.
icf.get().head()

In [None]:
# Write the feature frame to icf.output_path as a pickle.
icf.save()

# valid only popularity features

In [None]:
# valid only
class PopularityFeatures(OttoFeatures):
    """Item popularity: number of occurrences of each ``aid``, by default in the validation sessions."""

    def create_features(self, df=None):
        """Count rows per ``aid`` and store an ``aid`` / ``popularity`` frame.

        The result has ``aid`` as a regular column (not the index) and is
        ordered by descending count, as returned by ``value_counts``.

        Args:
            df: Event-level frame with an ``aid`` column. Defaults to the
                module-level ``valid_session`` frame, so the original
                zero-argument call keeps working while the signature stays
                compatible with ``OttoFeatures.create_features``.
        """
        if df is None:
            df = valid_session

        # Name the index and the value column explicitly. Renaming the
        # columns produced by ``to_frame().reset_index()`` only works on
        # pandas < 2.0; on pandas >= 2.0 they are ("aid", "count"), so the old
        # mapping labeled the item ids "popularity".
        popularity = (
            df["aid"].value_counts().rename_axis("aid").reset_index(name="popularity")
        )

        popularity = reduce_mem_usage_for_numeric(popularity)
        self.output_df = popularity


In [None]:
# Instantiate the feature builder and generate the popularity features.
pf=PopularityFeatures(OUTPUT_DIR)
pf.create_features()

In [None]:
# Preview the first rows of the generated features.
pf.get().head()

In [None]:
# Write the feature frame to pf.output_path as a pickle.
pf.save()

# valid only type count

In [None]:
# valid only
class ItemCountFeatures2(OttoFeatures):
    """Per-item event counts by ``type`` value, by default over the validation sessions only."""

    def create_features(self, df=None):
        """Count, per ``aid``, how many rows have ``type`` 0, 1 and 2.

        The three counts are stored as float columns ``item_click_count2``
        (type 0), ``item_cart_count2`` (type 1) and ``item_order_count2``
        (type 2), indexed by ``aid``, in ``self.output_df``.

        Args:
            df: Event-level frame with ``aid`` and ``type`` columns. Defaults
                to the module-level ``valid_session`` frame, so the original
                zero-argument call keeps working while the signature stays
                compatible with ``OttoFeatures.create_features``.
        """
        if df is None:
            df = valid_session

        # Counter returns 0 for a type value that never occurs in the group,
        # so every item gets all three columns.
        item_features = df.groupby("aid").agg(
            {
                "type": [
                    lambda x: float(Counter(x)[0]),
                    lambda x: float(Counter(x)[1]),
                    lambda x: float(Counter(x)[2]),
                ]
            }
        )
        item_features.columns = [
            "item_click_count2",
            "item_cart_count2",
            "item_order_count2",
        ]

        item_features = reduce_mem_usage_for_numeric(item_features)
        self.output_df = item_features


In [None]:
# Instantiate the feature builder and generate the validation-only counts.
icf2=ItemCountFeatures2(OUTPUT_DIR)
icf2.create_features()

In [None]:
# Preview the first rows of the generated features.
icf2.get().head()

In [None]:
# Write the feature frame to icf2.output_path as a pickle.
icf2.save()

# valid only base

In [None]:
# valid only
class BaseItemFeatures2(OttoFeatures):
    """Per-item aggregate features, by default over the validation sessions only."""

    def create_features(self, df=None):
        """Aggregate events per ``aid`` and store the result in ``self.output_df``.

        Produces one row per ``aid`` (as the index) with three float columns:
        ``item_item_count2`` (number of rows), ``item_user_count2`` (number of
        distinct ``session`` values) and ``item_buy_ratio2`` (mean of the raw
        ``type`` values).

        Args:
            df: Event-level frame with ``aid``, ``session`` and ``type``
                columns. Defaults to the module-level ``valid_session`` frame,
                so the original zero-argument call keeps working while the
                signature stays compatible with
                ``OttoFeatures.create_features``.
        """
        if df is None:
            df = valid_session

        item_features = df.groupby("aid").agg(
            {"aid": "count", "session": "nunique", "type": "mean"}
        )
        item_features.columns = ["item_item_count2", "item_user_count2", "item_buy_ratio2"]
        # Make every feature column floating point before the downcast.
        item_features = item_features.astype(float)

        item_features = reduce_mem_usage_for_numeric(item_features)
        self.output_df = item_features


In [None]:
# Instantiate the feature builder and generate the validation-only aggregates.
bif2=BaseItemFeatures2(OUTPUT_DIR)
bif2.create_features()

In [None]:
# Preview the first rows of the generated features.
bif2.get().head()

In [None]:
# Write the feature frame to bif2.output_path as a pickle.
bif2.save()