In [1]:
!pip install -r requirements.txt
import pandas as pd
import numpy as np
from datetime import datetime
import holidays

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, f1_score, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.cluster import KMeans
import category_encoders as ce

import lightgbm as lgb
from tslearn.clustering import TimeSeriesKMeans, KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.metrics import cdist_dtw
from tslearn.barycenters import dtw_barycenter_averaging

import warnings
import requests
import os
import tqdm
from scipy.spatial import distance, KDTree

from google.cloud import bigquery
from google.oauth2 import service_account

warnings.simplefilter('ignore')

Collecting lightgbm==3.3.2 (from -r requirements.txt (line 1))
  Using cached lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
Collecting holidays (from -r requirements.txt (line 2))
  Obtaining dependency information for holidays from https://files.pythonhosted.org/packages/5d/8c/bad7c11afc8969834728c0678bbf8f3ec5dba4c4ac7f5ad8cf91d63e865f/holidays-0.40-py3-none-any.whl.metadata
  Using cached holidays-0.40-py3-none-any.whl.metadata (21 kB)
Collecting pandas_datareader (from -r requirements.txt (line 3))
  Using cached pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Collecting tslearn (from -r requirements.txt (line 4))
  Obtaining dependency information for tslearn from https://files.pythonhosted.org/packages/97/22/8dba9a7149d51fe0b6163a5a6b7efc315ab3c097cb6b0d1fc649a03f2722/tslearn-0.6.3-py3-none-any.whl.metadata
  Using cached tslearn-0.6.3-py3-none-any.whl.metadata (14 kB)
Collecting category_encoders (from -r requirements.txt (line 5))
  Obtaining dependency information 

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [None]:
# Read the three input tables; the relative paths resolve against the
# notebook's working directory.
df, test, target = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'target.csv')
)

def category(df):
    """Cast every ``object``-dtype column of ``df`` to pandas ``category`` dtype.

    The columns are reassigned on ``df`` itself (the caller's frame is
    modified) and the same frame object is returned for convenience.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = df[name].astype('category')
    return df
def zero_process(df):
    """Append one row per (date, target combination) to ``df`` and zero-fill gaps.

    For every distinct value in ``df['date']`` a copy of the notebook-level
    ``target`` table (without ``product_ProductName`` and ``mean_used_num``)
    is stamped with that date and appended to ``df``. The combined frame is
    sorted by ``date`` and missing ``lineItem_UsageAccountId`` /
    ``sum_num_machine`` values are replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The padded, date-sorted frame with a fresh ``0..n-1`` index.
    """
    combos = target.drop(columns=['product_ProductName', 'mean_used_num'])

    # Build all per-date copies first and concatenate once; growing a frame
    # with concat inside the loop re-copies everything on each iteration.
    per_date = [combos.assign(date=date) for date in df['date'].unique()]
    # pd.concat rejects an empty list, so fall back to an empty frame.
    filler = pd.concat(per_date, axis=0) if per_date else pd.DataFrame()

    df = pd.concat([df, filler]).sort_values(by='date')
    df = df.fillna({'lineItem_UsageAccountId': 0, 'sum_num_machine': 0})
    # reset_index returns a new frame; its result has to be kept, otherwise
    # the duplicated index labels produced by concat leak out to the caller.
    return df.reset_index(drop=True)

def get_test(df):
    """Build the test feature frame: one row per date and target combination.

    Steps: pad ``df`` with ``zero_process``, drop the usage columns, keep one
    row per (date, customer, region, operating system, instance type), inner
    join against the notebook-level ``target`` table, and add ``year`` /
    ``month`` / ``day`` / ``weekday`` columns derived from ``date``.

    ``weekday`` is the integer day of week (Monday=0 ... Sunday=6), the same
    encoding ``get_train`` produces and the one ``cyclical_encoding`` needs
    for its ``astype('int32')`` cast. Day names here would leave the train
    and test features on different category sets.

    Unlike ``get_train`` the ``date`` column is kept in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test table; must contain ``date``, the four key columns,
        ``lineItem_UsageAccountId`` and ``sum_num_machine``.

    Returns
    -------
    pandas.DataFrame
        Feature frame with object columns converted to ``category``.
    """
    key_columns = ['customer', 'product_region', 'product_operatingSystem', 'product_instanceType']

    df = zero_process(df)
    df = df.drop(columns=['lineItem_UsageAccountId', 'sum_num_machine'])
    df = df.drop_duplicates(subset=['date'] + key_columns)
    df = df.merge(target, on=key_columns)
    df = df.drop(columns=['product_ProductName', 'mean_used_num'])

    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    # Vectorised replacement for a per-row loop. The object cast makes
    # category() turn the column into a categorical, as get_train does.
    df['weekday'] = df['date'].dt.weekday.astype(object)

    df = category(df)
    return df
    
def get_train(df):
    """Build the training frame with the per-day machine-count target.

    Steps: pad ``df`` with ``zero_process``, sum ``sum_num_machine`` over each
    (date, customer, region, operating system, instance type) group into
    ``total_sum_num_machine``, keep one row per group, inner join against the
    notebook-level ``target`` table, and add ``year`` / ``month`` / ``day`` /
    ``weekday`` columns derived from ``date``. The ``date`` column itself is
    dropped from the result.

    ``weekday`` is the integer day of week (Monday=0 ... Sunday=6).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training table; must contain ``date``, the four key columns,
        ``lineItem_UsageAccountId`` and ``sum_num_machine``.

    Returns
    -------
    pandas.DataFrame
        Feature frame including ``total_sum_num_machine``, with object
        columns converted to ``category``.
    """
    key_columns = ['customer', 'product_region', 'product_operatingSystem', 'product_instanceType']
    group_columns = ['date'] + key_columns

    df = zero_process(df)
    df = df.drop(columns='lineItem_UsageAccountId')
    # Sum before de-duplicating so every surviving row carries its group total.
    df['total_sum_num_machine'] = df.groupby(group_columns)['sum_num_machine'].transform('sum')
    df = df.drop_duplicates(subset=group_columns)
    df = df.merge(target, on=key_columns)
    df = df.drop(columns=['product_ProductName', 'mean_used_num', 'sum_num_machine'])

    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    # Vectorised replacement for a per-row loop. The object cast makes
    # category() turn the column into a categorical of integer codes.
    df['weekday'] = df['date'].dt.weekday.astype(object)

    df = category(df)
    return df.drop(columns='date')

# Replace the raw tables with their processed versions.
# NOTE: get_train drops `date`, which zero_process reads, so this cell cannot
# be re-run on its own; re-run the CSV-loading statements first.
df = get_train(df)
test = get_test(test)

In [None]:
# Separate the target column from the remaining feature columns.
X = df.drop(columns='total_sum_num_machine')
y = df['total_sum_num_machine']

## サイクリカルエンコーディング

In [None]:
import numpy as np

def cyclical_encoding(df):
    """Add sine/cosine encodings of month, weekday and day-of-month to ``df``.

    New columns: ``sin_month`` / ``cos_month`` (period 12),
    ``sin_day_of_week`` / ``cos_day_of_week`` (period 7) and ``sin_day`` /
    ``cos_day`` (period = number of days in the row's month). ``weekday`` is
    cast to ``int32`` as a side effect.

    February counts as 29 days when ``df`` has a ``year`` column and the
    row's year is a leap year; otherwise it counts as 28. Without this,
    Feb 29 yields a phase above one full turn and lands on the same
    encoding as Feb 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``month`` and ``day`` columns and a ``weekday``
        column castable to ``int32``. ``year`` is optional. The frame is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the encoding columns added.
    """
    two_pi = 2 * np.pi

    df['sin_month'] = np.sin(two_pi * df['month'] / 12)
    df['cos_month'] = np.cos(two_pi * df['month'] / 12)

    df['weekday'] = df['weekday'].astype('int32')
    df['sin_day_of_week'] = np.sin(two_pi * df['weekday'] / 7)
    df['cos_day_of_week'] = np.cos(two_pi * df['weekday'] / 7)

    month = df['month']
    # Any month outside the 31- and 30-day lists falls through to 28.
    days_in_month = np.select(
        [month.isin([1, 3, 5, 7, 8, 10, 12]), month.isin([4, 6, 9, 11])],
        [31, 30],
        default=28,
    )
    if 'year' in df.columns:
        # Float conversion keeps missing years as NaN, which compare False
        # below and therefore stay on the 28-day fallback.
        year = np.asarray(df['year'], dtype='float64')
        is_leap = (year % 4 == 0) & ((year % 100 != 0) | (year % 400 == 0))
        is_february = np.asarray(month == 2)
        days_in_month = np.where(is_february & is_leap, 29, days_in_month)

    df['sin_day'] = np.sin(two_pi * df['day'] / days_in_month)
    df['cos_day'] = np.cos(two_pi * df['day'] / days_in_month)

    return df

In [None]:
# cyclical_encoding adds the sin/cos columns to X in place and returns the
# same frame; the bare `X` on the last line displays the result.
X = cyclical_encoding(X)
X

## 単純ターゲットエンコーディング

In [None]:
from sklearn.model_selection import KFold
def simple_target_enc(col):
    """Replace ``col`` with an out-of-fold target encoding, ``target_<col>``.

    Works on the notebook-level ``X`` (features), ``y`` (target) and ``test``
    frames:

    * ``X`` is split with an unshuffled 5-fold ``KFold``; for each fold a
      ``TargetEncoder`` is fitted on the other four folds and used to encode
      the held-out rows, so no row is encoded with its own target value.
    * ``test`` is encoded with a ``TargetEncoder`` fitted on all of ``X``.
    * ``col`` is then dropped from both frames.

    Side effects: ``test`` is modified in place (the encoded column is added
    and ``col`` removed); ``X`` is left untouched. Because ``col`` is removed,
    the call cannot be repeated for the same column once the caller rebinds
    ``X`` to the result.

    Parameters
    ----------
    col : str
        Name of the column, present in both ``X`` and ``test``, to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        A copy of ``X`` with ``target_<col>`` in place of ``col``, and the
        (mutated) ``test`` frame.
    """
    #skf = StratifiedKFold(n_splits=5)
    kf = KFold(n_splits=5)
    encoded_name = f'target_{col}'

    # Collect the held-out encodings positionally. This avoids writing a new
    # column into an ``X.iloc[...]`` slice and avoids an index-based merge,
    # which would duplicate rows if X's index were not unique.
    out_of_fold = np.full(len(X), np.nan)
    for train_idx, val_idx in kf.split(X, y):
        fold_encoder = ce.TargetEncoder()
        fold_encoder.fit(X.iloc[train_idx][col], y.iloc[train_idx])
        # transform returns a one-column frame; take that column by position.
        out_of_fold[val_idx] = fold_encoder.transform(X.iloc[val_idx][col]).iloc[:, 0].to_numpy()

    df_with_encoded = X.copy()
    df_with_encoded[encoded_name] = out_of_fold

    # The test rows are encoded with statistics from the full training set.
    full_encoder = ce.TargetEncoder()
    full_encoder.fit(X[col], y)
    test[encoded_name] = full_encoder.transform(test[col]).iloc[:, 0].to_numpy()

    df_with_encoded.drop(col, axis=1, inplace=True)
    test.drop(col, axis=1, inplace=True)

    return df_with_encoded, test

In [None]:
# simple_target_enc reads the notebook globals X, y and test and removes the
# encoded column from both frames, so this cell cannot be re-run for the same
# column without rebuilding X and test first.
X,test = simple_target_enc('product_instanceType')
X

## 