In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor, Pool
import pickle
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import warnings
import gc
import math
from sklearn.cluster import KMeans
from collections import Counter
import pickle

In [2]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    pickle.load can execute arbitrary code embedded in the file, so this must
    only ever be pointed at trusted model artifacts.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# First model pair: the final regressor, plus a cluster model stored together
# with the list of cluster labels that the feature loop iterates over.
main_model = _unpickle('models/main_model.pkl')
cluster_model, cluster_classes = _unpickle('models/cluster_model.pkl')

# Second ("adopt") model pair, stored in the same layout.
adopt_main_model = _unpickle('models/adopt_main_model.pkl')
adopt_cluster_model, adopt_cluster_classes = _unpickle('models/adopt_cluster_model.pkl')

# Number of the test file to score; it is interpolated into the input and
# output paths further down. The name shadows the builtin id(), but the later
# cells read exactly this name, so it is kept.
id = 7

In [3]:
# Read the tab-separated test file selected by `id`.
stock_codes = {'instock': 1, 'outofstock': 0}
test_df = pd.read_csv(f'inference_data/data/test{id}.csv', sep='\t')

# Stock status: case-insensitive text -> 1 (in stock) / 0 (out of stock), stored as int8.
lowered_status = test_df['StockStatus'].str.lower()
test_df['StockStatus'] = lowered_status.replace(stock_codes).astype(np.int8)

# Observation timestamps become real datetimes so the .dt accessor works later.
test_df['DateObserve'] = pd.to_datetime(test_df['DateObserve'])

# Replace the raw product ids by dense integer codes; `a` keeps the fitted encoder.
a = LabelEncoder()
test_df['WebPriceId'] = a.fit_transform(test_df['WebPriceId'])

In [4]:
# Derive calendar counters from the observation date, then drop the helpers.
#   year      -> years elapsed since the earliest year in the file
#   num_month -> year * 12 + month (running month counter)
#   num_day   -> year * 366 + month * 31 + day (coarse running day counter)
stamp = test_df['DateObserve'].dt
test_df = (
    test_df
    .assign(month=stamp.month, year=stamp.year - stamp.year.min(), day=stamp.day)
    .assign(
        num_month=lambda d: d['year'] * 12 + d['month'],
        num_day=lambda d: d['year'] * 366 + d['month'] * 31 + d['day'],
    )
    .drop(columns=['DateObserve', 'month', 'year'])
)

# Keep only the most recent eight values of the month counter (max-7 .. max).
recent_cutoff = test_df['num_month'].max() - 7
test_df = test_df[test_df['num_month'] >= recent_cutoff]

# Keep products that were in stock in at least two distinct months of that window.
in_stock_pairs = test_df.loc[test_df['StockStatus'] == 1, ['WebPriceId', 'num_month']].drop_duplicates()
tmp_df = in_stock_pairs.groupby('WebPriceId').count()
goods = tmp_df[tmp_df['num_month'] >= 2].index
test_df = test_df[test_df['WebPriceId'].isin(goods)]

# Order rows by day of month so that a per-(month, product) 'last' aggregation
# picks the latest observation within each month.
test_df = test_df.sort_values('day').reset_index(drop=True)

In [5]:
# Build the feature vector for main_model: assign every product to a cluster
# from its recent month-over-month price ratios, then summarise the latest
# month's price change inside each cluster.
features = []

latest_month = test_df['num_month'].max()

# Last observed price per (month, product) for the latest month and the three
# before it. test_df was already restricted to eligible products when it was
# prepared, so no extra product filter is applied here. (The previous version
# built a filter from test_df's row labels and matched it against WebPriceId
# values, which compared unrelated numbers.)
cur_main_df = test_df[test_df['num_month'] >= latest_month - 3]

grouped = cur_main_df.groupby(['num_month', 'WebPriceId']).agg({'CurrentPrice': 'last'}).unstack()
# Fill months without an observation from the neighbouring months.
grouped = grouped.ffill().bfill()
grouped.columns = grouped.columns.droplevel(0)

# Month-over-month price ratio (1.0 == unchanged).
# NOTE(review): exp(log(x)) leaves positive ratios unchanged up to rounding; it
# is kept so the numbers do not shift -- confirm whether it can be dropped.
month_change = np.exp(np.log((grouped.pct_change() + 1)))
month_change = month_change[month_change.index != latest_month - 3]

# One feature row per product that passes the filters, with the product id
# recorded alongside it. Keeping ids and rows in lockstep is what makes the
# cluster labels below line up with the right products; indexing the labels
# against the full column list mislabels products as soon as one is skipped.
# NOTE(review): confirm the training pipeline pairs ids and labels the same way.
clustered_ids = []
cluster_features = []
for web_price_id, ratios in month_change.items():
    ratios = ratios.to_numpy()
    # Skip products with an extreme (or missing) ratio in any month.
    if not (ratios.max() < 2.5 and ratios.min() > 0.4):
        continue
    # Only months in which the price actually moved feed the statistics.
    moves = [x for x in ratios if x != 1]
    if len(moves) == 0:
        continue
    clustered_ids.append(web_price_id)
    cluster_features.append([np.mean(moves), np.median(moves), np.std(moves), len(moves)])

preds = cluster_model.predict(cluster_features)

for j in cluster_classes:
    # Products assigned to cluster j.
    good_cols = [pid for pid, label in zip(clustered_ids, preds) if label == j]
    cur_df = test_df[test_df['WebPriceId'].isin(good_cols)]

    # Keep products observed in at least two distinct months ...
    months_seen = cur_df[['WebPriceId', 'num_month']].drop_duplicates().groupby('WebPriceId').count()
    goods = months_seen[months_seen['num_month'] >= 2].index
    cur_df = cur_df[cur_df['WebPriceId'].isin(goods)]

    # ... and observed in the latest month.
    in_latest = cur_df.loc[cur_df['num_month'] == latest_month, 'WebPriceId'].unique()
    cur_df = cur_df[cur_df['WebPriceId'].isin(in_latest)]

    cluster_prices = cur_df.groupby(['num_month', 'WebPriceId']).agg({'CurrentPrice': 'last'}).unstack()
    cluster_prices = cluster_prices.ffill().bfill()
    cluster_prices.columns = cluster_prices.columns.droplevel(0)

    # Price ratio of the latest month against the preceding row.
    latest_change = np.exp(np.log((cluster_prices.pct_change() + 1)))
    latest_change = latest_change[latest_change.index == latest_month]

    values = latest_change.dropna(axis=1).values[0]
    sort_values = [x for x in values if 0.4 < x < 2.5]

    # Three numbers per cluster: a mean that down-weights jumps above 1.6x,
    # the plain mean (both expressed as change relative to 1), and the std.
    features.append([np.average(sort_values, weights=[0.2 if x > 1.6 else 1 for x in sort_values]) - 1])
    features.append([np.average(sort_values) - 1, np.std(sort_values)])

features = np.concatenate(features)

pred = main_model.predict([features])[0]

with open(f'inference_data/submits/test_{id}.txt', 'w') as f:
    f.write(str(pred))

In [6]:
# Build the feature vector for adopt_main_model: assign every product to a
# cluster (using adopt_cluster_model) from its recent month-over-month price
# ratios, then summarise the latest month's price change inside each cluster.
features = []

latest_month = test_df['num_month'].max()

# Last observed price per (month, product) for the latest month and the three
# before it. test_df was already restricted to eligible products when it was
# prepared, so no extra product filter is applied here. (The previous version
# built a filter from test_df's row labels and matched it against WebPriceId
# values, which compared unrelated numbers.)
cur_main_df = test_df[test_df['num_month'] >= latest_month - 3]

grouped = cur_main_df.groupby(['num_month', 'WebPriceId']).agg({'CurrentPrice': 'last'}).unstack()
# Fill months without an observation from the neighbouring months.
grouped = grouped.ffill().bfill()
grouped.columns = grouped.columns.droplevel(0)

# Month-over-month price ratio (1.0 == unchanged).
# NOTE(review): exp(log(x)) leaves positive ratios unchanged up to rounding; it
# is kept so the numbers do not shift -- confirm whether it can be dropped.
month_change = np.exp(np.log((grouped.pct_change() + 1)))
month_change = month_change[month_change.index != latest_month - 3]

# One feature row per product that passes the filters, with the product id
# recorded alongside it. Keeping ids and rows in lockstep is what makes the
# cluster labels below line up with the right products; indexing the labels
# against the full column list mislabels products as soon as one is skipped.
# NOTE(review): confirm the training pipeline pairs ids and labels the same way.
clustered_ids = []
cluster_features = []
for web_price_id, ratios in month_change.items():
    ratios = ratios.to_numpy()
    # Skip products with an extreme (or missing) ratio in any month.
    if not (ratios.max() < 2.5 and ratios.min() > 0.4):
        continue
    # Only months in which the price actually moved feed the statistics.
    moves = [x for x in ratios if x != 1]
    if len(moves) == 0:
        continue
    clustered_ids.append(web_price_id)
    cluster_features.append([np.mean(moves), np.median(moves), np.std(moves), len(moves)])

preds = adopt_cluster_model.predict(cluster_features)

for j in adopt_cluster_classes:
    # Products assigned to cluster j.
    good_cols = [pid for pid, label in zip(clustered_ids, preds) if label == j]
    cur_df = test_df[test_df['WebPriceId'].isin(good_cols)]

    # Keep products observed in at least two distinct months ...
    months_seen = cur_df[['WebPriceId', 'num_month']].drop_duplicates().groupby('WebPriceId').count()
    goods = months_seen[months_seen['num_month'] >= 2].index
    cur_df = cur_df[cur_df['WebPriceId'].isin(goods)]

    # ... and observed in the latest month.
    in_latest = cur_df.loc[cur_df['num_month'] == latest_month, 'WebPriceId'].unique()
    cur_df = cur_df[cur_df['WebPriceId'].isin(in_latest)]

    cluster_prices = cur_df.groupby(['num_month', 'WebPriceId']).agg({'CurrentPrice': 'last'}).unstack()
    cluster_prices = cluster_prices.ffill().bfill()
    cluster_prices.columns = cluster_prices.columns.droplevel(0)

    # Price ratio of the latest month against the preceding row.
    latest_change = np.exp(np.log((cluster_prices.pct_change() + 1)))
    latest_change = latest_change[latest_change.index == latest_month]

    values = latest_change.dropna(axis=1).values[0]
    sort_values = [x for x in values if 0.4 < x < 2.5]

    # Three numbers per cluster: a mean that down-weights jumps above 1.6x,
    # the plain mean (both expressed as change relative to 1), and the std.
    features.append([np.average(sort_values, weights=[0.2 if x > 1.6 else 1 for x in sort_values]) - 1])
    features.append([np.average(sort_values) - 1, np.std(sort_values)])

features = np.concatenate(features)

pred = adopt_main_model.predict([features])[0]

with open(f'inference_data/submits/test_adp_{id}.txt', 'w') as f:
    f.write(str(pred))