In [None]:
import glob
import os
import random
import re
from math import cos, radians

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from tqdm import tqdm
# Quantile levels 0.1, 0.2, ..., 0.9 at which the per-tree predictions are summarised.
quantiles = [step / 10 for step in range(1, 10)]

In [None]:
# Raw training table and the submission template that will be filled in later.
train = pd.read_csv('../data/train/train.csv')
submission = pd.read_csv('../data/sample_submission.csv')

# The third '_'-separated field of each id is turned into a number by mapping
# 'h' -> '.' and dropping 'm' (presumably ids end in something like "4h30m",
# which becomes 4.3 -- TODO confirm against the sample submission).
slot_time = submission['id'].map(
    lambda row_id: row_id.split('_')[2].replace('h', '.').replace('m', '')
).astype(float)

# Rows at or before 4.3 and at or after 19.3 are remembered so their
# predictions can be forced to zero when the submission is written.
outside_window = (slot_time <= 4.3) | (slot_time >= 19.3)
zero_idx = submission.index[outside_window.to_numpy()]

In [None]:
def load(day_range=1):
    """Build flattened train/target/test matrices for a look-back of `day_range` days.

    Reads the module-level `train` frame and the test files
    '../data/test/0.csv' ... '../data/test/80.csv'.

    Parameters
    ----------
    day_range : int, default 1
        Number of consecutive days (ending at the current day) whose
        measurements form one feature row.

    Returns
    -------
    train_arr : np.ndarray
        One row per training day; each row holds the window's DHI, DNI, WS,
        RH, T and TARGET values, column after column.
    tar_arr : np.ndarray
        One row per training day: the TARGET values of the following day
        followed by those of the day after that.
    test_arr : np.ndarray
        One row per test file, built from the days ending at Day 6 with the
        same layout as `train_arr`.
    """
    feature_cols = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

    def flatten_by_column(frame):
        # All values of the first column, then all of the second, and so on.
        return np.concatenate([frame[name].values.reshape(-1) for name in feature_cols])

    train_list, tar_list = [], []
    # The row count is divided by 48 (presumably 48 rows per day -- TODO confirm);
    # the final two days are left out because they only serve as targets.
    stop_day = int(train.shape[0]/48-2)
    for day in tqdm(range(day_range-1, stop_day)):
        in_window = (train['Day'] > day-day_range) & (train['Day'] <= day)
        train_list.append(flatten_by_column(train.loc[in_window, feature_cols]))

        next_day = train.loc[train['Day']==day+1, 'TARGET'].values.tolist()
        day_after = train.loc[train['Day']==day+2, 'TARGET'].values.tolist()
        tar_list.append(next_day + day_after)

    # A single-element concatenate along axis 1 simply turns the list of rows
    # into one 2-D array.
    train_arr = np.concatenate([train_list], axis=1)
    tar_arr = np.concatenate([tar_list], axis=1)
    print(train_arr.shape, tar_arr.shape)

    test_list = []
    for i in tqdm(range(81)):
        file_path = '../data/test/' + str(i) + '.csv'
        test_frame = pd.read_csv(file_path)
        recent = (test_frame['Day'] > 6-day_range) & (test_frame['Day'] <= 6)
        test_list.append(flatten_by_column(test_frame.loc[recent, feature_cols]))
    test_arr = np.concatenate([test_list], axis=1)
    print(test_arr.shape)

    return train_arr, tar_arr, test_arr

In [None]:
# Trains one random forest per look-back length (3..7 days) and stores, for each,
# the nine quantile predictions derived from the spread of the individual trees.
# Requires `re`, `sklearn` and `RandomForestClassifier` from the notebook's import cell.

# Month number (1-12) of every calendar day, repeated for three years.
# NOTE(review): this presumes the training data covers exactly three non-leap
# years starting on 1 January -- confirm against the data description.
MONTH_LENGTHS = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
N_TRAIN_YEARS = 3
month_of_day = np.tile(np.repeat(np.arange(1, 13), MONTH_LENGTHS), N_TRAIN_YEARS)

# scikit-learn 1.0 renamed the 'mae' criterion to 'absolute_error' and 1.2
# removed the old spelling, so pick whichever the installed version accepts.
_sk_version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
MAE_CRITERION = (
    'absolute_error'
    if (int(_sk_version.group(1)), int(_sk_version.group(2))) >= (1, 0)
    else 'mae'
)

predictions = {}

for day_range in range(3, 8):
    train_arr, tar_arr, test_arr = load(day_range=day_range)

    #################### nunique > 1
    # Drop columns that are constant over the training rows; a plain boolean
    # ndarray is used as the mask for both matrices.
    varying_cols = (pd.DataFrame(train_arr).nunique() > 1).to_numpy()
    train_arr = train_arr[:, varying_cols]
    test_arr = test_arr[:, varying_cols]

    #################### Month Feature
    # Training rows start at day index `day_range-1` and the last two days are
    # only used as targets, hence the slice.
    days = month_of_day[day_range-1:-2]
    assert len(days) == train_arr.shape[0], (
        f'month labels ({len(days)}) do not match training rows ({train_arr.shape[0]})'
    )

    # The month is known for training rows but not for the test files, so a
    # classifier infers it from the weather features.
    m = RandomForestClassifier(n_estimators=1000,
                               max_depth=10,
                               random_state=0,  # reproducible month predictions
                               n_jobs=12)
    m.fit(train_arr, days)

    train_arr = np.concatenate([train_arr, days.reshape(-1, 1)], 1)
    test_arr = np.concatenate([test_arr, m.predict(test_arr).reshape(-1, 1)], 1)

    #################### Model fit
    rf = RandomForestRegressor(n_estimators=1000,
                               criterion=MAE_CRITERION,
                               max_features=10,
                               max_depth=10,
                               random_state=0,
                               n_jobs=12)
    rf.fit(train_arr, tar_arr)

    #################### filter importance features
    importance_features = rf.feature_importances_ > 0.001
    print(importance_features.sum())

    # Refit the same forest on the retained columns only.
    rf.fit(train_arr[:, importance_features], tar_arr)

    #################### inference
    # One flattened prediction vector per tree; the spread across trees is
    # what the quantiles are taken from.
    test_selected = test_arr[:, importance_features]
    rf_preds = np.array([
        estimator.predict(test_selected).reshape(-1)
        for estimator in rf.estimators_
    ])

    quantile_preds = [np.percentile(rf_preds, q * 100, axis=0) for q in quantiles]

    # Stored as a list of nine 1-D arrays (one per quantile level).
    predictions[day_range] = quantile_preds

    # Running sanity check: total predicted amount for every model so far.
    for i in range(3, day_range+1):
        print(np.sum(predictions[i]))

In [None]:
# Write the submission from the 7-day look-back model.
# `predictions[7]` is a list with one 1-D array per quantile level, so stacking
# gives (n_quantiles, n_rows); transpose to line up with the submission's
# (n_rows, n_quantiles) value columns.
# NOTE(review): assumes the submission rows follow the same file-by-file order in
# which the test predictions were flattened -- confirm against the sample file.
final_preds = np.array(predictions[7]).T
assert final_preds.shape == submission.iloc[:, 1:].shape, (
    f'prediction shape {final_preds.shape} does not match '
    f'submission value columns {submission.iloc[:, 1:].shape}'
)

submission.iloc[:, 1:] = np.clip(final_preds.round(2), 0, 100)
# Rows flagged in `zero_idx` are forced to zero for every quantile column.
submission.loc[zero_idx, 'q_0.1':] = 0
submission.to_csv('../submit/rf_model.csv', index=False)