In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display
from sklearn.model_selection import train_test_split
from tensorflow.random import set_seed
set_seed(69)

import missingno as msno
%matplotlib inline

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from fancyimpute import SoftImpute, MatrixFactorization, BiScaler
# from fancyimpute import IterativeSVD, NuclearNormMinimization, SimilarityWeightedAveraging
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# # By default, a module has some hidden variables defined
# print({k: v for k, v in globals().items() if not k.startswith("__")})

In [3]:
''' 
data dim = [8, 8, 13, 8, 4, 11, 6]
train size = [765, 574, 371, 6064, 7077, 1187, 231]
test size = [309, 230, 151, 2457, 2870, 479, 92]
train / test ~= 2.5
'''
for i in range(1,8,1):
    temp0_df = pd.read_csv(f"data{i}/train.csv", header=None)
    globals()[f'train{i}'] = temp0_df.iloc[:,:-1]
    globals()[f'answer{i}'] = temp0_df.iloc[:,-1]
    temp1_df = pd.read_csv(f"data{i}/test.csv", header=None)
    globals()[f'test{i}'] = temp1_df.iloc[:,:-1]

In [4]:
for i in range(1,8,1):
    count_null = pd.DataFrame(globals()[f'train{i}'].isnull().sum()).T
    display(count_null)

Unnamed: 0,0,1,2,3,4,5,6,7
0,234,220,218,225,219,239,249,237


Unnamed: 0,0,1,2,3,4,5,6,7
0,167,161,195,179,151,163,167,173


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,110,124,103,103,105,118,124,122,118,105,104,110,102


Unnamed: 0,0,1,2,3,4,5,6,7
0,1755,1864,1834,1811,1764,1789,1850,1893


Unnamed: 0,0,1,2,3
0,2132,2149,2089,2114


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,367,395,362,357,351,327,342,367,346,366,321


Unnamed: 0,0,1,2,3,4,5
0,68,62,74,63,78,68


In [5]:
for i in range(1,8,1):
    count_unique = pd.DataFrame(globals()[f'train{i}'])
    display(count_unique.agg(['nunique']))

Unnamed: 0,0,1,2,3,4,5,6,7
nunique,216,141,104,151,89,204,224,14


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,12,12,7,4,2,4,4,6


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
nunique,259,23,63,2,74,239,204,226,9,51,40,189,252


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,4013,3960,3962,3982,4010,3999,3958,3900


Unnamed: 0,0,1,2,3
nunique,2302,611,2053,3241


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
nunique,90,127,78,76,121,52,131,98,82,81,55


Unnamed: 0,0,1,2,3,4,5
nunique,5,10,8,17,10,14


In [6]:
for i in range(1,8,1):
    count_null = pd.DataFrame(globals()[f'test{i}'].isnull().sum()).T
    display(count_null)

Unnamed: 0,0,1,2,3,4,5,6,7
0,100,91,93,90,86,92,96,95


Unnamed: 0,0,1,2,3,4,5,6,7
0,68,65,75,58,69,70,88,80


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,52,37,50,55,43,42,56,50,42,39,42,40,37


Unnamed: 0,0,1,2,3,4,5,6,7
0,735,731,707,695,764,744,740,732


Unnamed: 0,0,1,2,3
0,829,896,847,876


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,153,124,144,143,163,141,126,145,165,139,168


Unnamed: 0,0,1,2,3,4,5
0,27,34,25,29,27,29


In [7]:
for i in range(1,8,1):
    count_unique = pd.DataFrame(globals()[f'test{i}'])
    display(count_unique.agg(['nunique']))

Unnamed: 0,0,1,2,3,4,5,6,7
nunique,129,77,53,94,65,123,135,11


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,12,12,7,4,2,4,4,6


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
nunique,88,18,44,2,51,93,75,86,9,42,30,77,99


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,1453,1462,1471,1474,1415,1460,1448,1463


Unnamed: 0,0,1,2,3
nunique,1313,531,1171,1481


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
nunique,72,91,69,43,85,44,101,80,69,65,48


Unnamed: 0,0,1,2,3,4,5
nunique,5,9,8,14,10,14


In [29]:
def make_upload(imputer, model=LGBMRegressor(random_state=69), scaler=None, status=False, decimal=4, test_size=0.15, seed=69):
    tp_mean = 0.0
    for i in range(1,8,1):
        np_train, np_answer = np.array(globals()[f'train{i}']), np.array(globals()[f'answer{i}'])
        X_train, X_valid, y_train, y_valid = train_test_split(np_train, np_answer, test_size=test_size, random_state=seed)
        if scaler is True:
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.fit_transform(X_valid)

        ft_train = imputer.fit_transform(X_train).round(decimal)
        if "fancyimpute" in imputer.__module__:
            ft_valid = imputer.fit_transform(X_valid).round(decimal)
        else: ft_valid = imputer.transform(X_valid).round(decimal)

        model.fit(ft_train, y_train)
        output = model.predict(ft_valid)
        mae = mean_absolute_error(y_valid, output)
        tp_mean += mae
        print(mae)
        # print(mae, ft_valid.shape, end=' ')

        if status is True:
            X_test = np.array(globals()[f'test{i}'])
            if scaler is True: scaler.fit_transform(X_test)

            if "fancyimpute" in imputer.__module__:
                ft_test = imputer.fit_transform(X_test).round(decimal)
            else: ft_test = imputer.transform(X_test).round(decimal)
            # print(ft_test.shape, end='')
            predict = model.predict(ft_test)
            upload_array = np.concatenate((ft_test, predict[:, np.newaxis]), axis=1)
            upload = pd.DataFrame(upload_array).round(decimal)

            SAVE_FOLDER = 'upload_LGBM/{}'.format(str(imputer)[:3].upper())
            os.makedirs(SAVE_FOLDER, exist_ok=True)
            upload.to_csv(f'{SAVE_FOLDER}/{str(imputer)[:3]}_{str(model)[:4]}_upload{i}.csv', header=None, index=None)
        # print()
    print("TP_mean in valid set = {}".format(tp_mean))

In [34]:
make_upload(IterativeImputer(max_iter=2000, tol=1e-5, initial_strategy='median', random_state=69), status=True)

6.86453735472735
1.710290995225952
3.1232899318250955
0.16602472787793354
4.324718267395561
0.5098556503645446
6.835185952630213
TP_mean in valid set = 23.53390288004665


In [36]:
make_upload(KNNImputer(n_neighbors=11, weights="distance"), LGBMRegressor(random_state=69), status = True)

8.32017056629046
2.1882312842329825
3.027530944865036
0.1779470441758356
4.729862839270444
0.5043679983248613
6.432710711761037
TP_mean in valid set = 25.38082138892066


In [39]:
make_upload(SoftImpute(None, 1e-5, 10000, max_rank=10, init_fill_method='zero', verbose=False, seed=69), \
    scaler=BiScaler(scale_rows=False, scale_columns=False, verbose=False), status=True)

8.629261647091015
2.141303217626193
3.27185889052048
0.1687356339187711
4.710636937608187
0.5372014433679237
6.26508024499045
TP_mean in valid set = 25.72407801512302


In [42]:
make_upload(MatrixFactorization(learning_rate=1e-3, patience=20, l2_penalty=1e-4, min_improvement=1e-4, verbose=False), status=True)

8.183885353144138
3.00690934992091
3.9508526096789476
0.1724083943904739
5.111151687157075
0.5462071766867654
6.18077087873067
TP_mean in valid set = 27.152185449708977


In [23]:
# Convert to .txt file
for i in range(1,8,1):
    PATH = "GRAPE/uci/raw_data"
    os.makedirs(f'{PATH}/data{i}/data', exist_ok=True)
    data = globals()[f'train{i}'].copy()
    data.fillna(value = 0, inplace=True)
    data.to_csv(f'{PATH}/data{i}/data/data.txt', header=None, index=None, sep=' ', mode='a')

In [None]:
'''
opt : adam, sgd, rmsprop, adagrad; opt_scheduler : step, cos
'''
!python GRAPE/train_mdi.py --epochs 200 --opt_scheduler cos --opt_decay_step 50 --opt_decay_rate 0.9 --weight_decay 1e-5 \
 --valid 0.1 --save_model --save_prediction uci --data data1

In [None]:
!python GRAPE/train_y.py --epochs 200 --opt_scheduler cos --opt_decay_step 50 --opt_decay_rate 0.9 --weight_decay 1e-5 \
--valid 0.1 uci --data data1

In [27]:
# 檢視 Feature imputation 訓練數據
import pickle
fr = open('uci/test/data1/0/result.pkl', 'rb')
df = pickle.load(fr)

# print('Items of result.pkl: ', df.keys())
# print('\nargs: ', df['args'])
print('\noutputs: ', df['outputs'])
# print('\ncurves: ', df['curves'])
# print('\nlr: ', df['lr'])


outputs:  {'best_valid_rmse_pred_test': array([0.23472586, 0.3429135 , 0.49465045, ..., 0.48028755, 0.48039582,
       0.4804244 ], dtype=float32), 'best_valid_l1_pred_test': array([0.08135975, 0.3113982 , 0.68225396, ..., 0.58054477, 0.6309263 ,
       0.6309227 ], dtype=float32), 'final_pred_train': array([0.17951287, 0.0942234 , 0.3507589 , ..., 0.28867045, 0.20371333,
       0.6800189 ], dtype=float32), 'label_train': array([0.        , 0.        , 0.        , ..., 0.39130434, 0.        ,
       0.7671771 ], dtype=float32), 'final_pred_test': array([0.07680242, 0.29278088, 0.6853763 , ..., 0.5381242 , 0.627933  ,
       0.6432092 ], dtype=float32), 'label_test': array([0.       , 0.       , 0.       , ..., 0.7877193, 0.8798246,
       0.7650443], dtype=float32)}


In [36]:
print(df['outputs'].keys())
for key in df['outputs'].keys():
    print("{} ".format(df['outputs'][key].shape), end='')

dict_keys(['best_valid_rmse_pred_test', 'best_valid_l1_pred_test', 'final_pred_train', 'label_train', 'final_pred_test', 'label_test'])
(1547,) (1547,) (3396,) (3396,) (1547,) (1547,) 

In [None]:
# 檢視 Label prediction 訓練數據
fr = open('uci/test/data1/y0/result.pkl', 'rb')
df = pickle.load(fr)

# print('Items of result.pkl: ', df.keys())
# print('\nargs: ', df['args'])
print('\noutputs: ', df['outputs'])
# print('\ncurves: ', df['curves'])
# print('\nlr: ', df['lr'])

In [38]:
print(df['outputs'].keys())
for key in df['outputs'].keys():
    print("{} ".format(df['outputs'][key].shape), end='')

dict_keys(['pred_train', 'label_train', 'pred_test', 'label_test'])
(454,) (454,) (242,) (242,) 