In [1]:
import pathlib

# Input files are read from ./data, relative to the current working directory.
path = pathlib.Path('data')
data_path = path / 'data.csv'
sub_path = path / 'sample_submission.csv'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from copy import deepcopy
from tqdm import tqdm

In [2]:
# Read the full table and use `row_id` as the index, then preview the first rows.
data = pd.read_csv(data_path).set_index('row_id')
data.head(5)

Unnamed: 0_level_0,F_1_0,F_1_1,F_1_2,F_1_3,F_1_4,F_1_5,F_1_6,F_1_7,F_1_8,F_1_9,...,F_4_5,F_4_6,F_4_7,F_4_8,F_4_9,F_4_10,F_4_11,F_4_12,F_4_13,F_4_14
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.354591,-0.464038,2.304115,0.734486,1.696395,0.136285,-0.518344,0.50264,-1.852504,-0.500665,...,3.744152,0.794438,0.265185,-0.561809,0.19648,0.373434,6.206995,3.809505,1.236486,1.182055
1,1.38094,-0.499626,-0.418548,1.911725,-0.82613,-1.715371,-0.577091,-1.041486,0.596067,-0.363425,...,-2.895826,-0.738275,2.361818,-0.060753,0.727249,-0.271882,5.232157,-4.218259,-2.724883,-0.063775
2,0.256023,-1.059874,,0.345678,1.513814,1.243864,-0.509648,-0.800481,-0.115945,0.595777,...,2.252834,0.472496,2.491386,0.353381,-0.260682,-0.000833,-0.116457,-2.131747,3.661499,-0.131576
3,-0.72842,-2.432399,-2.453602,-0.020509,0.333397,0.086049,-1.787601,0.667011,0.761564,-2.217847,...,2.0046,-4.664806,-0.847211,-0.264249,0.664334,-0.557868,8.499483,-4.738799,-3.054611,0.494152
4,0.590212,-0.066127,0.468009,-1.096038,0.119399,-1.80971,0.466358,-0.053196,-0.58032,-1.1435,...,0.976937,2.558883,3.377724,0.846891,0.696032,0.554121,-5.979714,-2.869631,3.733057,-0.722943


## functions

In [3]:
class DataFrameImputer(SimpleImputer):
    """SimpleImputer whose ``transform`` returns a DataFrame instead of an array.

    The row index of the input is preserved, and so are the column labels
    whenever the imputer keeps the column count unchanged.
    """

    def transform(self, X, y=None):
        """Impute ``X`` and wrap the result in a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to impute; its ``index`` and ``columns`` are reused.
        y : ignored
            Accepted only so the signature matches the original one.

        Returns
        -------
        pandas.DataFrame
            Imputed values indexed like ``X``.
        """
        values = super().transform(X)
        columns = X.columns
        if values.shape[1] != len(columns):
            # SimpleImputer can change the number of columns (features that were
            # entirely missing at fit time are dropped; add_indicator=True appends
            # mask columns). Reusing X.columns would then fail with a shape
            # mismatch, so ask the imputer for the output names instead.
            columns = self.get_feature_names_out(X.columns)
        return pd.DataFrame(values, index=X.index, columns=columns)

In [4]:
class MatrixFactorImputer:
    """Reconstruct a table through a matrix-factorization estimator.

    The wrapped estimator is expected to provide ``fit``, ``transform`` and
    ``inverse_transform`` (and ``explained_variance_ratio_`` for ``explain``).
    """

    def __init__(self, cl, n_comp, **kwargs):
        """Instantiate the estimator as ``cl(n_comp, **kwargs)``.

        cl     -- estimator class (the commented-out cells use PCA and TruncatedSVD)
        n_comp -- first positional argument passed to ``cl``
        kwargs -- forwarded unchanged to ``cl``
        """
        self.estimator = cl(n_comp, **kwargs)

    def fit(self, X, y=None):
        """Fit the wrapped estimator on ``X`` (``y`` is ignored).

        Returns whatever ``estimator.fit`` returns, not this wrapper.
        """
        return self.estimator.fit(X)

    def predict(self, X):
        """Project ``X`` onto the components and back.

        Returns a DataFrame with the same index and columns as ``X``.
        """
        transformed = self.estimator.transform(X)
        restored = self.estimator.inverse_transform(transformed)
        return pd.DataFrame(restored, index=X.index, columns=X.columns)

    @staticmethod
    def validate(X, inversed):
        """Print the mean per-column RMSE between ``X`` and ``inversed``.

        Only cells that are observed (non-NaN) in ``X`` are compared. Columns
        without any observed value are skipped because there is nothing to
        score. Prints ``nan`` when no column could be scored.
        """
        rmse = []
        for col in tqdm(X.columns):
            observed = X[col].notna()
            if not observed.any():
                # an empty comparison would make the metric fail
                continue

            y_true = X.loc[observed, col]
            y_pred = inversed.loc[observed, col]
            mse = mean_squared_error(y_true, y_pred)
            rmse.append(np.sqrt(mse))
        mean_rmse = np.mean(rmse) if rmse else float('nan')
        print(f'\n{mean_rmse}')

    def explain(self):
        """Plot the cumulative explained variance ratio of the fitted estimator."""
        cumulative = self.estimator.explained_variance_ratio_.cumsum()
        # Take the x-axis from the fitted vector itself: the estimator's
        # `n_components` parameter is whatever was passed to the constructor and
        # need not be the integer number of fitted components.
        plt.plot(np.arange(1, len(cumulative) + 1), cumulative)
        plt.title('Explained cumulative variance')
        plt.xlabel('number of component')
        plt.ylabel('explained cumulative variance ratio')
        plt.ylim([0, 1.1])
        plt.show()

In [5]:
# One col = One model approach
def fit_models(pipe, df, test_size=0.3, random_state=23):
    """Fit one model per column of ``df`` that contains missing values.

    For every such column the rows where it is observed form the training set
    (target = that column, features = all other columns). A hold-out split is
    used to measure RMSE, then a second model is fitted on all observed rows.

    Parameters
    ----------
    pipe : estimator with ``fit``/``predict``
        Template model. It is never fitted itself; independent deep copies are
        fitted instead, so no state carries over between columns or between
        the validation fit and the final fit, and the caller's object is left
        untouched.
    df : pandas.DataFrame
        Table with missing values.
    test_size : float, default 0.3
        Hold-out share passed to ``train_test_split``.
    random_state : int, default 23
        Seed passed to ``train_test_split``.

    Returns
    -------
    (models, rmse) : tuple of lists
        ``models[i]`` is the final model and ``rmse[i]`` the hold-out RMSE for
        the i-th column of ``df.columns[df.isna().any()]``.
    """
    models = []
    rmse = []
    na_cols = df.columns[df.isna().any()]
    for col in tqdm(na_cols):
        # rows where the target column is known
        observed = df[col].notna()

        # train/valid split
        train = df[observed].drop(col, axis=1)
        target = df.loc[observed, col]
        X_train, X_valid, y_train, y_valid = train_test_split(
            train, target, test_size=test_size, random_state=random_state)

        # fit a throw-away copy for validation
        valid_model = deepcopy(pipe)
        valid_model.fit(X_train, y_train)
        pred = valid_model.predict(X_valid)
        rmse.append(np.sqrt(mean_squared_error(y_valid, pred)))

        # fit the final model on every observed row
        final_model = deepcopy(pipe)
        final_model.fit(train, target)
        models.append(final_model)
    # mean RMSE (nan when no column has missing values)
    mean_rmse = np.mean(rmse) if rmse else float('nan')
    print(f'\n{mean_rmse}')
    return models, rmse


def collect_predicts(models, df):
    """Fill the missing cells of ``df`` with model predictions.

    Parameters
    ----------
    models : sequence of fitted models
        One model per column of ``df`` that contains NaN, in the order of
        ``df.columns[df.isna().any()]``.
    df : pandas.DataFrame
        Table with missing values; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing cells replaced by predictions.

    Raises
    ------
    ValueError
        If the number of models differs from the number of columns with NaN.
    """
    predict = df.copy()
    na_cols = df.columns[df.isna().any()]
    # Models are matched to columns purely by position, so a length mismatch
    # means at least one column would get the wrong model or none at all.
    if len(models) != len(na_cols):
        raise ValueError(
            f'expected {len(na_cols)} models (one per column with NaN), got {len(models)}'
        )
    # fill
    for model, col in tqdm(zip(models, na_cols), total=len(na_cols)):
        # rows to predict: the target is missing in the ORIGINAL frame
        nans = df[col].isna()
        # Features come from the progressively filled copy, so columns filled
        # earlier in this loop feed the predictions for later columns.
        # NOTE(review): confirm this is intended rather than reading from `df`.
        values = model.predict(predict[nans].drop(col, axis=1))
        predict.loc[nans, col] = values
    return predict


## matrix factorization

In [18]:
# # kaggle 1.35
# pca = MatrixFactorImputer(PCA, 3, random_state=7)
# pca.fit(data.fillna(0));
# inv = pca.predict(data.fillna(0))
# pca.validate(data, inv)     # not a valid validation: it scores the same observed cells the model was fitted on

In [19]:
# # kaggle 1.42
# pca = MatrixFactorImputer(TruncatedSVD, 40, random_state=7)
# pca.fit(data.fillna(0));
# inv = pca.predict(data.fillna(0))
# pca.validate(data, inv)     # not a valid validation: it scores the same observed cells the model was fitted on

## one model per column

In [7]:
# Step 1: mean-impute the feature columns, keeping the DataFrame structure.
imputer_step = DataFrameImputer(strategy='mean')

# Step 2: the regressor fitted for each target column.
regressor_step = LGBMRegressor(
    random_state=7,
    n_jobs=-1,
    n_estimators=200,
    # max_depth=4,
    num_leaves=11,
)

# Alternative regressors (disabled):
# SGDRegressor(random_state=7,
#              learning_rate='adaptive',
#              eta0=0.001)
# RandomForestRegressor(random_state=7,
#                       n_estimators=10,
#                       max_depth=5,
#                       min_samples_leaf=1
#                       )
# XGBRegressor(random_state=7, objective='reg:squarederror',
#              max_depth=3,
#              learning_rate=0.1,
#              n_estimators=30
#              )

pipeline = make_pipeline(imputer_step, regressor_step)

In [8]:
# Fit one model per column with missing values (prints the mean hold-out RMSE) ...
models, rmse = fit_models(pipe=pipeline, df=data)
# ... then use those models to fill the missing cells of a copy of `data`.
predicted = collect_predicts(models=models, df=data)

100%|██████████| 55/55 [26:38<00:00, 29.07s/it]



0.9885745361876273


100%|██████████| 55/55 [00:04<00:00, 11.54it/s]


## predict on the fly

In [None]:
# %%time
# rmse = []
# predicted = data.copy()

# # sort the columns containing NaN by their NaN count; columns with equal counts keep their original order.
# not_na_cols = predicted.columns[~predicted.isna().any()]
# sorted_na_cols = predicted.drop(not_na_cols, axis=1).isna().sum().reset_index().reset_index().sort_values(by=[0, 'level_0'])['index'].values

# for col in tqdm(sorted_na_cols):
#     # extract test markers
#     nans = predicted[col].isna()

#     # train/valid split
#     train = predicted[~nans].drop(col, axis=1)
#     target = predicted.loc[~nans, col]
#     X_train, X_valid, y_train, y_valid = train_test_split(
#         train, target, test_size=0.2, random_state=23)

#     X_test = predicted[nans].drop(col, axis=1)
#     y_test = predicted.loc[nans, col]

#     # fit model
#     pipeline.fit(X_train, y_train)

#     pred = pipeline.predict(X_valid)
#     models.append(deepcopy(pipeline))
#     rmse.append(np.sqrt(mean_squared_error(y_valid, pred)))
#     # OnFly prediction
#     pred = pipeline.predict(X_test)
#     predicted.loc[nans, col] = pred

# # mean RMSE
# print(f'\n{np.mean(rmse)}')

## submission

In [9]:
# check that there are no NaN; fail fast so an incomplete table is never written to the submission
has_nan = predicted.isna().any().any()
assert not has_nan, 'predicted still contains NaN values'
has_nan

False

In [10]:
# collect predictions
sub = pd.read_csv(sub_path)

# Each 'row-col' key is '<row_id>-<column name>'. Split on the first '-' only,
# so a column name containing '-' would stay intact.
keys = sub['row-col'].str.split('-', n=1, expand=True)

# Vectorized lookup: translate labels to positions once, then index the value
# matrix, instead of one `.loc` call per submission row.
row_pos = predicted.index.get_indexer(keys[0].astype(int))
col_pos = predicted.columns.get_indexer(keys[1])
# get_indexer marks unknown labels with -1, which would silently pick the last
# row/column; raise KeyError as a label-based `.loc` lookup would.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('sample submission references a row_id or column missing from `predicted`')

predict = pd.Series(predicted.to_numpy()[row_pos, col_pos], index=sub.index)
sub['value'] = predict

In [11]:
# Write the submission without the positional index, then preview the first rows.
sub.to_csv(path_or_buf='baseline_submission.csv', index=False)
sub.head(5)

Unnamed: 0,row-col,value
0,0-F_1_14,-0.002117
1,0-F_3_23,0.066544
2,1-F_3_24,0.007324
3,2-F_1_2,0.010829
4,2-F_4_2,0.431316


In [12]:
#