In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
import seaborn as sns

from scipy.stats import boxcox, probplot
import matplotlib.pyplot as plt

In [2]:
import dask.dataframe as dd

In [3]:
def mapper(name):
    return {'Unnamed: 0': f'index_{name}', 
            'buy_time': f'buy_time_{name}'}


features_df = dd.read_csv('features.csv',sep='\t').rename(columns=mapper('feat'))
train_df = dd.read_csv('data_train.csv').rename(columns=mapper('vas'))
test_df = dd.read_csv('data_test.csv').rename(columns=mapper('vas'))

In [5]:
train_merge = dd.merge(train_df, features_df, on=['id'], how='inner')

In [6]:
train_feat = train_merge.compute()

In [7]:
def buy_time_clear(df):
    df['time_delta'] = abs(df['buy_time_vas'] - df['buy_time_feat'])
    
    # Отсортируем наблюдения в порядке возрастания 'time_delta'
    df.sort_values(['time_delta'], inplace=True, ignore_index=True)
    
    # удалим по порядку все дубликаты, оставляя лишь те, которые встречаются в датафрейме первыми (keep='first')
    df.drop_duplicates(['index_vas'], keep='first', inplace=True)
    
    # Отсортируем датасет по возрастанию index_vas.
    df.sort_values(['index_vas'], inplace=True, ignore_index=True)
    
    return df    

In [8]:
buy_time_clear(train_feat)

Unnamed: 0,index_vas,id,vas_id,buy_time_vas,target,index_feat,buy_time_feat,0,1,2,...,244,245,246,247,248,249,250,251,252,time_delta
0,0,540968,8.0,1537131600,0.0,3756522,1541970000,-31.559971,327.360888,-45.500786,...,-613.770792,-20.996269,-37.630448,-28.747724,4.167111,7.305572,-12.175933,21.54386,0.0,4838400
1,1,1454121,4.0,1531688400,0.0,2735973,1531083600,547.270029,238.430888,533.329214,...,-613.770792,-25.996269,-19.630448,-278.747724,-24.832889,-0.694428,-11.175933,-0.45614,0.0,604800
2,2,2458816,1.0,1534107600,0.0,3586550,1543179600,-92.139971,-95.469112,-106.080786,...,-613.770792,-25.996269,-37.630448,-304.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,9072000
3,3,3535012,5.0,1535922000,0.0,1634974,1533502800,54.880029,12.970888,54.079214,...,-613.770792,-25.996269,-18.630448,-133.747724,-14.832889,-0.694428,-1.175933,-0.45614,0.0,2419200
4,4,1693214,1.0,1535922000,0.0,716515,1543179600,45.160029,295.240888,64.679214,...,-612.770792,-22.996269,-32.630448,-127.747724,-4.832889,-0.694428,-12.175933,-0.45614,0.0,7257600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831648,831648,3812226,2.0,1546203600,0.0,3598646,1532898000,29.750029,6.200888,24.279214,...,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,13305600
831649,831649,2480469,2.0,1546203600,0.0,1126545,1532293200,-89.179971,-56.499112,-103.120786,...,-321.770792,-25.996269,62.369552,1167.252276,41.167111,-0.694428,54.824067,-0.45614,0.0,13910400
831650,831650,158236,2.0,1546203600,0.0,3755060,1538946000,-96.799971,62.140888,-110.740786,...,-470.770792,-25.996269,-37.630448,99.252276,178.167111,-0.694428,191.824067,-0.45614,0.0,7257600
831651,831651,1825525,2.0,1546203600,0.0,253884,1533502800,-96.799971,-81.919112,-110.740786,...,1367.229208,-24.996269,-35.630448,-237.747724,-21.832889,-0.694428,-8.175933,-0.45614,1.0,12700800


In [9]:
train_feat.drop(['index_vas','index_feat', 'buy_time_feat'], axis=1).astype('float32').to_pickle('data_feat_train.pkl')

In [10]:
test_merge = dd.merge(test_df, features_df, on=['id'], how='inner')
test_feat = test_merge.compute()

In [11]:
buy_time_clear(test_feat)

Unnamed: 0,index_vas,id,vas_id,buy_time_vas,index_feat,buy_time_feat,0,1,2,3,...,244,245,246,247,248,249,250,251,252,time_delta
0,0,3130519,2.0,1548018000,1871362,1536526800,-62.899971,-374.279112,-72.600786,-418.406798,...,-613.770792,-25.996269,-37.630448,-258.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,11491200
1,1,2000860,4.0,1548018000,2024591,1532293200,-96.799971,100.290888,-110.740786,140.903202,...,-613.770792,-25.996269,-37.630448,-254.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,15724800
2,2,1099444,2.0,1546808400,4041332,1541365200,-81.969971,-390.729112,-95.910786,-443.336798,...,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0,5443200
3,3,1343255,5.0,1547413200,1945062,1536526800,259.130029,-52.249112,245.189214,-104.856798,...,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0,10886400
4,4,1277040,2.0,1546808400,3503668,1532293200,331.170029,590.890888,317.229214,538.283202,...,-612.770792,1.003731,-36.630448,38.252276,-12.832889,-0.694428,-12.175933,12.54386,0.0,14515200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71226,71226,2502453,5.0,1548018000,820229,1534712400,-96.799971,-408.179112,-110.740786,-460.786798,...,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,13305600
71227,71227,1693213,2.0,1548018000,3872196,1541970000,-89.689971,-355.809112,-103.630786,-408.416798,...,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,6048000
71228,71228,1891350,2.0,1548018000,3188331,1545598800,-96.799971,-281.059112,-110.740786,-333.666798,...,1927.229208,-25.996269,-32.630448,-33.747724,-18.832889,-0.694428,-10.175933,-0.45614,0.0,2419200
71229,71229,2437172,2.0,1548018000,1510984,1532293200,115.060029,-87.339112,101.119214,-127.236798,...,-613.770792,-25.996269,-37.630448,-280.747724,119.167111,-0.694428,132.824067,-0.45614,0.0,15724800


In [12]:
test_feat.drop(['index_vas','index_feat'], axis=1).astype('float32').to_pickle('data_feat_test.pkl')

In [11]:
df_train = df_train.sort_values(by="buy_time")
df_test = df_test.sort_values(by="buy_time")
df_features = df_features.compute().sort_values(by="buy_time")

train = pd.merge_asof(df_train, df_features, on='buy_time', by='id', direction='nearest')
valid = pd.merge_asof(df_test, df_features, on='buy_time', by='id', direction ='nearest')

KeyboardInterrupt: 

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_train['target'], test_size=0.3, random_state=0)

ValueError: Found input variables with inconsistent numbers of samples: [4512528, 831653]

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [None]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

NameError: name 'categorical_columns' is not defined