In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas(desc='Progress')
import matplotlib.pylab as plt
%matplotlib inline
import gc

In [2]:
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')
print(df_train.shape, df_test.shape)

(200000, 202) (200000, 201)


In [None]:
def add_p_n(df, drop_list):
    """Add per-row counts of distinct positive and distinct negative values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; it is modified in place and also returned.
    drop_list : list
        Columns excluded from the count (identifier / label columns).

    Returns
    -------
    pandas.DataFrame
        ``df`` with two new float columns: ``positive`` (number of distinct
        values > 0 in the row) and ``negative`` (number of distinct values < 0).
    """
    values = df.drop(drop_list, axis=1).values
    n_rows = values.shape[0]

    # Size the outputs from the frame that was passed in; the previous version
    # referenced an undefined global `train` here.
    positives = np.zeros(n_rows)
    negatives = np.zeros(n_rows)

    for i in range(n_rows):
        row = values[i]
        # Count strictly positive / strictly negative distinct values directly.
        # The old `len(set(...)) - 1` trick was off by one whenever a row had
        # no zero/opposite-sign entry to act as the placeholder.
        positives[i] = np.unique(row[row > 0]).size
        negatives[i] = np.unique(row[row < 0]).size

    df['positive'] = positives
    df['negative'] = negatives

    return df

In [None]:
df_train = add_p_n(df_train, ['ID_code', 'target'])
df_train.head()

In [None]:
df_test = add_p_n(df_test, ['ID_code'])
df_test.head()

In [3]:
# Row positions (into test.csv) of the public-LB, private-LB and synthetic samples.
private_lb = pd.read_csv("../input/Private_LB.csv")
public_lb = pd.read_csv("../input/Public_LB.csv")
synthetic = pd.read_csv("../input/synthetic_samples_indexes.csv")

# Give all three files the same column name so they can be used interchangeably.
private_lb = private_lb.rename(index=str, columns={"Private_LB": "index"})
public_lb = public_lb.rename(index=str, columns={"Public_LB": "index"})
synthetic = synthetic.rename(index=str, columns={"synthetic_samples_indexes": "index"})

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
true = pd.concat([public_lb, private_lb], ignore_index=True)

# Take independent copies: later cells add columns to these frames, which
# would otherwise write into a slice of df_test (SettingWithCopyWarning).
test_true = df_test.iloc[true["index"], :].copy()
test_synthetic = df_test.iloc[synthetic["index"], :].copy()

In [None]:
test_true['target'] = np.ones(test_true.shape[0])
test_true.head()

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
y_train = df_train['target']

train0 = df_train[y_train.values==0].copy()
train1 = df_train[y_train.values==1].copy()

# Get statistics

In [None]:
# Rows of `sta`: (std, mean) for all rows, then for target == 0, then for target == 1.
sta = np.zeros((6, 200))
for block, frame in enumerate((df_train, train0, train1)):
    for i in range(200):
        column = frame['var_' + str(i)]
        sta[2 * block, i] = np.std(column)
        sta[2 * block + 1, i] = np.mean(column)

In [None]:
statistic = pd.DataFrame(sta, columns=['var_'+str(i) for i in range(200)])

In [None]:
statistic['item'] = ['std_all', 'mean_all', 'std_0', 'mean_0', 'std_1', 'mean_1']

In [None]:
statistic = statistic.set_index('item')

In [None]:
# Keep the index when saving: it holds the 'item' labels (std_all, mean_all, ...)
# set above, and index=False would write six unlabeled rows.
statistic.to_csv('../input/statistics.csv')

In [None]:
statistic.head()

In [None]:
print(statistic[['var_150', 'var_153', 'var_158']])

# ECDF

In [None]:
N_FEATURES = 200

def ecdf(s):
    """Empirical CDF of a Series.

    Returns a Series indexed by the sorted distinct values of ``s`` whose
    entries are the fraction of ``len(s)`` that is <= that value.
    """
    counts_by_value = s.value_counts().sort_index()
    running_total = counts_by_value.cumsum()
    return running_total / len(s)

def optimal_fd_bins(s):
    """
    Number of histogram bins from the Freedman-Diaconis rule of thumb:
    https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule

    Bin width is ``2 * IQR / n ** (1/3)``; the bin count is the data range
    divided by that width, truncated to an int.

    Returns at least 1. When the rule is undefined (zero interquartile range,
    zero data range, or an empty series) a single bin is returned instead of
    raising.
    """
    # Computing the interquartile range:
    # https://en.wikipedia.org/wiki/Interquartile_range
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    # Exact cube root of the sample size (the rule uses n ** (1/3), not n ** 0.33).
    width = 2 * iqr / np.cbrt(len(s))
    data_range = s.max() - s.min()
    # `not x > 0` is also true for NaN, which covers the empty-series case.
    if not width > 0 or not data_range > 0:
        return 1
    return max(1, int(data_range / width))

In [None]:
for i in range(N_FEATURES):
    col = 'var_' + str(i)
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Split the column by class once and reuse it for both panels.
    by_class = [(str(t), df_train.loc[df_train.target == t, col]) for t in (0, 1)]

    # ECDF
    for class_label, class_values in by_class:
        ecdf(class_values).plot(ax=ax[0], label=class_label)
    ax[0].set_title(f"ECDF for {col}")
    ax[0].legend()

    # Histogram (bin count chosen on the full column so both classes share it)
    bins = optimal_fd_bins(df_train[col])
    for class_label, class_values in by_class:
        class_values.plot(kind="hist", bins=bins, ax=ax[1], label=class_label)
    ax[1].set_title(f"Freedman–Diaconis histogram for {col}")
    ax[1].legend()

    plt.show()
    # Close (not just clear) the figure so 200 iterations do not pile up
    # open figures in pyplot's registry.
    plt.close(fig)

# Comparing with normal distribution

In [None]:
def plot_unique(train_df):
    """Compare each feature's value-count pattern with a Gaussian simulation.

    For every ``var_0`` .. ``var_199`` three scatter panels are drawn, each
    plotting, per distinct value, its occurrence count against the mean of
    ``target``: the real column, a simulated normal column with the same
    mean/std (rounded to 4 decimals), and both overlaid.

    Side effect: a ``random_<var>`` column holding the simulated values is
    added to (or overwritten in) ``train_df`` for every feature.
    """
    for var in ['var_{}'.format(x) for x in range(0, 200)]:
        random_var = 'random_{}'.format(var)
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 4))

        train_df[random_var] = np.random.normal(
            train_df[var].mean(), train_df[var].std(), train_df.shape[0]).round(4)

        # Aggregate once per column and reuse for the single and combined panels.
        real_stats = train_df.groupby(var)['target'].agg(['count', 'mean']).sort_values('count')
        simulated_stats = train_df.groupby(random_var)['target'].agg(['count', 'mean']).sort_values('count')

        real_stats.plot(kind='scatter', x='mean', y='count', ax=ax1, alpha=0.1, title='Train Data')
        simulated_stats.plot(kind='scatter', x='mean', y='count', ax=ax2, alpha=0.1, title='Simulated Data')
        # Both together
        real_stats.plot(kind='scatter', x='mean', y='count', ax=ax3, alpha=0.1)
        simulated_stats.plot(kind='scatter', x='mean', y='count', ax=ax3, alpha=0.1,
                             color='orange', title='Both')

        for axis in (ax1, ax2, ax3):
            axis.set_xlabel('average target')
            axis.set_ylabel('count of unique value')
        fig.suptitle(var)
        plt.show()
        # 200 figures are produced; release each one once it has been shown.
        plt.close(fig)

In [None]:
plot_unique(train0)

In [None]:
plot_unique(train1)

In [None]:
heavy_wired = [12, 108, 126]

In [None]:
def plot_unique_feature(train_df, var):
    """Single-feature version of the count-vs-mean-target comparison.

    Draws three scatter panels for ``var`` (occurrence count of each distinct
    value against the mean of ``target``): the real column, a simulated normal
    column with the same mean/std rounded to 4 decimals, and both overlaid.

    Side effect: stores the simulated values in ``train_df['random_<var>']``;
    later cells group by that column, so it is intentionally left in place.
    """
    random_var = 'random_{}'.format(var)
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 4))

    train_df[random_var] = np.random.normal(
        train_df[var].mean(), train_df[var].std(), train_df.shape[0]).round(4)

    # Aggregate once per column instead of once per panel.
    real_stats = train_df.groupby(var)['target'].agg(['count', 'mean']).sort_values('count')
    simulated_stats = train_df.groupby(random_var)['target'].agg(['count', 'mean']).sort_values('count')

    real_stats.plot(kind='scatter', x='mean', y='count', ax=ax1, alpha=0.1, title='Train Data')
    simulated_stats.plot(kind='scatter', x='mean', y='count', ax=ax2, alpha=0.1, title='Simulated Data')
    # Both together
    real_stats.plot(kind='scatter', x='mean', y='count', ax=ax3, alpha=0.1)
    simulated_stats.plot(kind='scatter', x='mean', y='count', ax=ax3, alpha=0.1,
                         color='orange', title='Both')

    for axis in (ax1, ax2, ax3):
        axis.set_xlabel('average target')
        axis.set_ylabel('count of unique value')
    fig.suptitle(var)
    plt.show()

In [None]:
# Standardise var_12 in place (zero mean, unit sample std).
# Not idempotent: re-running this cell rescales the already-scaled column.
var_12_raw = df_train['var_12']
mean, std = var_12_raw.mean(), var_12_raw.std()
df_train['var_12'] = (var_12_raw - mean) / std

In [None]:
plot_unique_feature(df_train, 'var_12')

In [None]:
plot_unique_feature(test_true, 'var_12')

In [None]:
max_12 = np.max(df_train['var_12'])
max_gaussian_12 = np.max(np.random.normal(df_train['var_12'].mean(), \
                                          df_train['var_12'].std(), df_train.shape[0]).round(4))

print(max_12)
print(max_gaussian_12)

min_12 = np.min(df_train['var_12'])
min_gaussian_12 = np.min(np.random.normal(df_train['var_12'].mean(), \
                                          df_train['var_12'].std(), df_train.shape[0]).round(4))

print(min_12)
print(min_gaussian_12)

In [None]:
print(statistic['var_12'])

In [None]:
print(np.unique(df_train['var_12']).shape)

In [None]:
print(np.unique(np.random.normal(df_train['var_12'].mean(), \
                                          df_train['var_12'].std(), df_train.shape[0]).round(4)).shape)

In [None]:
new_df = df_train.groupby(['var_12'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_df)

In [None]:
new_df_test = test_true.groupby(['var_12'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_df_test)

In [None]:
new_gauss = df_train.groupby(['random_var_12'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_gauss)

In [None]:
new_gauss_test = test_true.groupby(['random_var_12'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_gauss_test)

In [None]:
new_gauss = new_gauss.reset_index()

In [None]:
print(new_gauss[new_gauss['random_var_12']==13.5540])

In [None]:
new_df = new_df.reset_index()
print(new_df[new_df['var_12']==13.5540])

In [None]:
plot_unique_feature(df_train, 'var_0')

In [None]:
new_df = df_train.groupby(['var_0'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_df)

In [None]:
new_gauss = df_train.groupby(['random_var_0'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_gauss)

In [None]:
plot_unique_feature(df_train, 'var_108')

In [None]:
plot_unique_feature(df_train, 'var_126')

In [None]:
df_train['new'] = df_train['var_102'] + df_train['var_10']

# Analyzing Weird

In [None]:
import seaborn as sns
print(statistic[['var_12', 'var_108', 'var_126']])

In [None]:
def plot_dis(name):
    """Overlay the distributions of column ``name`` for the two target classes.

    Reads the notebook-level frames ``train0`` (target == 0) and ``train1``
    (target == 1) and draws both on the current axes.
    """
    for frame, legend_label in ((train0, 't=0'), (train1, 't=1')):
        sns.distplot(frame[name], label = legend_label)
    plt.title(name)
    plt.legend()
    plt.xlabel('')

In [None]:
plot_dis('var_12')

In [None]:
def plot_new(df_train, new_train0, new_train1, name):
    """Plot a smoothed estimate of P(target = 1 | name) over +/- 5 std around the mean.

    Parameters
    ----------
    df_train : DataFrame used for the mean and std of column ``name``.
    new_train0, new_train1 : DataFrames with the target == 0 / target == 1 rows.
    name : column to analyse.

    The probability at each of 501 grid points is the share of class-1 rows
    among all rows whose value lies strictly within std/3 of the point; points
    with fewer than 500 such rows are set to 0.1. The curve is then smoothed
    with 50 passes of a (0.25, 0.5, 0.25) kernel before plotting.
    """
    plt.xlim()  # kept from the original: touches (and if needed creates) the current axes

    sigma = np.std(df_train[name])
    mu = np.mean(df_train[name])

    window_divisor = 3      # smoothing factor: half-window is sigma / 3
    min_support = 500       # below this many rows the estimate is replaced
    fallback = 0.1
    half_window = sigma / window_divisor

    def rows_in_window(frame, centre):
        # Number of rows strictly inside (centre - half_window, centre + half_window).
        inside = (frame[name] > centre - half_window) & (frame[name] < centre + half_window)
        return len(frame[inside])

    def prob_at(centre):
        # P(target = 1 | value near centre), or the fallback when support is thin.
        positives = rows_in_window(new_train1, centre)
        negatives = rows_in_window(new_train0, centre)
        if positives + negatives < min_support:
            return fallback
        return positives / (positives + negatives)

    def smooth(curve, passes=1):
        # Repeated 3-point weighted average; both end points are reset to 0.1 each pass.
        for _ in range(passes):
            smoothed = np.ones(len(curve)) * 0.1
            smoothed[1:-1] = 0.25 * curve[:-2] + 0.5 * curve[1:-1] + 0.25 * curve[2:]
            curve = smoothed
        return curve

    # 501 grid points from mean - 5 std to mean + 5 std.
    n_points = 501
    grid = mu + np.linspace(-5, 5, n_points) * sigma
    raw_prob = np.array([prob_at(centre) for centre in grid])

    # Smoothed curve is for display only.
    display_prob = smooth(raw_prob, n_points // 10)

    plt.plot(grid, display_prob, '-')
    plt.title('P( t=1 | ' + name + ' )')

In [None]:
plot_new(df_train, train0, train1, 'var_12')

In [None]:
plot_new(df_train, train0, train1, 'var_75')

In [None]:
plot_dis('var_108')

In [None]:
plot_dis('var_126')

In [None]:
def transform(df, var='var_12'):
    """Add ``<var>_diff_normal_dist``: real minus simulated occurrence count per value.

    A normal sample with the column's mean/std (rounded to 4 decimals, one
    draw per row of ``df``) is generated. For every value that occurs in both
    the real column and the simulated one, the difference of the two
    occurrence counts is computed; rows whose value never occurs in the
    simulated sample get NaN.

    ``df`` is modified in place (the new column is added); the returned frame
    is ``df`` without the temporary ``random_<var>`` column.
    """
    random_col = 'random_{}'.format(var)
    diff_col = '{}_diff_normal_dist'.format(var)

    # One simulated value per row; the sample size was previously hard-coded
    # to 200000 and failed for frames of any other length.
    df[random_col] = np.random.normal(df[var].mean(), df[var].std(), df.shape[0]).round(4)

    var_counts = df.groupby(var)['ID_code'].count()
    random_counts = df.groupby(random_col)['ID_code'].count()

    # Subtraction aligns on the value; values missing on either side become
    # NaN and are dropped, i.e. an inner join on the value.
    diff = (var_counts - random_counts).dropna()

    # Map by value so the result follows df's own index regardless of its labels.
    df[diff_col] = df[var].map(diff)
    return df.drop(random_col, axis=1)

# Loop and add features
# NOTE(review): the original loop used `train_df` / `test_df`, which are never
# defined at notebook level; the loaded frames are `df_train` / `df_test`.
for var in tqdm(['var_{}'.format(x) for x in range(0, 200)]):
    df_train = transform(df_train, var=var)
    df_test = transform(df_test, var=var)

# New_feature

In [4]:
import seaborn as sns

In [5]:
train_new = pd.read_csv('../input/train.csv')

In [6]:
def logloss(y,yp):
    """Element-wise binary log loss, with predictions clipped to [1e-5, 1 - 1e-5]."""
    eps = 1e-5
    clipped = np.clip(yp, eps, 1 - eps)
    positive_term = -y * np.log(clipped)
    negative_term = (1 - y) * np.log(1 - clipped)
    return positive_term - negative_term
    
def reverse(tr):
    """Flip the sign of a fixed set of ``var_<i>`` columns, in place.

    The column numbers are hard-coded below. ``tr`` is modified and returned.
    """
    flipped_ids = (
        0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 15, 16, 18, 19,
        22, 24, 25, 26, 27, 41, 29,
        32, 35, 37, 40, 48, 49, 47,
        55, 51, 52, 53, 60, 61, 62, 103, 65, 66, 67, 69,
        70, 71, 74, 78, 79,
        82, 84, 89, 90, 91, 94, 95, 96, 97, 99,
        105, 106, 110, 111, 112, 118, 119, 125, 128,
        130, 133, 134, 135, 137, 138,
        140, 144, 145, 147, 151, 155, 157, 159,
        161, 162, 163, 164, 167, 168,
        170, 171, 173, 175, 176, 179,
        180, 181, 184, 185, 187, 189,
        190, 191, 195, 196, 199,
    )
    for idx in flipped_ids:
        name = 'var_%d' % idx
        tr[name] = -1 * tr[name]

    return tr

def scale(tr):
    """Standardise every column whose name starts with ``var_`` (in place).

    Each such column becomes ``(x - mean) / std`` using pandas' default
    (sample) standard deviation. Other columns are untouched. Returns ``tr``.
    """
    feature_cols = [name for name in tr.columns if name.startswith('var_')]
    for name in feature_cols:
        column = tr[name]
        centre, spread = column.mean(), column.std()
        tr[name] = (column - centre) / spread
    return tr

def getp_vec_sum(x,x_sort,y,std,c=0.5):
    """Difference of ``y`` between the two edges of a window around each query.

    For every query point in ``x`` the window is ``[x - std/c, x + std/c]``.
    Each edge is located in the sorted array ``x_sort`` with
    ``np.searchsorted``, positions past the end are clamped to the last index
    of ``y``, and ``y[right] - y[left]`` is returned.
    """
    last = y.shape[0] - 1
    half_width = std / c

    positions = []
    for edge in (x - half_width, x + half_width):
        pos = np.searchsorted(x_sort, edge)
        pos[pos > last] = last   # clamp insertion points that fall past the end
        positions.append(pos)

    lower, upper = positions
    return y[upper] - y[lower]

def get_pdf(tr,col,x_query=None,smooth=3):
    """Windowed estimate of the mean of ``target`` as a function of ``col``.

    Parameters
    ----------
    tr : DataFrame containing ``col`` and ``target``.
    col : column to condition on.
    x_query : array of query points; defaults to 501 points on [-5, 5].
    smooth : passed to ``getp_vec_sum`` as ``c`` (window half-width is
        ``std / smooth``).

    Returns
    -------
    numpy.ndarray
        One value per query point: windowed sum of ``target`` divided by
        (windowed row count + 1). Points whose (count + 1) is below 500 are
        set to 0.1.
    """
    # No plotting happens here; the stray plt.xlim() call that used to open a
    # matplotlib figure on every invocation has been removed.
    std = tr[col].std()

    grouped = tr.groupby(col).agg({'target': ['sum', 'count']})
    grouped.columns = ['sum_y', 'count_y']
    grouped = grouped.reset_index().sort_values(col)

    # Cumulative sums let a window total be read off as a difference of two entries.
    sorted_values = grouped[col].values
    cum_target = grouped['sum_y'].cumsum().values
    cum_count = grouped['count_y'].cumsum().values

    if x_query is None:
        rmin, rmax, res = -5.0, 5.0, 501
        x_query = np.linspace(rmin, rmax, res)

    tm = getp_vec_sum(x_query, sorted_values, cum_target, std, c=smooth)
    cm = getp_vec_sum(x_query, sorted_values, cum_count, std, c=smooth) + 1

    result = tm / cm
    result[cm < 500] = 0.1   # too little support: fall back to a constant
    return result

def get_pdfs(tr):
    """Stack ``get_pdf`` results for ``var_0`` .. ``var_199`` into one array (one row per feature)."""
    curves = [get_pdf(tr, 'var_%d' % idx) for idx in range(200)]
    return np.vstack(curves)

def print_corr(corr_mat,col,bar=0.97):
    """Print and return the row labels whose entry in column ``col`` exceeds ``bar``.

    Parameters
    ----------
    corr_mat : DataFrame (e.g. a correlation matrix).
    col : column of ``corr_mat`` to inspect; printed first.
    bar : strict lower threshold.

    Returns
    -------
    numpy.ndarray of the matching index labels.
    """
    print(col)
    cols = corr_mat.loc[corr_mat[col]>bar,col].index.values
    # The unused `cols_` list was dropped: it called .split() on every label
    # and therefore crashed on non-string indexes without affecting the result.
    print(cols)
    return cols

In [7]:
def processing_new(df, cols, df_add):
    """Add ``<feature>_map`` columns to ``df`` for every feature in ``cols``.

    Value statistics are taken over ``df`` and ``df_add`` stacked together.
    For each row the new value is

        (total rows / occurrences of the row's value)
        / (|value - most frequent value| + 1e-8)

    ``df`` is modified in place and returned. Each feature name is printed as
    it is processed.
    """
    wanted = cols + ['ID_code']
    stacked = pd.concat([df[wanted], df_add[wanted]], axis=0)
    n_total = stacked.shape[0]
    n_own = df.shape[0]

    for feature in cols:
        print(feature)
        out_col = feature + '_map'

        most_common = stacked[feature].value_counts().index[0]
        occurrences = stacked.groupby([feature])['ID_code'].transform('count')
        rarity = n_total / occurrences
        distance = np.abs(stacked[feature] - most_common) + 1e-8

        stacked[out_col] = rarity / distance
        # The first n_own stacked rows are df's own rows.
        df[out_col] = stacked.iloc[:n_own][out_col]

    return df

def processing_new_new(df, cols, df_add):
    """Work-in-progress feature builder; currently returns ``df`` unchanged.

    For each feature in ``cols`` the name is printed and the relative value
    frequencies of ``df`` and ``df_add`` are computed, but nothing is written
    back yet (the candidate transformations are still commented out).
    """

    def trans(x, m, w):
        # Prefer the value from `w` when `x` is one of its index labels,
        # otherwise fall back to `m`. Not called anywhere yet.
        return w[x] if x in w.index else m[x]

    for feature in cols:
        print(feature)

        # Relative frequency of every distinct value, per frame (unused so far).
        freq_in_add = df_add[feature].value_counts() / df_add.shape[0]
        freq_in_df = df[feature].value_counts() / df.shape[0]

        # Candidate transformations kept for reference:
        #df[feature+'_map'] = df.groupby([feature])['ID_code'].transform('count')
        #size = tmp.shape[0]
        #df[feature+'_map'] /= size
        #df[feature+'_map'] = (df[feature] + np.random.normal(1e-15, 1e-15, df.shape[0])).round(16)

        #df[feature+'_map'] = df[feature+'_map']-df[feature+'_map_test']

        #df = df.drop([feature+'_map_test'], axis=1)

    return df

#train_new = scale(train_new)
train_new = processing_new_new(train_new, ['var_12', 'var_108', 'var_126'], test_true)

var_12
var_108
var_126


In [8]:
def plot_hist(df, feature):
    """Overlay 30-bin density histograms of ``feature`` for target == 0 and target == 1.

    ``df`` must contain ``target`` and ``feature``. X ticks are placed on the
    bin edges and a vertical grid is drawn.
    """
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([negData, posData],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("target 0", "target 1"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    # Label the two curves, as the other histogram helpers in this notebook do.
    ax.legend()

    # plt.show() alone renders the figure; the extra fig.show() only produced a
    # "non-GUI backend" warning under %matplotlib inline.
    plt.show()

In [9]:
def plot_hist_train_test_fre(train_new, test_true, feature):
    """Compare how frequent each row's ``feature`` value is in train vs. test.

    For every row, the relative frequency of its value within its own frame
    is computed (occurrences / number of rows), scaled by 10000, and the two
    resulting distributions are drawn as 30-bin density histograms.
    """
    # Series.map with the value-count table is the vectorised form of looking
    # each value up one at a time.
    train_freq = train_new[feature].map(train_new[feature].value_counts()) / train_new.shape[0]
    test_freq = test_true[feature].map(test_true[feature].value_counts()) / test_true.shape[0]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([train_freq * 10000, test_freq * 10000],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("train", "test_true"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [10]:
def plot_hist_train_fre(train_new, feature):
    """Compare value-frequency distributions of ``feature`` between the two classes.

    For each class, every row's value is replaced by the number of times that
    value occurs within the class divided by the total number of rows in
    ``train_new``, scaled by 10000. The two distributions are drawn as 40-bin
    density histograms (target 1 first, then target 0).
    """
    pos_values = train_new.loc[train_new['target'] == 1, feature]
    neg_values = train_new.loc[train_new['target'] == 0, feature]

    n_rows = train_new.shape[0]
    # Vectorised lookup of each value's within-class occurrence count.
    pos_freq = pos_values.map(pos_values.value_counts()) / n_rows
    neg_freq = neg_values.map(neg_values.value_counts()) / n_rows

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    # The series were previously named the wrong way round (negData held the
    # target == 1 rows) and unlabeled; the plotting order is unchanged.
    _, bin_edges, _ = ax.hist([pos_freq * 10000, neg_freq * 10000],
                              bins=40,
                              density=True,
                              histtype='step',
                              label=("target 1", "target 0"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [11]:
def plot_hist_ratio(train_new, feature):
    """Histogram of the class-1 / class-0 occurrence ratio of each class-1 value.

    For every target == 1 row, the number of times its ``feature`` value occurs
    among target == 1 rows is divided by the number of times it occurs among
    target == 0 rows. Values that never occur among target == 0 rows get a
    ratio of 1. The ratios are drawn as a 30-bin density histogram.
    """
    pos_values = train_new.loc[train_new['target'] == 1, feature]
    neg_values = train_new.loc[train_new['target'] == 0, feature]

    pos_counts = pos_values.value_counts()
    neg_counts = neg_values.value_counts()

    # Vectorised lookups; a value absent from the target == 0 table maps to NaN
    # and is replaced by the fallback ratio of 1.
    count_in_pos = pos_values.map(pos_counts)
    count_in_neg = pos_values.map(neg_counts)
    Data = (count_in_pos / count_in_neg).where(count_in_neg.notna(), 1)

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([Data],
                              bins=30,
                              density=True,
                              histtype='step',
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [None]:
plot_hist_ratio(train_new, 'var_0')

In [None]:
plot_hist_train_fre(train_new, 'var_0')

In [None]:
plot_hist_train_test_fre(train_new, test_true, 'var_12')

In [None]:
train_new.head()

In [None]:
# Correlate numeric columns only. train_new still carries the ID_code column
# (presumably a non-numeric identifier), and recent pandas versions raise in
# DataFrame.corr() when non-numeric columns are present.
corrs = (
    train_new.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort")
    .reset_index()
)
# Keep each unordered pair once and drop the self-correlations.
corrs = corrs[corrs['level_0'] < corrs['level_1']]
corrs.tail(30)

In [25]:
y_train = train_new['target']

train0 = train_new[y_train.values==0].copy()
train1 = train_new[y_train.values==1].copy()

In [None]:
train1.head()

In [None]:
c = ['var_' + str(i) for i in range(200)]
count = []
index = 13
for col in c:
    if train_new[col].value_counts()[train_new.iloc[index][col]]==1:
        count.append(col)
print(count, train_new.iloc[index]['target'])

In [None]:
# Collect the var_12 values seen among target == 1 rows but never among target == 0 rows.
# NOTE(review): train1_count and train0_count are only assigned in cells further
# down this notebook, so this cell fails on a fresh top-to-bottom run.
arr = []
for i in train1_count.index:
    if i not in train0_count.index:
        arr.append(i)

In [None]:
14.779000000000002 - 14.7790

In [None]:
for i in arr:
    if i not in test_.index:
        print(i)

In [None]:
for i in train0_count.index:
    if i not in train1_count.index:
        print(i)

In [None]:
test_ = test_true['var_12'].value_counts()
print(test_/test_true.shape[0])

In [None]:
train1_count = train1['var_12'].value_counts()
print(train1_count/train_new.shape[0])

In [None]:
train0_count = train0['var_12'].value_counts()
print(train0_count/train_new.shape[0])

In [12]:
def plot_hist_all(df, test, test_synthetic, feature):
    """Overlay 30-bin density histograms of ``feature`` for five populations.

    Curves, in order: all of ``df`` ("train"), ``test`` ("test_true"),
    ``test_synthetic``, and the target == 0 / target == 1 subsets of ``df``.
    """
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([df[feature], test[feature], test_synthetic[feature], negData, posData],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("train", "test_true", "test_synthetic", "target 0", "target 1"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [13]:
def plot_hist_train_test(train_new, test_true, test_synthetic, feature):
    """Overlay 30-bin density histograms of ``feature`` for train, real test and synthetic test."""
    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    # The inputs are whole frames, not class subsets, so they are passed
    # directly rather than through the old negData/posData names.
    _, bin_edges, _ = ax.hist([train_new[feature], test_true[feature], test_synthetic[feature]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("train", "test_true", "test_synthetic"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [14]:
def plot_hist_test(test_true, test_synthetic, feature):
    """Overlay 30-bin density histograms of ``feature`` for real vs. synthetic test rows."""
    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([test_true[feature], test_synthetic[feature]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("test_true", "test_synthetic"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [15]:
def plot_hist_test_all(train, test, feature):
    """Overlay 30-bin density histograms of ``feature`` for the train and test frames."""
    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([train[feature], test[feature]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("train", "test"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [16]:
def plot_hist_train_test_true(train_new, test_true, feature):
    """Overlay 30-bin density histograms of ``feature`` for train vs. real test rows."""
    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([train_new[feature], test_true[feature]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("train", "test_true"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [17]:
def plot_hist_train_test_syn(train_new, test_synthetic, feature):
    """Overlay 30-bin density histograms of ``feature`` for train vs. synthetic test rows."""
    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([train_new[feature], test_synthetic[feature]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("train", "test_synthetic"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [18]:
def get_gradient(df, feature, bins=1000):
    """Add ``gradient_<feature>``: ratio of consecutive histogram-density steps.

    A density histogram of ``df[feature]`` with ``bins`` bins is built. For a
    value falling in bin ``k`` the result is

        (h[k+1] - h[k]) / (h[k] - h[k-1])

    with these special cases:

    * first bin -> 100, last bin -> -100;
    * ``h[k] == h[k-1]`` -> 1 if ``h[k+1] == h[k]`` as well, otherwise 100.

    Parameters
    ----------
    df : DataFrame; modified in place and returned.
    feature : column to analyse.
    bins : number of histogram bins (default 1000, the previous fixed value).
    """
    values = df[feature].to_numpy()
    hist, bin_edges = np.histogram(values, bins=bins, density=True)

    # Bin index of every value, using np.histogram's convention (left-closed
    # bins, maximum included in the last bin). Clipping keeps the minimum in
    # bin 0; previously it produced index -1 and silently wrapped around to
    # the last bins of the histogram.
    bin_idx = np.clip(np.searchsorted(bin_edges, values, side='right') - 1, 0, bins - 1)

    # Edge bins get the sentinel values; interior bins are filled in below.
    gradient = np.where(bin_idx == 0, 100.0, -100.0)

    interior = (bin_idx > 0) & (bin_idx < bins - 1)
    k = bin_idx[interior]
    step_prev = hist[k] - hist[k - 1]
    step_next = hist[k + 1] - hist[k]

    # Default covers the flat-previous-step case; the true ratio overwrites
    # it wherever the denominator is non-zero.
    ratio = np.where(step_next == 0, 1.0, 100.0)
    defined = step_prev != 0
    ratio[defined] = step_next[defined] / step_prev[defined]
    gradient[interior] = ratio

    df['gradient_' + feature] = gradient

    return df

In [None]:
train_new = get_gradient(train_new, 'var_13')
train_new = get_gradient(train_new, 'random_var_13')

In [None]:
train_new.head()

In [None]:
train_new['gradient_var_13'].value_counts()

In [None]:
train_new.head()

In [None]:
def plot_hist_target(df, feature):
    """Overlay 100-bin density histograms of ``feature`` for target == 0 and target == 1."""
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([negData, posData],
                              bins=100,
                              density=True,
                              histtype='step',
                              label=("target 0", "target 1"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [None]:
plot_hist_target(train_new, 'gradient_var_13')

In [None]:
plot_hist_target(train_new, 'gradient_random_var_13')

In [None]:
plot_dis_hist(train_new, 'gradient_var_13')

In [None]:
plot_dis_hist(train_new, 'gradient_random_var_13')

In [19]:
def plot_dis_hist(df, var):
    """Draw a 30-bin density histogram of the single column ``var`` of ``df``."""
    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(var)

    _, bin_edges, _ = ax.hist([df[var]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=var,
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [None]:
plot_hist_test(test_true, test_synthetic, 'var_13')

In [20]:
def plot_hist_target_true(df, test, feature):
    """Overlay 30-bin density histograms of ``feature``: ``test`` plus both target classes of ``df``."""
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([test[feature], negData, posData],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("test_true", "target 0", "target 1"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [21]:
def plot_hist_target_synthetic(df, test_synthetic, feature):
    """Overlay 30-bin density histograms of ``feature``: synthetic test rows plus both target classes of ``df``."""
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([test_synthetic[feature], negData, posData],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("test_synthetic", "target 0", "target 1"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [None]:
plot_hist_train_test_syn(train_new, test_synthetic, 'var_13')

In [None]:
plot_hist_train_test_true(train_new, test_true, 'var_13')

In [None]:
plot_hist_test_all(df_train, df_test, 'var_13')

In [22]:
def plot_random_diff(train_new, feature, test_df=None):
    """Compare ``feature`` with a Gaussian simulation of it and with the test data.

    A normal sample with the column's mean/std (rounded to 4 decimals) is
    stored in ``train_new['random_<feature>']`` (side effect on the input),
    then 30-bin density histograms of the real column, the simulated column
    and the test column are overlaid.

    Parameters
    ----------
    train_new : DataFrame containing ``feature``; gains the simulated column.
    feature : column to analyse.
    test_df : optional DataFrame supplying the "test" curve. When omitted the
        notebook-level ``df_test`` is used, as before.
    """
    if test_df is None:
        # Backward-compatible default: the global this function always read.
        test_df = df_test

    random_feature = 'random_' + feature
    train_new[random_feature] = np.random.normal(
        train_new[feature].mean(), train_new[feature].std(), train_new.shape[0]).round(4)

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))
    fig.suptitle(feature)

    _, bin_edges, _ = ax.hist([train_new[feature], train_new[random_feature], test_df[feature]],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=(feature, "random_" + feature, "test"),
                              linewidth=3)
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    ax.legend()

    # plt.show() is sufficient; fig.show() only warned under the inline backend.
    plt.show()

In [None]:
plot_random_diff(train_new, 'var_12')

In [None]:
plot_hist_target(train_new, 'var_12')

In [None]:
plot_hist_all(train_new, test_true, test_synthetic, 'var_7')

In [None]:
plot_hist_target(train_new, 'var_13')

In [None]:
plot_hist_train_test(train_new, test_true, test_synthetic, 'var_0')

In [None]:
plot_hist_target_true(train_new, test_true, 'var_7')

In [None]:
plot_hist_target_synthetic(train_new, test_synthetic, 'var_7')

In [None]:
# NOTE(review): plot_hist_target_test is not defined anywhere in the visible
# notebook, so this call raises NameError unless it is defined elsewhere.
plot_hist_target_test(train_new, df_test, 'var_7')

In [None]:
def transform(df, add_0, add_1, var):
    """Add ``<var>_diff_normal_dist`` to ``df`` using counts over three frames.

    ``df``, ``add_0`` and ``add_1`` are stacked; a normal sample with the
    stacked column's mean/std (rounded to 4 decimals, one draw per stacked
    row) is generated. For every value present in both the real and the
    simulated column, the difference "real occurrences - simulated
    occurrences" is computed. Each row of ``df`` receives the difference for
    its value, or NaN when its value never occurs in the simulated sample.

    ``df`` is modified in place and returned.
    """
    diff_col = '{}_diff_normal_dist'.format(var)
    random_col = 'random_{}'.format(var)
    total_cols = [var, 'ID_code']

    # ignore_index gives the stacked frame a clean 0..n-1 index; the source
    # frames may carry arbitrary (and mutually duplicated) labels.
    tmp = pd.concat([df[total_cols], add_0[total_cols], add_1[total_cols]],
                    axis=0, ignore_index=True)

    tmp[random_col] = np.random.normal(tmp[var].mean(), tmp[var].std(), tmp.shape[0]).round(4)
    var_counts = tmp.groupby(var)['ID_code'].count()
    random_counts = tmp.groupby(random_col)['ID_code'].count()

    # Subtraction aligns on the value; values missing on either side become
    # NaN and are dropped, i.e. an inner join on the value.
    diff = (var_counts - random_counts).dropna()

    # Look the difference up by value so the result follows df's own index.
    # The earlier positional merge was assigned back by label and misaligned
    # whenever df did not have a default RangeIndex.
    df[diff_col] = df[var].map(diff)
    return df

train_new = transform(train_new, test_true, test_synthetic, 'var_12')
test_true = transform(test_true, train_new, test_synthetic, 'var_12')
test_synthetic = transform(test_synthetic, train_new, test_true, 'var_12')

In [None]:
def transform_fre(df, add_0, add_1, var):
    """Add ``'test_' + var``: pooled relative frequency of each row's ``var`` value.

    The frequency of a value is the number of times it occurs in ``df``,
    ``add_0`` and ``add_1`` together, divided by the total number of pooled
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new column (modified in place).
    add_0, add_1 : pandas.DataFrame
        Extra frames pooled with ``df``; not modified.
    var : str
        Name of the column to analyse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column added.
    """
    pooled = pd.concat([df[var], add_0[var], add_1[var]], axis=0)
    # Count over the same pooled sample that supplies the denominator; counting
    # only `df` while dividing by the pooled size mixed two different samples
    freq = pooled.value_counts() / pooled.shape[0]
    df['test_' + var] = df[var].map(freq)

    return df

# Add the 'test_var_12' frequency column to each of the three frames in turn
train_new = transform_fre(train_new, test_true, test_synthetic, 'var_12')
test_true = transform_fre(test_true, train_new, test_synthetic, 'var_12')
test_synthetic = transform_fre(test_synthetic, train_new, test_true, 'var_12')

In [None]:
# Mean target for every distinct combination of the `important` features,
# highest mean first
important = ['var_12']
tmp = (
    train_new.groupby(important)['target']
    .mean()
    .to_frame()
    .sort_values(by='target', ascending=False)
    .reset_index()
)

In [None]:
# `tmp` comes from a groupby and holds one row per distinct combination of the
# `important` features, so with a single feature every value occurs exactly
# once in `tmp` and counting there gives a constant.  The frequencies are
# therefore taken from `train_new`, the frame the groups were built from.
tmp['count'] = np.zeros(tmp.shape[0])

for feature in important:
    freq = train_new[feature].value_counts() / train_new.shape[0]
    tmp['count'] += tmp[feature].map(freq)

# Most frequent values first
tmp = tmp.sort_values(by='count', ascending=False)

print(tmp.head(100))

In [None]:
# For each of the 200 raw features store the relative frequency of every row's
# value in '<feature>_count', and accumulate their sum in 'count'
important = ['var_{}'.format(n) for n in range(200)]
size = train_new.shape[0]
train_new['count'] = np.zeros(size)
for feature in important:
    count_col = feature + '_count'
    occurrences = train_new.groupby([feature])['ID_code'].transform('count')
    train_new[count_col] = occurrences / size
    train_new['count'] = train_new['count'] + train_new[count_col]

In [None]:
# Subtract the summed frequency column 'count' from every per-feature
# '<feature>_count' column.  The update is in place (`-=`), so running this
# cell a second time subtracts 'count' again.
for feature in important:
    
    train_new[feature+'_count'] -= train_new['count']

In [None]:
plot_hist_target(train_new, 'count')

In [None]:
def transform_fre_dis(df, add_0, add_1, cols):
    """Add a pooled relative-frequency column ``<var>_count`` for each ``var`` in ``cols``.

    The three frames are stacked; for every row the feature is the number of
    stacked rows sharing its ``var`` value (counted through the non-null
    ``'ID_code'`` entries) divided by the total number of stacked rows.

    Only ``<var>_count`` is produced.  The ``<var>_dis`` distance feature that
    used to be sketched here in commented-out code is not computed.

    Parameters
    ----------
    df, add_0, add_1 : pandas.DataFrame
        Frames to pool.  All three are modified in place and must contain
        every column in ``cols`` plus ``'ID_code'``.
    cols : sequence of str
        Columns to compute frequencies for.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, add_0, add_1)`` - the same objects, each with the new columns.
    """
    total_cols = list(cols) + ['ID_code']
    tmp = pd.concat([df[total_cols], add_0[total_cols], add_1[total_cols]], axis=0)
    print(tmp.shape)
    size = tmp.shape[0]

    # Row offsets of the three inputs inside the stacked frame
    end_df = df.shape[0]
    end_add_0 = end_df + add_0.shape[0]

    for var in cols:

        print(var)

        # Positional array: the stacked frame may carry repeated index labels,
        # so the slices below are written back by position
        freq = (tmp.groupby([var])['ID_code'].transform('count') / size).values

        df[var + '_count'] = freq[:end_df]
        add_0[var + '_count'] = freq[end_df:end_add_0]
        add_1[var + '_count'] = freq[end_add_0:]

    return df, add_0, add_1


# Feature inspected by the following cells
feature = 'var_12'

# Adds 'var_12_count' to all three frames
train_new, test_true, test_synthetic = transform_fre_dis(train_new, test_true, test_synthetic, [feature])

In [None]:
# NOTE(review): the only visible code that would create '<feature>_dis' is
# commented out inside transform_fre_dis, so this column may not exist here
# - confirm it is produced elsewhere before running this cell.
plot_hist_target(train_new, feature+'_dis')

In [None]:
plot_hist_target(train_new, feature+'_count')

In [None]:
# Per distinct var_12 value: mean of the frequency, distance and target
# columns, most frequent values first.
# NOTE(review): 'var_12_dis' is not created by any visible active code (its
# computation in transform_fre_dis is commented out) - confirm it exists.
tmp = train_new[['var_12', 'var_12_count', 'var_12_dis', 'target']]
tmp = tmp.groupby('var_12')[['var_12_count', 'var_12_dis', 'target']].mean().sort_values(by='var_12_count', ascending=False)
tmp.head(100)

In [None]:
plot_dis_hist(train_new, 'var_12_count')

In [None]:
def transform_(df, add_0, add_1, var, seed=None):
    """Add ``<var>_diff_normal_dist`` to ``df`` using ``df`` alone, scaled by its row count.

    A sample of ``len(df)`` values is drawn from a normal distribution with
    the mean and standard deviation of ``df[var]`` (rounded to 4 decimals).
    For every value of ``var`` that also occurs in the simulated sample the
    feature is ``(count(observed) - count(simulated)) / len(df)``; other
    values get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new column (modified in place).  Must contain
        ``var`` and ``'ID_code'``.
    add_0, add_1 : object
        Unused; kept so the signature matches ``transform``.
    var : str
        Name of the column to analyse.
    seed : int, optional
        Seed for the simulated sample.  ``None`` (default) draws from NumPy's
        global random state.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column added.
    """
    random_col = 'random_{}'.format(var)
    diff_col = '{}_diff_normal_dist'.format(var)

    tmp = df[[var, 'ID_code']].copy()

    rng = np.random if seed is None else np.random.RandomState(seed)
    tmp[random_col] = rng.normal(tmp[var].mean(), tmp[var].std(), tmp.shape[0]).round(4)

    var_counts = pd.DataFrame(tmp.groupby(var)['ID_code'].count()).reset_index()
    var_counts_random = pd.DataFrame(tmp.groupby(random_col)['ID_code'].count()).reset_index()
    # Inner merge: only values present in both the observed and simulated samples survive
    merged_counts = pd.merge(var_counts, var_counts_random, left_on=var, right_on=random_col)
    merged_counts['diff'] = merged_counts['ID_code_x'] - merged_counts['ID_code_y']

    # merge() returns a fresh 0..n-1 index while `df` may carry arbitrary
    # labels, so the result is copied by position, not by index label
    diffs = tmp.merge(merged_counts[[var, 'diff']], how='left')['diff'].values
    df[diff_col] = diffs / tmp.shape[0]
    return df

# transform_ writes the same 'var_12_diff_normal_dist' column name as
# transform, so these calls overwrite any column of that name already present
# on the three frames.  The second and third arguments are ignored by transform_.
train_new = transform_(train_new, test_true, test_synthetic, 'var_12')
test_true = transform_(test_true, train_new, test_synthetic, 'var_12')
test_synthetic = transform_(test_synthetic, train_new, test_true, 'var_12')

In [None]:
plot_hist_all(train_new, test_true, test_synthetic, 'test_var_12')

In [None]:
plot_hist_target(train_new, 'test_var_12')

In [None]:
plot_hist_target(train_new, 'var_12')

In [None]:
plot_hist_all(train_new, test_true, test_synthetic, 'new_var_12')

In [None]:
plot_hist_train_test_true(train_new, test_true, 'new_var_12')

In [None]:
plot_hist_train_test_syn(train_new, test_synthetic, 'new_var_12')

In [None]:
plot_hist_target(train_new, 'new_var_12')

In [None]:
plot_hist_target(train_new, 'fe_var_12')

In [23]:
def plot_hist_target(df, feature):
    """Overlay density-normalised step histograms of ``feature`` for target 0 and target 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feature`` and a ``'target'`` column with values 0 and 1.
    feature : str
        Column to plot; also used as the figure title.
    """
    # One .loc call (row mask + column) instead of chained indexing
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))

    fig.suptitle(feature)
    _, bin_edges, _ = ax.hist([negData, posData],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("target 0", "target 1"),
                              linewidth=3)
    # Put a tick and a vertical grid line on every bin edge
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels)
    # plt.show() renders the figure; the additional fig.show() was redundant
    # and can emit a warning on non-GUI backends
    plt.show()

In [24]:
def plot_hist_target_test(df, test, feature):
    """Overlay density-normalised step histograms of ``feature`` for test, target 0 and target 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled frame; must contain ``feature`` and a ``'target'`` column
        with values 0 and 1.
    test : pandas.DataFrame
        Frame whose ``feature`` column is plotted as the "test" series.
    feature : str
        Column to plot; also used as the figure title.
    """
    # One .loc call (row mask + column) instead of chained indexing
    negData = df.loc[df['target'] == 0, feature]
    posData = df.loc[df['target'] == 1, feature]

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))

    fig.suptitle(feature)
    _, bin_edges, _ = ax.hist([test[feature], negData, posData],
                              bins=30,
                              density=True,
                              histtype='step',
                              label=("test", "target 0", "target 1"),
                              linewidth=3)
    # Put a tick and a vertical grid line on every bin edge
    ax.set_xticks(bin_edges)
    ax.xaxis.grid(True)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels)
    # plt.show() renders the figure; the additional fig.show() was redundant
    # and can emit a warning on non-GUI backends
    plt.show()

In [None]:
plot_hist_train_test(train_new, test_true, 'var_81')

In [None]:
plot_hist(train_new, 'var_81')

In [None]:
def plot_hist_count(train_new, feature):
    """Bar chart of how often each distinct value of ``feature`` occurs.

    Parameters
    ----------
    train_new : pandas.DataFrame
        Frame containing ``feature``.
    feature : str
        Column to count; also used as the figure title.
    """
    Data = train_new[feature].value_counts()

    fig, ax = plt.subplots(ncols=1, figsize=(20, 5))

    fig.suptitle(feature)
    # Draw on the axes created above rather than through pyplot's implicit
    # "current axes"
    ax.bar(Data.index, Data.values, align='center')

    # plt.show() renders the figure; the additional fig.show() was redundant
    # and can emit a warning on non-GUI backends
    plt.show()

In [None]:
plot_hist_count(train_new, 'var_3')

In [None]:
print(train_new['var_3'].value_counts())

In [None]:
def plot_dis(name, xlim=(11, 16)):
    """Overlay the distributions of column ``name`` for ``train0`` (t=0) and ``train1`` (t=1).

    Reads the notebook-level frames ``train0`` and ``train1``.

    Parameters
    ----------
    name : str
        Column to plot; also used as the plot title.
    xlim : tuple of float or None, optional
        x-axis limits applied after plotting.  Defaults to ``(11, 16)``, the
        range that used to be hard-coded; pass ``None`` to keep the automatic
        limits.

    Notes
    -----
    NOTE(review): no ``import seaborn as sns`` is visible in this part of the
    notebook - confirm ``sns`` is in scope before this cell runs.  Seaborn
    documents ``distplot`` as deprecated; the call is kept so the figures do
    not change.
    """
    sns.distplot(train0[name], label='t=0')
    sns.distplot(train1[name], label='t=1')
    plt.title(name)
    plt.legend()
    if xlim is not None:
        # Applied last so the plotting calls cannot override the limits
        plt.xlim(xlim)
    plt.xlabel('')

In [None]:
plot_dis('var_126')

In [None]:
plot_dis('var_126_map')

In [None]:
prob = get_pdf(train_new,'var_126_map')
plt.plot(prob)

In [None]:
print(np.unique(train_new[train_new['target']==1]['var_126']).shape)

In [None]:
train_new['var_126'].value_counts()

# Difference in train and test

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas(desc='Progress')
import matplotlib.pylab as plt
%matplotlib inline
import gc

# Raw train/test data plus the three files listing test-row positions
test = pd.read_csv('../input/test.csv')
df_train = pd.read_csv('../input/train.csv')
private_lb = pd.read_csv("../input/Private_LB.csv")
public_lb = pd.read_csv("../input/Public_LB.csv")
synthetic = pd.read_csv("../input/synthetic_samples_indexes.csv")

# Give the three index files a common column name
private_lb = private_lb.rename(index=str, columns={"Private_LB": "index"})
public_lb = public_lb.rename(index=str, columns={"Public_LB": "index"})
synthetic = synthetic.rename(index=str, columns={"synthetic_samples_indexes": "index"})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# builds the same stacked frame on every pandas version
true = pd.concat([public_lb, private_lb], ignore_index=True)

# Positional selection of the listed rows
test_true = test.iloc[true["index"], :]
test_synthetic = test.iloc[synthetic["index"], :]

In [None]:
# Stack the train rows and the real test rows (every column except the label)
# and count how often each distinct var_0 value occurs in either set
total_cols = list(filter(lambda col: col != 'target', df_train.columns))
feature_frames = [frame[total_cols] for frame in (df_train, test_true)]
tmp = pd.concat(feature_frames, axis=0)
x, y = (frame['var_0'].value_counts() for frame in (df_train, test_true))

In [None]:
# Turn the raw var_0 value counts into relative frequencies within train (x)
# and within the real test rows (y).  The division is in place, so running
# this cell a second time divides again.
x /= df_train.shape[0]
y /= test_true.shape[0]

In [None]:
# Mean target for every distinct var_0 value seen in train.  A single groupby
# replaces the former full scan of df_train for each distinct value.
target_mean_by_value = df_train.groupby('var_0')['target'].mean()
m = pd.DataFrame(x.index, columns=['id'])
m['mean'] = m['id'].map(target_mean_by_value)

In [None]:
m = m.sort_values(by='mean', ascending=False)
m.head(20)

In [None]:
x[13.9072]*df_train.shape[0]

In [None]:
x.head(20)

In [None]:
y.head(20)

In [None]:
# Relative frequency of every var_0 value in the stacked train + real-test
# frame, most frequent first
w = (tmp['var_0'].value_counts() / tmp.shape[0]).sort_values(ascending=False)
w.head(20)

In [None]:
# Average target rate over the most frequent var_0 values (ranked by the
# pooled frequency `w`) that also occur in train.
n_values = 101  # the former `while len(arr) <= 100` loop collected 101 values
target_mean_by_value = df_train.groupby('var_0')['target'].mean()
# Bounded selection: no IndexError when fewer than `n_values` candidates
# exist, and no rescan of df_train for every value
top_values = w.index[w.index.isin(x.index)][:n_values]
arr = target_mean_by_value.reindex(top_values).tolist()
print(np.mean(arr))

In [None]:
# Score every var_0 value seen in train from its train frequency (z/x) and,
# when the value also occurs in the real test rows, its test frequency (y);
# then rank the values by that score.
z = x.copy()
for idx in z.index:
    if(idx in y.index):
        # NOTE(review): when y[idx] < z[idx] the log argument is negative and
        # np.log returns NaN - confirm whether an absolute difference was intended
        z[idx] = (1-np.log(y[idx]-z[idx]+1e-8))*(z[idx]+y[idx])
    else:
        # Value absent from test: score from the train frequency alone
        z[idx] = (1-np.log(z[idx]+1e-8))*z[idx]
z = z.sort_values(ascending=False)
z.head(20)       

In [None]:
# Target rate for each of the 20 highest-scoring var_0 values in `z`, followed
# by their average.  The rates are now collected as well as printed; before,
# `arr` stayed empty and the final np.mean(arr) was NaN.
arr = []
for i in range(20):
    idx = z.index[i]
    rate = df_train[df_train['var_0']==idx]['target'].mean()
    print(rate)
    arr.append(rate)

print(np.mean(arr))

In [None]:
# Average target rate over the last 20 entries of `x` (walking backwards from
# the end of its index)
arr = []
for i in range(20):
    idx = x.index[len(x.index) - 1 - i]
    is_value = df_train['var_0'] == idx
    arr.append(df_train.loc[is_value, 'target'].mean())
print(np.mean(arr))

In [None]:
# Average target rate over var_0 values taken from the end of `y` (walking
# backwards) that also occur in train.
arr = []
# Start at the last entry of `y`.  `i` used to be left uninitialised here and
# silently reused whatever value an earlier cell's loop had left behind.
i = 0
# `<= 20` (21 values) is kept from the original condition; the extra bound
# stops the walk once every entry of `y` has been visited
while len(arr) <= 20 and i < len(y.index):
    idx = y.index[len(y.index)-i-1]
    if idx in x.index:
        arr.append(df_train[df_train['var_0']==idx]['target'].mean())
    i += 1
print(np.mean(arr))

In [None]:
# List the train var_0 values strictly inside (13.5538, 13.5553) with their frequency
for idx in x.index:
    if not 13.5538 < idx < 13.5553:
        continue
    print(idx, x[idx])

In [None]:
print(x.head(20))

In [None]:
print(y.head(20))

In [None]:
print(z.head(20))

In [None]:
print(y[13.9842])

In [None]:
print(df_train[df_train['var_12']==14.1713]['target'].mean())

In [None]:
print(df_train[df_train['var_12']==13.5550]['target'].mean())

In [None]:
print(df_train[df_train['var_12']==13.5545]['target'].mean())

In [None]:
print(df_train[df_train['var_12']==14.0810]['target'].mean())

In [None]:
print(df_train[df_train['var_12']==14.0616]['target'].mean())

In [None]:
# Print every var_126 value that occurs in the real test rows but never in
# train.  The train value index is built once instead of being recomputed on
# every iteration.
train_values = train_new['var_126'].value_counts().index
for i in test_true['var_126'].value_counts().index:
    if i not in train_values:
        print(i)

In [None]:
new_12 = train_new.groupby(['var_12'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_12)

In [None]:
plot_dis('var_108')

In [None]:
plot_dis('noise_var_108')

In [None]:
new_108 = train_new.groupby(['var_108'])['target'].agg(['count','mean']).sort_values('count', ascending=False)
print(new_108)