In [117]:
#%%
import numpy as np
import pandas as pd
import fastparquet
import umap
import holoviews as hv
hv.extension('bokeh')

from code.dfmaxreducer import Maxreducer
reducer = Maxreducer()


#%% start with previous apps file
hc_prevs = pd.read_csv('./input/raw/previous_application.csv', index_col='SK_ID_PREV')
hc_prevs.sort_index(inplace=True)

hc_prevs.dtypes
hc_prevs.isnull().sum(axis=0)
hc_prevs.shape


#%% check unique values
nunqs = hc_prevs.nunique()
# unqs = hc_prevs.loc[:, nunqs < 10]
unqs = hc_prevs.select_dtypes(include='object')

collist = []
unqlist = []
for col in unqs:
    collist.append(col)
    unqlist.append(unqs[col].unique())
unq_df = pd.DataFrame({'feature':collist, 'values':unqlist,
    'types':unqs.nunique().values})
# unq_df.to_csv('./categories.csv', index=False)

unq_df


#%% treat nas and infs

# hc_prevs.replace(365243, np.NaN, inplace=True)
# hc_prevs.SELLERPLACE_AREA.replace(4000000, np.NaN, inplace=True)

# drop the bad rows
hc_prevs.dropna(subset=['SK_ID_CURR'], inplace=True)

# count missings
hc_prevs['nulls'] = hc_prevs.isnull().sum(axis=1)

# make flag for isnulls
hc_prevs['AAFlag'] = np.where(hc_prevs.AMT_ANNUITY.isnull(), 1, 0)
hc_prevs['ADPFlag'] = np.where(hc_prevs.AMT_DOWN_PAYMENT.isnull(), 1, 0)
hc_prevs['AGPFlag'] = np.where(hc_prevs.AMT_GOODS_PRICE.isnull(), 1, 0)
hc_prevs['RIPFlag'] = np.where(hc_prevs.RATE_INTEREST_PRIMARY.isnull(), 1, 0)
hc_prevs['DFDFlag'] = np.where(hc_prevs.DAYS_FIRST_DRAWING.isnull(), 1, 0)

# replace specific values
hc_prevs['AMT_CREDIT'].fillna(value=hc_prevs['AMT_APPLICATION'], inplace=True)
fillvalues = {
              'RATE_DOWN_PAYMENT': 2,
              'RATE_INTEREST_PRIMARY': 2,
              'RATE_INTEREST_PRIVILEGED': 2,
              'PRODUCT_COMBINATION': 'XXX',
              'NAME_TYPE_SUITE': 'XXX',
              'NFLAG_INSURED_ON_APPROVAL': 5
              } 
hc_prevs.fillna(value=fillvalues, inplace=True)
hc_prevs.fillna(0, inplace=True)



#%% check work
hc_prevs.head()
hc_prevs.isnull().sum(axis=0)
hc_prevs.dtypes


#%% convert types and reduce
hc_prevs = reducer.reduce(hc_prevs, verbose=True)

cat_cols = hc_prevs.select_dtypes(['category']).columns
hc_prevs[cat_cols] = hc_prevs[cat_cols].apply(lambda x: x.cat.codes)


convert SK_ID_CURR to <class 'numpy.uint32'>
convert AMT_ANNUITY to <class 'numpy.float32'>
convert AMT_APPLICATION to <class 'numpy.float32'>
convert NAME_CONTRACT_TYPE to category
convert AMT_CREDIT to <class 'numpy.float32'>
convert AMT_DOWN_PAYMENT to <class 'numpy.float32'>
convert AMT_GOODS_PRICE to <class 'numpy.float32'>
convert HOUR_APPR_PROCESS_START to <class 'numpy.uint8'>
convert WEEKDAY_APPR_PROCESS_START to category
convert NFLAG_LAST_APPL_IN_DAY to <class 'numpy.uint8'>
convert FLAG_LAST_APPL_PER_CONTRACT to category
convert RATE_DOWN_PAYMENT to <class 'numpy.float16'>
convert RATE_INTEREST_PRIMARY to <class 'numpy.float16'>
convert RATE_INTEREST_PRIVILEGED to <class 'numpy.float16'>
convert NAME_CASH_LOAN_PURPOSE to category
convert NAME_CONTRACT_STATUS to category
convert DAYS_DECISION to <class 'numpy.int16'>
convert NAME_PAYMENT_TYPE to category
convert CODE_REJECT_REASON to category
convert NAME_TYPE_SUITE to category
convert NAME_CLIENT_TYPE to category
convert NA

In [118]:
apps = pd.read_csv('./input/raw/application_train.csv', usecols = ['SK_ID_CURR', 'TARGET'],
     index_col='SK_ID_CURR')
apps.sort_index(inplace=True)

In [119]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

tfr = MinMaxScaler()

#%% umap it
print(hc_prevs.shape)
hc_samp = hc_prevs.sample(frac=0.005, random_state = 500)
hc_array = hc_samp.drop('SK_ID_CURR', axis=1).values
hc_array = tfr.fit_transform(hc_array)
print(hc_array.shape)

ump = umap.UMAP(n_neighbors=30, verbose=True)
embs = ump.fit_transform(hc_array)


(1670214, 42)
(8351, 41)
UMAP(n_neighbors=30, n_components=2, metric='euclidean',  gamma=1.0, n_epochs=None, alpha=1.0, init='spectral', spread=1.0, min_dist=0.1, a=None, b=None, random_state=None, metric_kwds={}, verbose=True)
Construct fuzzy simplicial set
	 0  /  13
	 1  /  13
	 2  /  13
Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs


In [125]:


# add colors

# plot = hc_samp.merge(apps, how='left', left_on='SK_ID_CURR', right_index=True)
plot = pd.merge(hc_samp, apps, how='left', on='SK_ID_CURR')

plot['dimx'] = embs[:, 0]
plot['dimy'] = embs[:, 1]


In [126]:
plot.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,...,NFLAG_INSURED_ON_APPROVAL,nulls,AAFlag,ADPFlag,AGPFlag,RIPFlag,DFDFlag,TARGET,dimx,dimy
0,211273,1,5300.504883,43276.5,27252.0,17311.5,43276.5,4,5,1,...,1.0,2,0,0,0,1,0,0.0,5.456411,-6.960082
1,231251,1,4454.325195,21145.5,22194.0,0.0,21145.5,6,13,1,...,1.0,2,0,0,0,1,0,0.0,10.23764,-3.605954
2,337107,1,6008.535156,27355.5,32094.0,0.0,27355.5,0,13,1,...,0.0,2,0,0,0,1,0,0.0,12.7904,-2.622823
3,162070,1,26716.095703,240556.5,253786.5,0.0,240556.5,6,5,1,...,5.0,9,0,0,0,1,1,0.0,-1.340197,14.572921
4,148747,0,33634.035156,990000.0,1104997.5,0.0,990000.0,1,13,1,...,5.0,10,0,1,0,1,1,0.0,-15.165242,6.734215


In [127]:

%%opts Scatter [width=700 height=500]
%%opts Scatter (color=Cycle('Category20'), size=5)


# df = 'plot'
plot.fillna(2, inplace=True)
plot = plot[plot['TARGET'] != 2]
key_dimensions   = [('dimx', 'X'), ('TARGET', 'default')]
value_dimensions = [('dimy', 'Y')]
coords = hv.Table(plot, key_dimensions, value_dimensions)


emb_scatter = coords.to.scatter('X', 'Y')
emb_scatter.overlay('default')



In [18]:
hv.help(hv.Scatter)

Scatter

Online example: http://holoviews.org/reference/elements/bokeh/Scatter.html

[1;35m-------------
Style Options
-------------[0m

	alpha, cmap, color, fill_alpha, fill_color, hover_alpha, hover_color, hover_fill_alpha, hover_fill_color, hover_line_alpha, hover_line_color, line_alpha, line_cap, line_color, line_dash, line_join, line_width, marker, muted_alpha, muted_color, muted_fill_alpha, muted_fill_color, muted_line_alpha, muted_line_color, nonselection_alpha, nonselection_color, nonselection_fill_alpha, nonselection_fill_color, nonselection_line_alpha, nonselection_line_color, palette, selection_alpha, selection_color, selection_fill_alpha, selection_fill_color, selection_line_alpha, selection_line_color, size

(Consult bokeh's documentation for more information.)

[1;35m------------
Plot Options
------------[0m

The plot options are the parameters of the plotting class:

[1;32mParameters of 'PointPlot'
[0m
[1;31mParameters changed from their default values are marked 