In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the result of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np
import pandas as pd

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as gobj
# Initialise Plotly's offline notebook mode so that py.iplot() can render inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm; the argument-less call below is the
# alternative that was tried.
py.init_notebook_mode(connected=True)
#py.init_notebook_mode()

In [None]:
# Relative path of the folder that holds the data files (train.csv is read from it).
SHARED_FOLDER = '../ressources/us_census_full'

In [None]:
# Read the training set from the shared data folder into a DataFrame.
train_path = '{}/train.csv'.format(SHARED_FOLDER)
with open(train_path, 'rb') as train_file:
    train_df = pd.read_csv(train_file)

In [None]:
# Show the column labels of the loaded training set.
train_df.columns

In [None]:
# Encode INCOME as a 0/1 integer column. The two label strings are read from
# rows 1 and 56 instead of being spelled out.
# NOTE(review): this assumes those two rows carry the two different labels —
# confirm against the data file.
income_labels = train_df['INCOME']
income_to_bin = {income_labels.iloc[1]: 0, income_labels.iloc[56]: 1}
train_df['INCOMEBIN'] = income_labels.map(income_to_bin).astype(int)

# Test part: income breakdown for a single categorical column

In [None]:
# Break the AHSCOL categories down by income class.
# `sum` is the number of rows per category (most frequent first), `count_1` the
# rows with INCOMEBIN == 1, `count_0` the remainder and `percent` the share of
# INCOMEBIN == 1 rows.
ahscol_totals = train_df['AHSCOL'].value_counts()
# Align on the original labels; categories without any INCOMEBIN == 1 row get 0
# rather than NaN, so count_0 and percent stay well defined.
ahscol_above = (
    train_df.loc[train_df['INCOMEBIN'] == 1, 'AHSCOL']
    .value_counts()
    .reindex(ahscol_totals.index, fill_value=0)
)
values_count = pd.DataFrame({
    'AHSCOL': [str(label) for label in ahscol_totals.index],
    'sum': ahscol_totals.values,
    'count_1': ahscol_above.values,
})
values_count['count_0'] = values_count['sum'] - values_count['count_1']
values_count['percent'] = values_count['count_1'].div(values_count['sum']).multiply(100).round(2)
values_count

In [None]:
# Stacked bar chart of the breakdown table built in the previous cell.
# That table is keyed by AHSCOL, so the same column has to be used here.
plot_col = 'AHSCOL'
values = values_count[[plot_col, 'count_1', 'count_0', 'percent']]
values2 = values_count[[plot_col, 'count_1', 'percent']]
trace1 = gobj.Bar(
    x=values[plot_col],
    y=values['count_1'],
    name='above 50000'
)
trace2 = gobj.Bar(
    x=values[plot_col],
    y=values['count_0'],
    name='below 50000'
)

data = [trace1, trace2]
# One label per category, centred on the top edge of the "above 50000" segment.
annotations = [
    dict(x=xi,
         y=yi,
         showarrow=False,
         text="{}%".format(pi),
         xanchor='center',
         yanchor='center',
         font={'color': 'black'})
    for xi, yi, pi in values2.values
]
layout = gobj.Layout(
    barmode='stack',
    title=plot_col,
    annotations=annotations
)

fig = gobj.Figure(data=data, layout=layout)
py.iplot(fig)

# Plot categorical variables

In [None]:
# Columns treated as categorical; each one is plotted by the loops further down.
categorized_cols = [
    'ACLSWKR', 'ADTIND', 'ADTOCC', 'AHGA', 'AHSCOL', 'AMARITL',
    'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM',
    'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX',
    'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN',
    'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR',
    'VETQVA', 'VETYN', 'YEAR', 'INCOME',
]

In [None]:
def plot_categoric(col):
    """Build a bar chart of how often each value of a categorical column occurs.

    Every bar is labelled with its count; bars that hold more than 10% of the
    rows additionally get their percentage printed in red at mid-height.

    Args:
        col: Name of a column of the module-level `train_df`.

    Returns:
        A `gobj.Figure` that can be passed to `py.iplot`.
    """
    counts = train_df[col].value_counts()
    # Labels are cast to str so numeric codes are shown as categories.
    labels = [str(label) for label in counts.index]
    totals = counts.tolist()
    percents = counts.div(counts.sum()).multiply(100).round(2).tolist()

    # Shrink the count labels as the number of bars grows, but never let the
    # size reach zero or go negative for columns with very many categories.
    font_size = max(6, 20 - (.25 * len(labels)))

    trace = gobj.Bar(x=labels, y=totals)

    # Count printed just above each bar.
    count_labels = [
        dict(x=label,
             y=total,
             showarrow=False,
             font={'size': font_size},
             text="{:,}".format(total),
             xanchor='center',
             yanchor='bottom')
        for label, total in zip(labels, totals)
    ]
    # Percentage printed inside the bar, only where the bar is large enough.
    percent_labels = [
        dict(x=label,
             y=total / 2,
             showarrow=False,
             text="{}%".format(percent),
             xanchor='center',
             yanchor='center',
             font={'color': 'red'})
        for label, total, percent in zip(labels, totals, percents)
        if percent > 10
    ]

    layout = gobj.Layout(title=col,
                         titlefont={'size': 50},
                         yaxis={'title': 'count'},
                         xaxis={'type': 'category'},
                         annotations=count_labels + percent_labels)
    # A plain list of traces replaces the deprecated gobj.Data wrapper.
    figure = gobj.Figure(data=[trace], layout=layout)

    return figure

In [None]:
# Render one count chart per categorical column.
for col in categorized_cols:
    py.iplot(plot_categoric(col))

In [None]:
def plot_categoric_comp(col):
    """Build a stacked bar chart splitting each category of `col` by income class.

    For every category the chart stacks the number of rows with INCOMEBIN == 1
    ("above 50000") and the remaining rows ("below 50000"), and annotates the
    bar with the percentage of INCOMEBIN == 1 rows.

    Args:
        col: Name of a column of the module-level `train_df`.

    Returns:
        A `gobj.Figure` that can be passed to `py.iplot`.
    """
    totals = train_df[col].value_counts()
    # Align on the original labels (not on their str form) so numeric-coded
    # columns match; categories without any INCOMEBIN == 1 row get 0, not NaN.
    above = (
        train_df.loc[train_df['INCOMEBIN'] == 1, col]
        .value_counts()
        .reindex(totals.index, fill_value=0)
    )
    below = totals - above
    percent = above.div(totals).multiply(100).round(2)

    labels = [str(label) for label in totals.index]
    above_counts = above.tolist()
    below_counts = below.tolist()

    trace1 = gobj.Bar(
        x=labels,
        y=above_counts,
        name='above 50000'
    )
    trace2 = gobj.Bar(
        x=labels,
        y=below_counts,
        name='below 50000'
    )

    # One label per category, centred on the top edge of the "above" segment.
    annotations = [
        dict(x=label,
             y=count,
             showarrow=False,
             text="{}%".format(share),
             xanchor='center',
             yanchor='center',
             font={'color': 'black'})
        for label, count, share in zip(labels, above_counts, percent.tolist())
    ]
    layout = gobj.Layout(
        barmode='stack',
        title=col,
        # Same axis type as plot_categoric: numeric-looking labels stay categories.
        xaxis={'type': 'category'},
        annotations=annotations
    )

    figure = gobj.Figure(data=[trace1, trace2], layout=layout)

    return figure

In [None]:
# Render one income-split chart per categorical column.
for col in categorized_cols:
    py.iplot(plot_categoric_comp(col))

In [None]:
# Count how many GRINREG entries hold the ' ?' placeholder.
series = train_df['GRINREG']
series = series[series.eq(' ?')]
count = len(series)
count

# Test for a single discrete variable

In [None]:
#disc_cols = [ 'AAGE', 'AHRSPAY', 'CAPGAIN', 'CAPLOSS', 'DIVVAL',
#             'NOEMP', 'WKSWORK']

In [None]:
# Zero / outlier summary for AHRSPAY.
# 'zero' counts the rows equal to 0; 'outlier' counts the non-zero rows lying
# more than three standard deviations from the mean of the non-zero rows.
# 'percent' is relative to the number of non-null AHRSPAY rows.
series = train_df['AHRSPAY']
total_rows = series.count()
zero_count = int((series == 0).sum())
series = series[series != 0]
outlier_count = int(((series - series.mean()).abs() > 3 * series.std()).sum())
# Build the frame in one go with numeric dtypes instead of filling an empty
# object-dtype frame through chained assignment.
df = pd.DataFrame({
    'type': ['zero', 'outlier'],
    'count': [zero_count, outlier_count],
})
df['percent'] = df['count'].div(total_rows).multiply(100).round(2)
df

In [None]:
# Bar chart of the zero / outlier summary table `df` built in the previous cell.
font_size = 20 - (.25 * len(df['type']))
trace = gobj.Bar(x=df['type'], y=df['count'])
# A plain list of traces replaces the deprecated gobj.Data wrapper.
data_ = [trace]
# Count printed just above each bar.
annotations0 = [
    dict(x=xi,
         y=yi,
         showarrow=False,
         font={'size': font_size},
         text="{:,}".format(yi),
         xanchor='center',
         yanchor='bottom')
    for xi, yi in zip(df['type'], df['count'])
]
# Percentage printed in red at mid-height of each bar.
annotations1 = [
    dict(x=xi,
         y=yi / 2,
         showarrow=False,
         text="{}%".format(pi),
         xanchor='center',
         yanchor='center',
         font={'color': 'red'})
    for xi, yi, pi in zip(df['type'], df['count'], df['percent'])
]
annotations = annotations0 + annotations1
# The table describes AHRSPAY, so name it explicitly instead of reusing whatever
# a previous loop left in `col`.
layout = gobj.Layout(title='AHRSPAY',
                     titlefont={'size': 50},
                     yaxis={'title': 'count'},
                     xaxis={'type': 'category'},
                     annotations=annotations)
figure = gobj.Figure(data=data_, layout=layout)
py.iplot(figure)

# Plot the discrete variables

In [None]:
# Numeric columns handled by the plot_disc_* helpers.
disc_cols = [
    'AAGE',
    'AHRSPAY',
    'CAPGAIN',
    'CAPLOSS',
    'DIVVAL',
    'NOEMP',
    'WKSWORK',
]

In [None]:
def plot_disc_normal(col):
    """Build a probability-density histogram of the non-zero values of `col`.

    Zero values are dropped first. Values lying more than three standard
    deviations from the mean of the remaining data are then excluded. Vertical
    lines mark the 2.5th, 50th and 97.5th percentiles of what is left, and an
    annotation reports the median and mean (after outlier removal) together
    with the min and max (taken before outlier removal).

    Args:
        col: Name of a numeric column of the module-level `train_df`.

    Returns:
        A `gobj.Figure` that can be passed to `py.iplot`.

    Raises:
        ValueError: If no value is left to plot after filtering.
    """
    # Missing values would turn every percentile into NaN.
    series = train_df[col].dropna()
    series = series[series != 0]
    smin, smax = series.min(), series.max()
    # remove outliers for +- three standard deviations.
    series = series[~((series - series.mean()).abs() > 3 * series.std())]
    if series.empty:
        raise ValueError("no non-zero values left to plot for column {!r}".format(col))
    low, median, high = (np.percentile(series, q) for q in (2.5, 50, 97.5))

    trace0 = gobj.Histogram(x=series,
                            histfunc='avg',
                            histnorm='probability density',
                            opacity=.75,
                            marker={'color': '#EB89B5'})

    def _vertical_line(x_pos, color, width):
        # Full-height vertical marker at x_pos (x in data units, y in paper units).
        return {'line': {'color': color, 'dash': 'solid', 'width': width},
                'type': 'line',
                'x0': x_pos, 'x1': x_pos, 'xref': 'x',
                'y0': -0.1, 'y1': 1, 'yref': 'paper'}

    shapes = [_vertical_line(low, '#0099FF', 2),
              _vertical_line(median, '#00999F', 1),
              _vertical_line(high, '#0099FF', 2)]

    annotations = [{'x': low, 'xref': 'x', 'xanchor': 'right',
                    'y': .3, 'yref': 'paper',
                    'text': '2.5%', 'font': {'size': 16},
                    'showarrow': False},

                   {'x': median, 'xref': 'x', 'xanchor': 'center',
                    'y': .2, 'yref': 'paper',
                    'text': '95%<br>median = {0:,.2f}<br>mean = {1:,.2f}<br>min = {2:,}<br>max = {3:,}'
                        .format(median, series.mean(), smin, smax),
                    'showarrow': False,
                    'font': {'size': 20}},

                   {'x': high, 'xref': 'x', 'xanchor': 'left',
                    'y': .3, 'yref': 'paper',
                    'text': '2.5%', 'font': {'size': 16},
                    'showarrow': False},

                   {'x': .5, 'xref': 'paper', 'xanchor': 'center',
                    'y': 1.1, 'yref': 'paper', 'yanchor': 'center',
                    'text': 'Outliers above or below three standard deviations are excluded from the graph, mean and median calculations.',
                    # 'rose' is not a CSS colour name; reuse the histogram colour.
                    'font': {'size': 15, 'color': '#EB89B5'},
                    'showarrow': False}
                   ]

    layout = gobj.Layout(title=col,
                         # Font size must be a number, as in the other plot helpers.
                         titlefont={'size': 50},
                         yaxis={'title': 'Probability/Density'},
                         # 'discrete' is not a valid axis type; the data is numeric.
                         xaxis={'title': col, 'type': 'linear'},
                         shapes=shapes,
                         annotations=annotations)
    # A plain list of traces replaces the deprecated gobj.Data wrapper.
    figure = gobj.Figure(data=[trace0], layout=layout)
    return figure

In [None]:
# Render one density histogram per discrete column.
for col in disc_cols:
    py.iplot(plot_disc_normal(col))

In [None]:
def plot_disc_outlier(col):
    """Build a bar chart of how many rows of `col` are zero or outliers.

    'zero' counts the rows equal to 0. 'outlier' counts the non-zero rows lying
    more than three standard deviations from the mean of the non-zero rows.
    Each bar is labelled with its count and with its percentage of the non-null
    rows of the column.

    Args:
        col: Name of a numeric column of the module-level `train_df`.

    Returns:
        A `gobj.Figure` that can be passed to `py.iplot`.
    """
    series = train_df[col]
    total_rows = series.count()
    zero_count = int((series == 0).sum())
    nonzero = series[series != 0]
    outlier_count = int(((nonzero - nonzero.mean()).abs() > 3 * nonzero.std()).sum())

    # Plain numeric lists avoid the object-dtype frame and the chained
    # assignment that the summary table was previously filled with.
    kinds = ['zero', 'outlier']
    counts = [zero_count, outlier_count]
    # Rounded to two decimals, like the percentages in plot_categoric.
    percents = pd.Series(counts).div(total_rows).multiply(100).round(2).tolist()

    font_size = 20 - (.25 * len(kinds))
    trace = gobj.Bar(x=kinds, y=counts)

    # Count printed just above each bar.
    count_labels = [
        dict(x=kind,
             y=count,
             showarrow=False,
             font={'size': font_size},
             text="{:,}".format(count),
             xanchor='center',
             yanchor='bottom')
        for kind, count in zip(kinds, counts)
    ]
    # Percentage printed in red at mid-height of each bar.
    percent_labels = [
        dict(x=kind,
             y=count / 2,
             showarrow=False,
             text="{}%".format(percent),
             xanchor='center',
             yanchor='center',
             font={'color': 'red'})
        for kind, count, percent in zip(kinds, counts, percents)
    ]

    layout = gobj.Layout(title=col,
                         titlefont={'size': 50},
                         yaxis={'title': 'count'},
                         xaxis={'type': 'category'},
                         annotations=count_labels + percent_labels)
    # A plain list of traces replaces the deprecated gobj.Data wrapper.
    figure = gobj.Figure(data=[trace], layout=layout)
    return figure

In [None]:
# Render one zero/outlier chart per discrete column.
for col in disc_cols:
    py.iplot(plot_disc_outlier(col))