# Step 1: Data Analysis --- Getting to know some basic information about the dataset

In [1]:
import numpy as np
import pandas as pd

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as gobj
py.init_notebook_mode(connected=True)

In [2]:
# Folder (relative path) that the census CSV files are read from.
SHARED_FOLDER = '../ressources/us_census_full'

In [3]:
# Load the training split; pandas opens and closes the file itself when
# it is handed a path instead of an already opened handle.
train_df = pd.read_csv('{}/train.csv'.format(SHARED_FOLDER))

In [6]:
# Show the column labels of the loaded training frame.
train_df.columns
#train_df.describe()

array([' Self-employed-not incorporated', ' Not in universe', ' Private',
       ' Local government', ' Federal government',
       ' Self-employed-incorporated', ' State government', ' Never worked',
       ' Without pay'], dtype=object)

In [7]:
# Encode the INCOME label as a 0/1 integer column named INCOMEBIN.
# The two label strings are taken from the data itself: the label found in
# row 1 becomes 0 and the label found in row 56 becomes 1.
# NOTE(review): this relies on rows 1 and 56 holding the two different
# income classes -- confirm against the CSV before reusing on other data.
income_labels = train_df['INCOME']
label_to_bin = {income_labels.iloc[1]: 0, income_labels.iloc[56]: 1}
train_df['INCOMEBIN'] = income_labels.map(label_to_bin).astype(int)

# Plot categorical variables

In [8]:
# Columns treated as categorical when plotting value distributions.
categorized_cols = [
    'ACLSWKR', 'ADTIND', 'ADTOCC', 'AHGA', 'AHSCOL', 'AMARITL',
    'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM',
    'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX',
    'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN',
    'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR',
    'VETQVA', 'VETYN', 'YEAR', 'INCOME',
]

### First, we explore the categorical variables by plotting the distribution of the values of each variable, so we can easily read off the percentage of the dataset taken by each value. Missing data is represented by "?", so its percentage can also be read from the graphs. Missing data exists only in the following variables: GRINST, MIGMTR1, MIGMTR3, MIGMTR4, MIGSUN, PEFNTVTY, PEMNTVTY, PENATVTY. For example, the missing rate of MIGMTR1 is 49.97%, which is rather large for the dataset.

In [9]:
def plot_categoric(col):
    """Build a bar chart of how often each value of a column occurs.

    Reads the module-level ``train_df``. Every bar is labelled with its raw
    count; bars that account for more than 10% of the counted rows also get
    their percentage printed in red at half the bar height.

    Args:
        col: Name of the ``train_df`` column to summarise.

    Returns:
        A plotly ``Figure`` with one bar trace and the annotations above.
    """
    counts = train_df[col].value_counts()
    # Labels are stringified so the category axis treats numeric codes as names.
    labels = [str(value) for value in counts.index]
    totals = counts.tolist()
    percents = counts.div(counts.sum()).multiply(100).round(2).tolist()

    # Shrink the count labels as the number of bars grows, but never below 1:
    # plotly rejects font sizes smaller than that.
    font_size = max(20 - (.25 * len(labels)), 1)

    trace = gobj.Bar(x=labels, y=totals)

    # Raw count sitting on top of every bar.
    count_labels = [dict(x=label,
                         y=total,
                         showarrow=False,
                         font={'size': font_size},
                         text="{:,}".format(total),
                         xanchor='center',
                         yanchor='bottom')
                    for label, total in zip(labels, totals)]
    # Percentage in the middle of the bar, only where the bar is big enough.
    percent_labels = [dict(x=label,
                           y=total / 2,
                           showarrow=False,
                           text="{}%".format(percent),
                           xanchor='center',
                           yanchor='center',
                           font={'color': 'red'})
                      for label, total, percent in zip(labels, totals, percents)
                      if percent > 10]

    layout = gobj.Layout(title=col,
                         titlefont={'size': 50},
                         yaxis={'title': 'count'},
                         xaxis={'type': 'category'},
                         annotations=count_labels + percent_labels)
    # A plain list of traces replaces the deprecated gobj.Data wrapper.
    return gobj.Figure(data=[trace], layout=layout)

In [10]:
# Build and display one value-distribution chart per categorical column.
for col in categorized_cols:
    figure = plot_categoric(col)
    py.iplot(figure)

### I also want to know the relationship between income level and the value of each variable. For example, we can see that 34.73% of the self-employed-incorporated are in the 50000+ income level. The private sector, which employs the most people, has the largest number of people earning more than 50K per year. However, in terms of proportion, the self-employed come out on top.

In [11]:
def plot_categoric_comp(col):
    """Build a stacked bar chart splitting each value of a column by income bin.

    Reads the module-level ``train_df`` and its ``INCOMEBIN`` column. For
    every distinct value of ``col`` the bar is split into rows with
    ``INCOMEBIN == 1`` ("above 50000") and the remaining rows
    ("below 50000"). The share of ``INCOMEBIN == 1`` rows is printed as a
    percentage at the top of the "above 50000" segment.

    Args:
        col: Name of the ``train_df`` column to break down.

    Returns:
        A plotly ``Figure`` with two stacked bar traces and one percentage
        annotation per bar.
    """
    totals = train_df[col].value_counts()
    high_income = train_df.loc[train_df['INCOMEBIN'] == 1, col].value_counts()

    # Align the high-income counts on the full list of values. Values that
    # never occur with INCOMEBIN == 1 get an explicit 0 instead of NaN, and
    # no merge on a stringified key is needed, so numeric columns work too.
    count_1 = high_income.reindex(totals.index, fill_value=0)
    count_0 = totals - count_1
    percent = count_1.div(totals).multiply(100).round(2)

    labels = [str(value) for value in totals.index]
    above = count_1.tolist()
    below = count_0.tolist()

    trace1 = gobj.Bar(
        x=labels,
        y=above,
        name='above 50000'
    )
    trace2 = gobj.Bar(
        x=labels,
        y=below,
        name='below 50000'
    )

    # The percentage sits at the boundary between the two stacked segments.
    annotations = [dict(x=label,
                        y=high,
                        showarrow=False,
                        text="{}%".format(share),
                        xanchor='center',
                        yanchor='center',
                        font={'color': 'black'})
                   for label, high, share in zip(labels, above, percent.tolist())]
    layout = gobj.Layout(
        barmode='stack',
        title=col,
        annotations=annotations
    )

    return gobj.Figure(data=[trace1, trace2], layout=layout)

In [13]:
# Columns used for the income-split charts. This rebinds categorized_cols
# to a shorter list than the one used for the plain distribution charts.
categorized_cols = [
    'ACLSWKR', 'AHGA', 'AHSCOL', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE',
    'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT',
    'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3',
    'MIGMTR4', 'MIGSAME', 'MIGSUN', 'PARENT', 'PEFNTVTY', 'PEMNTVTY',
    'PENATVTY', 'PRCITSHP', 'VETQVA',
]

# Build and display one stacked income chart per listed column.
for col in categorized_cols:
    figure = plot_categoric_comp(col)
    py.iplot(figure)

# Plot the discrete variables

### In this part, I want to plot two different graphs: one for the normal data (excluding zeros and outliers) and the other for the zeros and outliers.

In [14]:
# Numeric columns plotted as histograms and zero/outlier counts.
disc_cols = [
    'AAGE',
    'AHRSPAY',
    'CAPGAIN',
    'CAPLOSS',
    'DIVVAL',
    'NOEMP',
    'WKSWORK',
]

In [15]:
def plot_disc_normal(col):
    """Build a density histogram of a numeric column without zeros and outliers.

    Reads the module-level ``train_df``. Zeros are dropped first; values
    further than three standard deviations from the mean of the remaining
    data are then dropped as outliers. Vertical lines mark the 2.5th, 50th
    and 97.5th percentiles of what is left. The reported min and max are
    taken after removing zeros but before removing outliers; the reported
    mean and median are taken after both steps.

    Args:
        col: Name of the numeric ``train_df`` column to plot.

    Returns:
        A plotly ``Figure`` with one histogram trace, three percentile lines
        and their annotations.
    """
    series = train_df[col]
    series = series[series != 0]
    smin, smax = series.min(), series.max()
    # remove outliers for +- three standard deviations.
    series = series[~((series - series.mean()).abs() > 3 * series.std())]
    # One vectorised call; plain floats keep the figure free of numpy scalars.
    low, median, high = (float(p) for p in np.percentile(series, [2.5, 50, 97.5]))

    trace0 = gobj.Histogram(x=series,
                            histfunc='avg',
                            histnorm='probability density',
                            opacity=.75,
                            marker={'color': '#EB89B5'})

    def _vline(x, color, width):
        # Vertical line at data coordinate x, spanning the plot area in
        # paper coordinates (starting slightly below the x axis).
        return {'line': {'color': color, 'dash': 'solid', 'width': width},
                'type': 'line',
                'x0': x, 'x1': x, 'xref': 'x',
                'y0': -0.1, 'y1': 1, 'yref': 'paper'}

    shapes = [_vline(low, '#0099FF', 2),
              _vline(median, '#00999F', 1),
              _vline(high, '#0099FF', 2)]

    annotations = [{'x': low, 'xref': 'x', 'xanchor': 'right',
                    'y': .3, 'yref': 'paper',
                    'text': '2.5%', 'font': {'size': 16},
                    'showarrow': False},

                   {'x': median, 'xref': 'x', 'xanchor': 'center',
                    'y': .2, 'yref': 'paper',
                    'text': '95%<br>median = {0:,.2f}<br>mean = {1:,.2f}<br>min = {2:,}<br>max = {3:,}'
                        .format(median, series.mean(), smin, smax),
                    'showarrow': False,
                    'font': {'size': 20}},

                   {'x': high, 'xref': 'x', 'xanchor': 'left',
                    'y': .3, 'yref': 'paper',
                    'text': '2.5%', 'font': {'size': 16},
                    'showarrow': False},

                   {'x': .5, 'xref': 'paper', 'xanchor': 'center',
                    'y': 1.1, 'yref': 'paper', 'yanchor': 'center',
                    'text': 'Outliers above or below three standard deviations are excluded from the graph, mean and median calculations.',
                    # 'rose' is not a CSS colour name; use the hex code for rose.
                    'font': {'size': 15, 'color': '#FF007F'},
                    'showarrow': False}
                   ]

    layout = gobj.Layout(title=col,
                         # Font size must be a number, not the string '50'.
                         titlefont={'size': 50},
                         yaxis={'title': 'Probability/Density'},
                         # 'discrete' is not a plotly axis type; the data is numeric.
                         xaxis={'title': col, 'type': 'linear'},
                         shapes=shapes,
                         annotations=annotations)
    # A plain list of traces replaces the deprecated gobj.Data wrapper.
    return gobj.Figure(data=[trace0], layout=layout)

In [16]:
# Build and display one trimmed histogram per numeric column.
for col in disc_cols:
    figure = plot_disc_normal(col)
    py.iplot(figure)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [17]:
def plot_disc_outlier(col):
    """Build a bar chart counting the zeros and outliers of a numeric column.

    Reads the module-level ``train_df``. "zero" is the number of rows equal
    to 0. "outlier" is the number of non-zero rows further than three
    standard deviations from the mean of the non-zero rows. Each bar shows
    its raw count on top and, in red at half height, its percentage of the
    column's non-null row count (rounded to two decimals).

    Args:
        col: Name of the numeric ``train_df`` column to inspect.

    Returns:
        A plotly ``Figure`` with a two-bar trace and its annotations.
    """
    series = train_df[col]
    total = int(series.count())

    zero_count = int((series == 0).sum())
    non_zero = series[series != 0]
    deviation = (non_zero - non_zero.mean()).abs()
    outlier_count = int((deviation > 3 * non_zero.std()).sum())

    # Plain lists instead of filling an empty frame through chained
    # assignment, which pandas no longer guarantees to write through.
    labels = ['zero', 'outlier']
    counts = [zero_count, outlier_count]
    # Guard the division so an all-null column yields 0% instead of an error.
    percents = [round(100.0 * n / total, 2) if total else 0.0 for n in counts]

    font_size = 20 - (.25 * len(labels))
    trace = gobj.Bar(x=labels, y=counts)

    # Raw count sitting on top of every bar.
    count_labels = [dict(x=label,
                         y=count,
                         showarrow=False,
                         font={'size': font_size},
                         text="{:,}".format(count),
                         xanchor='center',
                         yanchor='bottom')
                    for label, count in zip(labels, counts)]
    # Percentage in the middle of every bar.
    percent_labels = [dict(x=label,
                           y=count / 2,
                           showarrow=False,
                           text="{}%".format(percent),
                           xanchor='center',
                           yanchor='center',
                           font={'color': 'red'})
                      for label, count, percent in zip(labels, counts, percents)]

    layout = gobj.Layout(title=col,
                         titlefont={'size': 50},
                         yaxis={'title': 'count'},
                         xaxis={'type': 'category'},
                         annotations=count_labels + percent_labels)
    # A plain list of traces replaces the deprecated gobj.Data wrapper.
    return gobj.Figure(data=[trace], layout=layout)

In [18]:
# Build and display one zero/outlier count chart per numeric column.
for col in disc_cols:
    figure = plot_disc_outlier(col)
    py.iplot(figure)