In [1]:
#Load the libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
import plotly.tools as tls
import seaborn as sns
from matplotlib import rcParams
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots
from scipy import stats

init_notebook_mode(connected=True)

%matplotlib inline

In [3]:
# Read the real-estate CSV into `df`. The encoding is passed explicitly
# (presumably the file is not valid UTF-8 — confirm against the data source).
df = pd.read_csv(
    '../input/us-acs-mortgage-equity-loans-rent-statistics/real_estate_db.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,UID,BLOCKID,SUMLEVEL,COUNTYID,STATEID,state,state_ab,city,place,type,...,female_age_mean,female_age_median,female_age_stdev,female_age_sample_weight,female_age_samples,pct_own,married,married_snp,separated,divorced
0,220336,,140,16,2,Alaska,AK,Unalaska,Unalaska City,City,...,32.78177,31.91667,19.31875,440.46429,1894.0,0.25053,0.47388,0.30134,0.03443,0.09802
1,220342,,140,20,2,Alaska,AK,Eagle River,Anchorage,City,...,38.97956,39.66667,20.05513,466.65478,1947.0,0.94989,0.52381,0.01777,0.00782,0.13575
2,220343,,140,20,2,Alaska,AK,Jber,Anchorage,City,...,22.20427,23.16667,13.86575,887.67805,3570.0,0.00759,0.50459,0.06676,0.01,0.01838
3,220345,,140,20,2,Alaska,AK,Anchorage,Point Mackenzie,City,...,37.0075,34.0,22.06347,281.4942,1049.0,0.20247,0.44428,0.05933,0.0,0.21563
4,220347,,140,20,2,Alaska,AK,Anchorage,Anchorage,City,...,34.96611,31.75,20.49887,655.98066,2905.0,0.56936,0.51034,0.08315,0.06731,0.08711


In [5]:
# Remove the UID and BLOCKID columns.
# drop(..., errors='ignore') keeps this cell idempotent: running it again after
# the columns are already gone is a no-op, whereas `del df[...]` raised KeyError.
df = df.drop(columns=['UID', 'BLOCKID'], errors='ignore')

In [6]:
# Preview again to confirm that UID and BLOCKID no longer appear.
df.head()

Unnamed: 0,SUMLEVEL,COUNTYID,STATEID,state,state_ab,city,place,type,primary,zip_code,...,female_age_mean,female_age_median,female_age_stdev,female_age_sample_weight,female_age_samples,pct_own,married,married_snp,separated,divorced
0,140,16,2,Alaska,AK,Unalaska,Unalaska City,City,tract,99685,...,32.78177,31.91667,19.31875,440.46429,1894.0,0.25053,0.47388,0.30134,0.03443,0.09802
1,140,20,2,Alaska,AK,Eagle River,Anchorage,City,tract,99577,...,38.97956,39.66667,20.05513,466.65478,1947.0,0.94989,0.52381,0.01777,0.00782,0.13575
2,140,20,2,Alaska,AK,Jber,Anchorage,City,tract,99505,...,22.20427,23.16667,13.86575,887.67805,3570.0,0.00759,0.50459,0.06676,0.01,0.01838
3,140,20,2,Alaska,AK,Anchorage,Point Mackenzie,City,tract,99501,...,37.0075,34.0,22.06347,281.4942,1049.0,0.20247,0.44428,0.05933,0.0,0.21563
4,140,20,2,Alaska,AK,Anchorage,Anchorage,City,tract,99504,...,34.96611,31.75,20.49887,655.98066,2905.0,0.56936,0.51034,0.08315,0.06731,0.08711


In [7]:
def resumetable(df):
    """Print the frame's shape and return a one-row-per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Name``, ``dtypes``,
        ``Missing`` (null count), ``Uniques`` (distinct non-null values),
        ``First Value`` / ``Second Value`` / ``Third Value`` (the first three
        rows taken by position; NaN when the frame has fewer rows) and
        ``Entropy`` (entropy in bits of the column's normalised value
        frequencies, rounded to 2 decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    # Positional lookup: the sample rows must not depend on the index labels
    # 0/1/2 existing (they do not after filtering, sorting or set_index).
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        summary[label] = df.iloc[position].values if position < len(df) else float('nan')

    # Columns are addressed by position so duplicated column names still yield
    # one Series (and one entropy value) each.
    summary['Entropy'] = [
        round(stats.entropy(df.iloc[:, col].value_counts(normalize=True), base=2), 2)
        for col in range(df.shape[1])
    ]

    return summary

In [8]:
# Show the first 43 rows of the per-column summary.
resumetable(df).head(43)

Dataset Shape: (39030, 78)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,SUMLEVEL,int64,0,1,140,140,140,0.0
1,COUNTYID,int64,0,310,16,20,20,6.58
2,STATEID,int64,0,52,2,2,2,5.1
3,state,object,0,52,Alaska,Alaska,Alaska,5.1
4,state_ab,object,0,52,AK,AK,AK,5.1
5,city,object,0,8172,Unalaska,Eagle River,Jber,11.5
6,place,object,0,11856,Unalaska City,Anchorage,Anchorage,12.53
7,type,object,0,6,City,City,City,1.89
8,primary,object,0,1,tract,tract,tract,0.0
9,zip_code,int64,0,15098,99685,99577,99505,13.47


**Type Feature**


In [18]:
# Donut chart of the `type` column.
# NOTE: despite its name, `percentual_types` holds the row count per type
# (shown as slice text); `types` holds each type's share of rows in percent.
type_counts = df["type"].value_counts()
percentual_types = round(type_counts, 2)
types = round(type_counts / len(df["type"]) * 100, 2)

labels = list(types.index)
values = list(types.values)

trace1 = go.Pie(
    labels=labels,
    values=values,
    text=percentual_types.values,
    hole=0.3,
    marker=dict(colors=['red']),
)

layout = go.Layout(
    title='Distribuition of Types',
    legend=dict(orientation="h"),
    annotations=[dict(text='Types', font_size=15, showarrow=False)],
)

fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

**Interactive**

In [19]:
# Frequency tables for the main categorical columns, reused by the plots below.
state_count = df["state"].value_counts()
city_count = df["city"].value_counts()
place_count = df["place"].value_counts()
primary_count = df["primary"].value_counts()

In [21]:
def _top_bar(counts, name, visible, top_n=20):
    """Horizontal bar chart of the first `top_n` entries of `counts`.

    The bars are reversed so the first entry ends up at the top, and each bar
    is coloured by its own count. (The state and place traces were previously
    coloured by the city counts, a copy-paste slip.)
    """
    top = counts[:top_n]
    return go.Bar(
        x=top.values[::-1],
        y=top.index[::-1],
        orientation='h',
        visible=visible,
        name=name,
        marker=dict(color=top.values[::-1], colorscale='Viridis', reversescale=True),
    )


trace1 = _top_bar(state_count, 'Top 20 States', visible=True)
trace2 = _top_bar(city_count, 'TOP 20 Citys', visible=False)
trace3 = go.Histogram(y=sorted(df['type'], reverse=True), histnorm='percent',
                      orientation='h', visible=False, name='Type Count')
trace4 = _top_bar(place_count, 'Top 20 Place', visible=False)

data = [trace1, trace2, trace3, trace4]

# (dropdown label, figure title) for each trace, in the same order as `data`.
menu_entries = [
    ('State Count', 'TOP 20 State Count'),
    ('City Count', 'TOP 20 City Count'),
    ('Type Count', 'Type Counts'),
    ('Place Count', ' Top 20 Place Count'),
]

# Each button makes exactly one trace visible and swaps the title.
buttons = [
    dict(label=label,
         method='update',
         args=[{'visible': [other == pos for other in range(len(data))]},
               {'title': title}])
    for pos, (label, title) in enumerate(menu_entries)
]

updatemenus = [dict(active=-1, x=-0.15, buttons=buttons)]

layout = dict(title='The count of the principal Categorical Features <br>(Select from Dropdown)',
              showlegend=False,
              updatemenus=updatemenus)

fig = dict(data=data, layout=layout)

iplot(fig)

In [22]:
# Natural log of ALand divided by one million.
# NOTE(review): the column name says "div_1M" but the stored value is the log
# of that ratio, and a row with ALand == 0 would come out as -inf — confirm intent.
df['ALand_div_1M'] = np.log(df['ALand'].div(1000000))

In [25]:
# Rows belonging to the 15 most frequent cities, selected once and shared by
# every trace instead of re-filtering the full frame for each x and y.
top15_rows = df[df.city.isin(city_count[:15].index.values)]

# (column plotted, dropdown label, figure title): one entry per box trace.
# NOTE(review): the last entry plots hc_mortgage_mean although the figure is
# titled "Medians" and the other entries use *_median columns — confirm whether
# hc_mortgage_median was intended.
city_metrics = [
    ('rent_median', 'City Rent Boxplot', 'TOP 15 Citys - Rent Median'),
    ('family_median', 'City Family Boxplot', 'TOP 15 Citys - Family Income Median'),
    ('hi_median', 'City House Inc', 'TOP 15 Citys - House income Median'),
    ('hc_mortgage_mean', 'City HC Mortage', 'TOP 15 Citys - Home Cost Mortage'),
]

# Only the first trace starts visible; the dropdown switches between them.
data = [
    go.Box(x=top15_rows['city'], y=top15_rows[column],
           showlegend=False, visible=(pos == 0))
    for pos, (column, _label, _title) in enumerate(city_metrics)
]
trace1, trace2, trace3, trace4 = data

# Each button makes exactly one trace visible and swaps the title.
buttons = [
    dict(label=label,
         method='update',
         args=[{'visible': [other == pos for other in range(len(city_metrics))]},
               {'title': title}])
    for pos, (_column, label, title) in enumerate(city_metrics)
]

updatemenus = [dict(active=-1, x=-0.15, buttons=buttons)]

layout = dict(title='Citys BoxPlots of Medians <br>(Select metrics from Dropdown)',
              showlegend=False,
              updatemenus=updatemenus)

fig = dict(data=data, layout=layout)

iplot(fig, filename='dropdown')

**Some box plots by city**


In [26]:
# Box plots per city (15 most frequent cities) with a dropdown to pick the metric.
# The city filter is evaluated once here rather than once per x/y argument.
top_cities = city_count[:15].index.values
city_rows = df[df.city.isin(top_cities)]


def _city_box(column, visible):
    """Box plot of `column` grouped by city, over the pre-filtered `city_rows`."""
    return go.Box(x=city_rows['city'], y=city_rows[column],
                  showlegend=False, visible=visible)


def _show_only(position, label, title, n_traces=4):
    """Dropdown button that shows only the trace at `position` and sets `title`."""
    mask = [False] * n_traces
    mask[position] = True
    return dict(label=label, method='update',
                args=[{'visible': mask}, {'title': title}])


trace1 = _city_box('rent_median', visible=True)
trace2 = _city_box('family_median', visible=False)
trace3 = _city_box('hi_median', visible=False)
# NOTE(review): this trace uses the *mean* mortgage cost while the figure title
# says "Medians" — confirm whether hc_mortgage_median was intended.
trace4 = _city_box('hc_mortgage_mean', visible=False)

data = [trace1, trace2, trace3, trace4]

updatemenus = [
    dict(
        active=-1,
        x=-0.15,
        buttons=[
            _show_only(0, 'City Rent Boxplot', 'TOP 15 Citys - Rent Median'),
            _show_only(1, 'City Family Boxplot', 'TOP 15 Citys - Family Income Median'),
            _show_only(2, 'City House Inc', 'TOP 15 Citys - House income Median'),
            _show_only(3, 'City HC Mortage', 'TOP 15 Citys - Home Cost Mortage'),
        ],
    )
]

layout = dict(title='Citys BoxPlots of Medians <br>(Select metrics from Dropdown)',
              showlegend=False,
              updatemenus=updatemenus)

fig = dict(data=data, layout=layout)

iplot(fig, filename='dropdown')

**Another approach to visualizing the same data**


In [27]:
city_count = df.city.value_counts()

# Row subsets are computed once and reused by every trace.
top10_rows = df[df.city.isin(city_count[:10].index.values)]
top20_rows = df[df.city.isin(city_count[:20].index.values)]


def _log_sample(column, n=5000, seed=42):
    """Natural log of `df[column]`, reduced to a seeded random sample.

    The fixed seed makes the histogram identical on every run; the sample size
    is capped at the number of rows so a small frame does not raise.
    """
    logged = np.log(df[column])
    return logged.sample(min(n, len(logged)), random_state=seed)


# Bottom row: per-city box plots for the 10 most frequent cities
trace0 = go.Box(x=top10_rows['city'], y=top10_rows['rent_median'], showlegend=False)
trace1 = go.Box(x=top10_rows['city'], y=top10_rows['family_median'], showlegend=False)
trace2 = go.Box(x=top10_rows['city'], y=top10_rows['hc_mortgage_median'], showlegend=False)

# Top-left: share of rows per city for the 20 most frequent cities
trace3 = go.Histogram(x=top20_rows['city'], histnorm='percent', showlegend=False)

# Top-right: overlaid log-scale distributions of the three medians
trace4 = go.Histogram(x=_log_sample('family_median'), histnorm='percent',
                      autobinx=True, showlegend=True, name='Family')
trace5 = go.Histogram(x=_log_sample('hc_mortgage_median'), histnorm='percent',
                      autobinx=True, showlegend=True, name='HC mort')
trace6 = go.Histogram(x=_log_sample('rent_median'), histnorm='percent',
                      autobinx=True, showlegend=True, name="Rent")

# Creating the grid. Titles are assigned to the non-empty cells in row-major
# order, so the bottom-row titles must follow the traces placed there:
# (2,1) rent, (2,2) HC mortgage, (2,3) family.
fig = make_subplots(rows=2, cols=3, specs=[[{'colspan': 2}, None, {}], [{}, {}, {}]],
                    subplot_titles=("Citys Count",
                                    "Medians Distribuition",
                                    "Rent Median",
                                    "HC Morttage Median",
                                    "Family Median"))

# Placing the traces (order kept so trace numbering/colours stay the same)
fig.add_trace(trace0, row=2, col=1)
fig.add_trace(trace1, row=2, col=3)
fig.add_trace(trace2, row=2, col=2)
fig.add_trace(trace3, row=1, col=1)
fig.add_trace(trace4, row=1, col=3)
fig.add_trace(trace5, row=1, col=3)
fig.add_trace(trace6, row=1, col=3)

fig['layout'].update(showlegend=True, title="Some Top Citys Distribuitions")

iplot(fig)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



**Taking a look at box plots of some of these values**


In [28]:
# Row subsets are computed once and reused by every trace.
top10_rows = df[df.city.isin(city_count[:10].index.values)]
top20_rows = df[df.city.isin(city_count[:20].index.values)]

# Bottom row: per-city box plots for the 10 most frequent cities
trace0 = go.Box(x=top10_rows['city'], y=top10_rows['rent_median'], showlegend=False)
trace1 = go.Box(x=top10_rows['city'], y=top10_rows['family_median'], showlegend=False)
trace2 = go.Box(x=top10_rows['city'], y=top10_rows['hc_mortgage_median'], showlegend=False)

# Top row (full width): share of rows per city for the 20 most frequent cities
trace3 = go.Histogram(x=top20_rows['city'], histnorm='percent', showlegend=False)

# Creating the grid with plotly.subplots.make_subplots (tls.make_subplots is deprecated)
fig = make_subplots(rows=2, cols=3, specs=[[{'colspan': 3}, None, None], [{}, {}, {}]],
                    subplot_titles=("City Count",
                                    "Rent Median by City",
                                    "HC Morttage Median by City",
                                    "Family Median by City"))

# Placing the traces
fig.add_trace(trace0, row=2, col=1)
fig.add_trace(trace1, row=2, col=3)
fig.add_trace(trace2, row=2, col=2)
fig.add_trace(trace3, row=1, col=1)

fig['layout'].update(showlegend=True, title="Some City Distribuitions")
iplot(fig)

In [29]:
# Top row: rent and family medians per `type`, with the type frequencies between them
trace0 = go.Box(x=df['type'], y=df['rent_median'], showlegend=False)
trace1 = go.Box(x=df['type'], y=df['family_median'], showlegend=False)
trace2 = go.Histogram(x=df['type'], histnorm="percent", showlegend=False)

# Bottom row (full width): rent median against family median, one marker per row
trace3 = go.Scatter(x=df['rent_median'], y=df['family_median'],
                    showlegend=False, mode='markers')

# Creating the grid with plotly.subplots.make_subplots (tls.make_subplots is deprecated)
fig = make_subplots(rows=2, cols=3, specs=[[{}, {}, {}], [{'colspan': 3}, None, None]],
                    subplot_titles=("Rent Median by Type",
                                    "Type Count",
                                    "Family Median by Type",
                                    "Rent Median x Family Median"))

# Placing the traces
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=3)
fig.add_trace(trace2, row=1, col=2)
fig.add_trace(trace3, row=2, col=1)

fig['layout'].update(showlegend=True,
                     title="Some Type Distribuitions")

iplot(fig)

In [30]:
# Row subsets are computed once and reused by every trace.
top10_state_rows = df[df.state.isin(state_count[:10].index.values)]
top20_state_rows = df[df.state.isin(state_count[:20].index.values)]

# Top-left: rent median per state (10 most frequent states)
trace0 = go.Box(
    x=top10_state_rows['state'],
    y=top10_state_rows['rent_median'],
    name="Top 10 States", showlegend=False
)

# Top-right: mortgage cost median per state (10 most frequent states).
# The trace name previously read "Top 15 Sucessful", which does not describe
# this data; it now matches the rent trace beside it.
trace1 = go.Box(
    x=top10_state_rows['state'],
    y=top10_state_rows['hc_mortgage_median'],
    name="Top 10 States", showlegend=False
)

# Bottom row (full width): share of rows per state (20 most frequent states)
trace2 = go.Histogram(
    x=top20_state_rows['state'],
    histnorm='percent', name="Top 20 States's", showlegend=False
)

# Creating the grid with plotly.subplots.make_subplots (tls.make_subplots is deprecated)
fig = make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                    subplot_titles=('Rent Median TOP 10 States',
                                    'Mortage Median TOP 10 States',
                                    "Top 20 Most Frequent States"))

# Placing the traces
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)
fig.add_trace(trace2, row=2, col=1)

fig['layout'].update(showlegend=True, title="Top Frequency States")

iplot(fig)