In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the housing dataset from "housing.csv" (path is relative to the working directory).
df = pd.read_csv("housing.csv")

In [3]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(df.shape)
df.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Summary statistics for the numeric columns. In the output, total_bedrooms has a
# lower count than the other columns, i.e. it contains missing values.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


# Using Geohashes for Analysis

In [5]:
import geohash_hilbert as gh
# Encode every (longitude, latitude) pair as a geohash of precision 4.
df["geohash"] = [
    gh.encode(float(lon), float(lat), precision=4)
    for lon, lat in zip(df["longitude"], df["latitude"])
]

In [6]:
# Average every numeric column per geohash cell. numeric_only=True leaves out the
# text column (ocean_proximity); without it, pandas >= 2.0 raises a TypeError when
# it tries to average strings.
df1 = df.groupby("geohash", as_index=False).mean(numeric_only=True)

In [7]:
# For each geohash cell, keep the first coordinate ring of the geometry that
# gh.rectangle returns; it is used later as the polygon outline of the cell.
df1["rectangle"] = [
    gh.rectangle(code)["geometry"]["coordinates"][0]
    for code in df1["geohash"]
]

In [8]:
def df_to_geojson(df, properties):
    """Convert each row of ``df`` into its own single-feature GeoJSON FeatureCollection.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"rectangle"`` column (a sequence of coordinate pairs per
        row), a ``"population"`` column, and every column named in
        ``properties[:-1]``.
    properties : sequence of str
        Column names whose row values are copied into the feature's
        ``properties``. The LAST entry is not read from ``df``: it is the key
        under which the fill colour is stored (``'red'`` when the row's
        population is below the mean population of ``df``, otherwise ``'blue'``).

    Returns
    -------
    list of dict
        One ``{'type': 'FeatureCollection', 'features': [feature]}`` dict per
        row, in row order. Each feature's ``id`` is the row index as a string.

    Raises
    ------
    IndexError
        If ``properties`` is empty.
    KeyError
        If a required column is missing from ``df``.
    """
    value_keys = properties[:-1]
    color_key = properties[-1]

    # Loop-invariant: compute the colour threshold once instead of once per row.
    mean_population = df["population"].mean()

    collections = []
    for index, row in df.iterrows():
        # The last vertex of the stored rectangle is dropped (presumably the
        # repeated closing point of the ring -- confirm against gh.rectangle).
        ring = [[float(pt[0]), float(pt[1])] for pt in row["rectangle"][:-1]]

        feature_props = {key: row[key] for key in value_keys}
        feature_props[color_key] = 'red' if row['population'] < mean_population else 'blue'

        feature = {'type': 'Feature',
                   'id': str(index),
                   'properties': feature_props,
                   'geometry': {'type': 'Polygon', 'coordinates': [ring]}}

        collections.append({'type': 'FeatureCollection', 'features': [feature]})
    return collections

# Columns copied into each feature's properties, followed by the key ('color')
# under which df_to_geojson stores the fill colour.
cols = [
    'geohash',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'color',
]
x1 = df_to_geojson(df1, cols)

# Geographical Plotting using Folium(0.10.1)

In [9]:
import folium
from folium import plugins

# Centre the map on the last data point of the dataset.
map1 = folium.Map(location=[df["latitude"].iloc[-1], df["longitude"].iloc[-1]], zoom_start=3)

# Layer control: a parent group hidden from the control (control=False) with two
# sub-groups that can be toggled independently.
fg = folium.FeatureGroup(name='Housing Ananlysis', control=False)
g1 = plugins.FeatureGroupSubGroup(fg, 'Geohash level Ananlysis')
g2 = plugins.FeatureGroupSubGroup(fg, 'Datpoint Level Analysis')


def _popup_text(values):
    """Build the popup label used by both layers from a column-name -> value mapping."""
    return str('total_bedrooms = ' + str(values["total_bedrooms"]) +
               ' total_rooms = ' + str(values["total_rooms"]) +
               ' Population = ' + str(values["population"]) +
               ' Households = ' + str(values["households"]) +
               ' Median_income = ' + str(values["median_income"]) +
               ' Median_house_value = ' + str(values["median_house_value"]))


# Geohash layer: one polygon per geohash cell, coloured by its "color" property.
for collection in x1:
    cell_properties = collection["features"][0]["properties"]
    g1.add_child(
        folium.GeoJson(
            data=collection,
            name='gh',
            style_function=lambda feature: {
                "fillColor": feature["properties"]["color"],
                "color": feature["properties"]["color"],
                'fillOpacity': 0.7
            },
            # Leaflet's option is spelled "fillColor"; a lowercase "fillcolor" key is ignored.
            highlight_function=lambda feature: {
                "fillColor": feature["properties"]["color"],
                "color": feature["properties"]["color"]
            }
        ).add_child(folium.Popup(_popup_text(cell_properties)))
    )

# Datapoint layer: one small circle per row of the dataset.
# The colour threshold is loop-invariant, so it is computed once up front.
mean_population = df["population"].mean()
point_columns = ["latitude", "longitude", "total_bedrooms", "total_rooms",
                 "population", "households", "median_income", "median_house_value"]
for point in df[point_columns].to_dict("records"):
    g2.add_child(
        folium.Circle(
            radius=5,
            location=[point["latitude"], point["longitude"]],
            popup=_popup_text(point),
            color='red' if point["population"] < mean_population else 'blue',
            fill=True
        )
    )

# The parent group goes onto the map before its sub-groups, as in the folium
# FeatureGroupSubGroup usage example, so the sub-groups can attach to it.
map1.add_child(fg)
map1.add_child(g1)
map1.add_child(g2)

folium.LayerControl(collapsed=False).add_to(map1)
map1.save(outfile='geographical_analysis.html')

# Calculating Distances using Geopy.distance

In [10]:
# We often need to calculate the distance between two points on a map.
# The housing dataset above does not need distances, but for learning purposes let's see how to calculate them.

In [11]:
# Pair every row with the coordinates of the row that follows it
# (shift(-1) moves values up by one; the last row gets NaN).
for coord in ("latitude", "longitude"):
    df[f"next_{coord}"] = df[coord].shift(-1)

In [12]:
# Keep only the coordinate pairs and drop the last row.
# .copy() makes df2 an independent frame, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning (the .iloc slice alone would).
df2 = df[["latitude", "longitude", "next_latitude", "next_longitude"]].iloc[:-1].copy()

In [13]:
# Preview the current/next coordinate pairs.
df2.head()

Unnamed: 0,latitude,longitude,next_latitude,next_longitude
0,37.88,-122.23,37.86,-122.22
1,37.86,-122.22,37.85,-122.24
2,37.85,-122.24,37.85,-122.25
3,37.85,-122.25,37.85,-122.25
4,37.85,-122.25,37.85,-122.25


In [14]:
import geopy.distance
# tqdm.auto replaces the deprecated tqdm.tnrange and picks the notebook widget
# or the console progress bar automatically.
from tqdm.auto import tqdm

# Distance in kilometres from each point to the next row's point.
# geopy expects (latitude, longitude) tuples.
origins = zip(df2["latitude"], df2["longitude"])
destinations = zip(df2["next_latitude"], df2["next_longitude"])
df2["distance"] = [
    geopy.distance.distance(origin, destination).km
    for origin, destination in tqdm(zip(origins, destinations), total=len(df2))
]

HBox(children=(IntProgress(value=0, max=20639), HTML(value='')))




In [15]:
# Preview the frame again, now including the computed distance column.
df2.head()

Unnamed: 0,latitude,longitude,next_latitude,next_longitude,distance
0,37.88,-122.23,37.86,-122.22,2.387896
1,37.86,-122.22,37.85,-122.24,2.080846
2,37.85,-122.24,37.85,-122.25,0.880111
3,37.85,-122.25,37.85,-122.25,0.0
4,37.85,-122.25,37.85,-122.25,0.0
