In [1]:
import numpy as np
import pandas as pd
from is_slum import get_slum_val, get_distance_from_slum # zach's functions
import matplotlib.pyplot as plt

In [2]:
from tqdm.notebook import tqdm # progress bar
# Register `progress_apply` on pandas objects. `pandas` is a classmethod, so it
# is called on the class itself: instantiating `tqdm()` first would render a
# stray empty progress bar (which is what `%%capture` was hiding).
tqdm.pandas()

In [3]:
# One cleaned listings file per city.
rio = pd.read_csv('../data/rio/cleaned_2.csv')
hyderabad = pd.read_csv('../data/hyderabad/cleaned.csv')
chennai = pd.read_csv('../data/chennai/cleaned.csv')
delhi = pd.read_csv('../data/new_delhi/cleaned.csv')

# Mumbai comes from two sources whose column names differ from the other
# cities, so both are reduced to the same four columns and stacked.
mumbai1 = pd.read_csv('../data/mumbai/cleaned_housing.com.csv')
mumbai2 = pd.read_csv('../data/mumbai/cleaned_99acres.com.csv')
mumbai = pd.concat(
    [
        mumbai1[['lat', 'lng', 'price', 'coord']],
        # 99acres stores the rent as `price_per_month`; align it with housing.com.
        mumbai2[['lat', 'lng', 'price_per_month', 'coord']].rename(
            columns={'price_per_month': 'price'}
        ),
    ]
).rename(
    # Match the `latitude` / `longitude` naming that Prepare() reads.
    columns={'lat': 'latitude', 'lng': 'longitude'}
)

In [4]:
def Prepare(df):
    """Return a copy of ``df`` keyed by a single ``coord`` column, de-duplicated.

    The ``latitude`` and ``longitude`` columns are zipped into one ``coord``
    column of ``(latitude, longitude)`` tuples (replacing any existing
    ``coord`` column), the two source columns are dropped, and fully
    duplicated rows are removed. The number of dropped rows is printed.

    The input frame is left untouched; callers must use the returned frame.
    The original index labels of the surviving rows are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with ``coord`` instead of ``latitude`` / ``longitude`` and
        without duplicate rows.

    Raises
    ------
    KeyError
        If ``latitude`` or ``longitude`` is missing.
    """
    coords = list(zip(df['latitude'], df['longitude']))
    # assign() works on a copy, so the caller's frame is never mutated.
    prepared = df.assign(coord=coords).drop(columns=['latitude', 'longitude'])
    print(prepared.duplicated().sum(), 'duplicates dropped.')
    return prepared.drop_duplicates()

In [5]:
# Run every city through Prepare() in a fixed order (one "duplicates dropped"
# line is printed per city, in this order) and rebind each name to the result.
rio, mumbai, hyderabad, chennai, delhi = [
    Prepare(city_frame)
    for city_frame in (rio, mumbai, hyderabad, chennai, delhi)
]

137 duplicates dropped.
7488 duplicates dropped.
33 duplicates dropped.
194 duplicates dropped.
59 duplicates dropped.


### Classifying

In [14]:
def get_classes(df, city, distance=False):
    """Classify every listing in ``df`` with the is_slum helpers and save it.

    Each distinct value of ``df['coord']`` is looked up once with
    ``get_slum_val`` (and, when ``distance`` is truthy, with
    ``get_distance_from_slum``); the per-location results are then mapped back
    onto every row, so repeated locations are only computed once.

    Side effects: adds a ``class`` column (and ``class_distance`` when
    requested) to ``df`` in place, writes ``classified_{city}.csv`` to the
    current working directory, and prints progress messages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``coord`` column of hashable two-item values.
    city : str
        Key forwarded to the is_slum helpers; also used in the output filename.
    distance : bool, default False
        When truthy, also compute the ``class_distance`` column.

    Returns
    -------
    None

    Notes
    -----
    A lookup that raises is recorded as NaN rather than aborting the run, and
    a warning reports how many lookups failed. Only ``Exception`` subclasses
    are caught, so interrupting the kernel still stops the loop.
    ``progress_apply`` requires ``tqdm.pandas()`` to have been called.
    """
    import warnings

    binary_map = {}
    distance_map = {}
    failed_binary = 0
    failed_distance = 0
    uniques = list(set(df['coord']))

    for coord in tqdm(uniques):
        try: # binary calculator
            binary_map[coord] = get_slum_val(city, (coord[0], coord[1]))
        except Exception:
            binary_map[coord] = np.nan
            failed_binary += 1

        if distance:
            try: # distance calculator
                # The trailing 50 is passed through unchanged; its meaning is
                # defined by get_distance_from_slum (not visible here).
                distance_map[coord] = get_distance_from_slum(city, (coord[0], coord[1]), 50)
            except Exception:
                distance_map[coord] = np.nan
                failed_distance += 1

    # Make swallowed failures visible: e.g. an unsupported city key would
    # otherwise produce an all-NaN column without any hint.
    if failed_binary or failed_distance:
        warnings.warn(
            f'{city}: {failed_binary} binary and {failed_distance} distance '
            f'lookups out of {len(uniques)} locations raised and were stored as NaN'
        )

    print(len(uniques), 'unique locations classified. mapping back to dataset...')
    df['class'] = df.coord.progress_apply(lambda x: binary_map[x])

    if distance:
        df['class_distance'] = df.coord.progress_apply(lambda x: distance_map[x])

    print('COMPLETE. saving to CSV...')

    df.to_csv(f'classified_{city}.csv', index=False)

    # value_counts() skips NaN, so failed lookups are not part of these shares.
    print('saved. binary value counts:', df['class'].value_counts(normalize=True).values)

In [15]:
# Rio is the only city that also gets the distance-to-slum column.
get_classes(df=rio, city='rio', distance=True)

HBox(children=(FloatProgress(value=0.0, max=1617.0), HTML(value='')))


1617 unique locations classified. mapping back to dataset...


HBox(children=(FloatProgress(value=0.0, max=7541.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7541.0), HTML(value='')))


COMPLETE. saving to CSV...
saved. binary value counts: [0.95574163 0.04425837]


In [71]:
# NOTE(review): the frame displayed below this cell mixes coordinate orderings
# (e.g. (80.207, 12.76) next to (12.8176758, 80.1625)) - confirm the
# latitude/longitude columns of the Chennai CSV before trusting these classes.
get_classes(df=chennai, city='chennai')

HBox(children=(FloatProgress(value=0.0, max=968.0), HTML(value='')))


968 unique locations classified. mapping back to dataset...


HBox(children=(FloatProgress(value=0.0, max=2212.0), HTML(value='')))


COMPLETE. saving to CSV...
saved. binary value counts: [0.9773858 0.0226142]


Unnamed: 0,price,bathroom,coord,bhk,sqft,class
0,19000.0,,"(80.207, 12.760201400000001)",3.0,1150.0,0.0
1,8500.0,,"(80.20053100585938, 13.044207572937012)",1.0,,0.0
2,35000.0,,"(17.385044, 78.486671)",3.0,1600.0,0.0
3,15000.0,,"(80.21918487548828, 12.97907829284668)",2.0,950.0,0.0
4,15000.0,,"(12.8176758, 80.16256489999998)",2.0,970.0,0.0
...,...,...,...,...,...,...
2399,13000.0,,"(80.1443557739258, 12.916125297546401)",2.0,1100.0,0.0
2400,9000.0,,"(80.200927734375, 13.044439315795898)",2.0,900.0,0.0
2403,,,"(17.385044, 78.486671)",4.0,3700.0,0.0
2404,,,"(80.2532958984375, 13.04327392578125)",3.0,2650.0,0.0


In [None]:
# NOTE(review): the data directory is `new_delhi` but the city key passed here
# is 'delhi' - confirm which key the is_slum helpers expect.
get_classes(df=delhi, city='delhi')

HBox(children=(FloatProgress(value=0.0, max=1030.0), HTML(value='')))

In [None]:
# Binary slum classification only (no distance column) for Mumbai.
get_classes(df=mumbai, city='mumbai')

In [None]:
# Binary slum classification only (no distance column) for Hyderabad.
get_classes(df=hyderabad, city='hyderabad')