## Introduction

This notebook contains work related to getting the coordinates or house addresses present in file.

## Setup

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils import shuffle

In [2]:
import project

In [3]:
from us_houses_eda.get_coordinates import get_latitude, get_longitude

## Data

In [4]:
us_house_data = pd.read_parquet('../usa-house-csv/realtor-data.parquet')
us_house_data = shuffle(us_house_data[:50000]).reset_index(drop=True)

In [5]:
us_house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   status        50000 non-null  object 
 1   price         50000 non-null  float64
 2   bed           38168 non-null  float64
 3   bath          38359 non-null  float64
 4   acre_lot      41839 non-null  float64
 5   full_address  50000 non-null  object 
 6   street        49945 non-null  object 
 7   city          49949 non-null  object 
 8   state         50000 non-null  object 
 9   zip_code      49818 non-null  float64
 10  house_size    37953 non-null  float64
 11  sold_date     9534 non-null   object 
dtypes: float64(6), object(6)
memory usage: 4.6+ MB


## Getting Coordinates

In [6]:
lats = us_house_data['city'].apply(get_latitude)

In [7]:
longs = us_house_data['city'].apply(get_longitude)

## Saving Lats and Longs as a DataFrame

In [14]:
lats_df = pd.DataFrame({'lats':lats.values})
longs_df = pd.DataFrame({'longs':longs.values})

## Concatenating the lats and longs to existing data frame

In [17]:
updated_df = pd.concat([us_house_data, lats_df, longs_df], axis=1)
updated_df.head()

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date,lats,longs
0,for_sale,95000.0,6.0,2.0,0.11,"15 Calle Santa Rosa De Lima, Hormigueros, PR, ...",15 Calle Santa Rosa De Lima,Hormigueros,Puerto Rico,660.0,2914.0,,18.132469,-67.113919
1,for_sale,335000.0,3.0,2.0,10.0,"229 River Rd, Whately, MA, 01093",229 River Rd,Whately,Massachusetts,1093.0,1277.0,,42.436948,-72.641517
2,for_sale,324900.0,4.0,3.0,2.0,"13 Rockwell Rd, Stafford, CT, 06076",13 Rockwell Rd,Stafford,Connecticut,6076.0,1332.0,2004-02-27,52.806316,-2.116382
3,for_sale,55000.0,,,8.0,"Westerly Side Southampton Rd, Westfield, MA, 0...",Westerly Side Southampton Rd,Westfield,Massachusetts,1085.0,,,42.139033,-72.758486
4,for_sale,49900.0,,,1.42,"205 Upper Church St Lot 3, Ware, MA, 01082",205 Upper Church St Lot 3,Ware,Massachusetts,1082.0,,,31.172723,-82.482934


## Saving the above Data Frame as Parquet

In [18]:
updated_df.to_parquet('../usa-house-csv/us_houses_data_lats_longs.parquet')