In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import time
import geopy

from pyzipcode import ZipCodeDatabase, db_location

zdb = ZipCodeDatabase()

In [2]:
cols = [
    #"location_key",
    "state",
    "county",
    #"region",
    #"zip",
    "lat",
    "long",
]
cols = list(map(lambda x: "location_" + x, cols))

In [3]:
location_dim = pd.DataFrame(columns=cols)

In [4]:
location_dim.shape

(0, 4)

In [5]:
PATH = "../stage_1/data/processedData/"

- Vehicles dataset

In [6]:
df = pd.read_csv(f"{PATH}/vehicles.csv", index_col=0)

In [7]:
df = df[["lat", "long", "state"]]

In [8]:
df.shape

(157431, 3)

In [9]:
df = df.drop_duplicates()

In [10]:
col_map = {
    col:"location_"+col for col in df.columns
}

In [11]:
df.columns = df.columns.map(col_map)

In [12]:
location_dim = pd.concat((location_dim, df)).reset_index(drop=True).drop_duplicates()

In [13]:
location_dim.shape

(37511, 4)

- Used cars

In [14]:
df = pd.read_csv(f"{PATH}/used_car_sales.csv")
df.head()

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Engine,BodyType,NumCylinders,DriveType,datesold,lat,long,state,fuel
0,137178,7500,2020,78611,84430,ford,mustang,1988.0,5.0l gas v8,sedan,8,RWD,2020-03-19 00:00:00,30.767327,-98.30109,tx,gas
1,64287,44000,2019,17728,40703,porsche,911,2002.0,3.6l,coupe,6,AWD,2019-11-13 00:00:00,41.318934,-77.08211,pa,gas
2,132695,950,2020,46211,71300,mercury,montclair,1965.0,,sedan,0,RWD,2020-04-04 00:00:00,39.779492,-86.132837,in,gas
3,80293,25200,2019,33759,15000,pontiac,gto,1970.0,,,0,,2019-01-23 00:00:00,27.980297,-82.71645,fl,gas
4,158271,20000,2020,33311,51674,jeep,wrangler,2015.0,3.6l flexible v6,suv,6,4WD,2020-08-09 00:00:00,26.13883,-80.16865,fl,gas


In [15]:
df = df[["lat", "long", "state"]]

In [16]:
col_map = {
    col:"location_"+col for col in df.columns
}

In [17]:
df.columns = df.columns.map(col_map)

In [18]:
dflocation_dim = pd.concat((location_dim, df)).reset_index(drop=True).drop_duplicates()

In [19]:
location_dim.shape

(37511, 4)

- Tn_mvr

In [20]:
df = pd.read_csv(f"{PATH}/tn_mvr.csv", index_col=0)
df.head()

Unnamed: 0,vin,price,odometer_type,mileage,county,zip,model_year,make,model,vehicle_type,new_used,title_issue_date,purchase_date,lat,long,state,fuel
1013,137ZA8434TE173571,31000.0,1,0.0,Tipton,38053,1996,am-general,hummer,AUTO,U,2019-01-17,2019-01-04,35.347965,-89.90668,tn,gas
1768,137ZA8434TE173571,11000.0,1,0.0,Shelby,38115,1996,am-general,hummer,AUTO,U,2018-01-30,2018-01-12,35.057311,-89.86291,tn,gas
11221,19UDE2F36LA000634,21500.0,0,4843.0,Washington,37681,2020,acura,ilx,AUTO,N,2021-02-02,2021-01-23,36.246547,-82.62095,tn,gas
11225,19UDE2F71HA001328,17253.99,0,35691.0,Sumner,37075,2017,acura,ilx,AUTO,U,2020-05-27,2020-04-18,36.311047,-86.61173,tn,gas
11226,19UDE2F72HA006487,4000.0,0,27000.0,Franklin,37330,2017,acura,ilx,AUTO,U,2019-05-20,2019-05-17,35.280936,-86.12247,tn,gas


In [21]:
df = df[["lat", "long", "state"]]

In [22]:
df.shape

(535786, 3)

In [23]:
df = df.drop_duplicates()
df.shape

(1090, 3)

In [24]:
col_map = {
    col:"location_"+col for col in df.columns
}

In [25]:
df.columns = df.columns.map(col_map)

In [26]:
location_dim = pd.concat((location_dim, df)).reset_index(drop=True)

In [27]:
location_dim.shape

(38601, 4)

In [28]:
location_dim = location_dim.drop_duplicates()
location_dim.shape

(38601, 4)

- infer the other fields

In [29]:
location_dim.head().dtypes

location_state      object
location_county     object
location_lat       float64
location_long      float64
dtype: object

In [30]:
geolocator = geopy.Nominatim(user_agent='http')

In [31]:
from tqdm import tqdm

In [32]:
def get_location(lat, long):
    location = geolocator.reverse((lat, long))
    return location

def get_countys(df):
    countys=[]
    for i in tqdm(df.index):
        item = location_dim.loc[i]
        loc = get_location(item.location_lat, item.location_long)
        if loc == None:
            countys.append(np.nan)
        else:
            if "county" in loc.raw["address"]:
                countys.append(loc.raw["address"]["county"])
            else:
                countys.append(np.nan)
    return countys

In [33]:
get_location(35.347965,	-89.90668).raw

{'place_id': 78581177,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 7604103059,
 'lat': '35.3475584',
 'lon': '-89.9060082',
 'display_name': 'Goodwill, US 51, Millington, Shelby County, West Tennessee, Tennessee, 38053, United States',
 'address': {'shop': 'Goodwill',
  'road': 'US 51',
  'town': 'Millington',
  'county': 'Shelby County',
  'region': 'West Tennessee',
  'state': 'Tennessee',
  'ISO3166-2-lvl4': 'US-TN',
  'postcode': '38053',
  'country': 'United States',
  'country_code': 'us'},
 'boundingbox': ['35.3475084', '35.3476084', '-89.9060582', '-89.9059582']}

In [46]:
#countys = get_countys(location_dim)

In [None]:
localtion_dim.fillna("Unkown", inplace=True)

In [33]:
location_dim.to_csv("data/locationDim.csv")