## Business Intelligence Assignment 2: Data Analytics


### (3) Geo-spacial Data ZIP-encoding

In [3]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely.errors import ShapelyDeprecationWarning
import warnings
import os

### (a) Loading Data

In [4]:
cwd = os.getcwd()

data_folder = os.path.join(cwd, 'data\\mid_processing')

file_path = os.path.join(data_folder, 'AB_US_2023_DATA_CLEANED.csv')
data_processed = pd.read_csv(file_path, sep=",", dtype={'neighbourhood_group': 'str'}, low_memory=False,index_col='id')

data_processed = data_processed.copy()
display(data_processed.head(3))

Unnamed: 0_level_0,name,host_id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
958,"Bright, Modern Garden Unit - 1BR/1BTH",1169,Western Addition,37.77028,-122.43317,Entire home/apt,202,2,383,2023-02-19,2.31,1,128,59,San Francisco
5858,Creative Sanctuary,8904,Bernal Heights,37.74474,-122.42089,Entire home/apt,235,30,111,2017-08-06,0.66,1,365,0,San Francisco
8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,21994,Haight Ashbury,37.76555,-122.45213,Private room,56,32,9,2022-10-27,0.09,13,365,1,San Francisco


### (b) Encoding

In [None]:
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)

zcta_shapefile_path = r'PATH_TO_DOWNLOADED_FILE_tl_2024_us_zcta520.shp'
zcta = gpd.read_file(zcta_shapefile_path)
zcta = zcta.to_crs(epsg=4326)
zcta.sindex


def get_zip_code(latitude, longitude):
    try:
        # point object or long / lat parameters
        point = Point(longitude, latitude)
        possible_matches_index = list(zcta.sindex.intersection(point.bounds))
        possible_matches = zcta.iloc[possible_matches_index]

        # identify point in dataset
        for idx, row in possible_matches.iterrows():
            if row['geometry'].contains(point):
                return row['ZCTA5CE20']  # return zipcode

        # error handling
        return None
    except Exception as e:
        print(f"Error processing point ({latitude}, {longitude}): {e}")
        return None
    

data_processed['neighbourhood'] = data_processed.apply(lambda row: get_zip_code(row['latitude'], row['longitude']), axis=1)
print(data_processed, data_processed['neighbourhood'])
data_processed['neighbourhood'] = pd.to_numeric(data_processed['neighbourhood'])
data_processed['neighbourhood'] = data_processed['neighbourhood'].fillna(-1)  # Replace NaN with -1
data_processed['neighbourhood'] = data_processed['neighbourhood'].astype(int)


data_processed = data_processed[data_processed['neighbourhood'] != -1]

print(data_processed['neighbourhood'].dtype)



                                                                 name  \
id                                                                      
958                             Bright, Modern Garden Unit - 1BR/1BTH   
5858                                               Creative Sanctuary   
8142                Friendly Room Apt. Style -UCSF/USF - San Franc...   
8339                                  Historic Alamo Square Victorian   
8739                              Mission Sunshine, with Private Bath   
...                                                               ...   
849230448624862502  Private Half Moon Bay Oasis By Pillar Point Be...   
850805127216414630                                      Home near SFO   
851562104616413652                   Modern New Private Studio in SFO   
851792795339759410            Lux Bedroom with a Shared Bath near SFO   
851801296770555568             Lux Bedroom with Private Bath near SFO   

                      host_id neighbourhood   lati

### (c) Saving Encoded Data

In [None]:
data_processed.to_csv(r'PATH_TO_FILE\AB_US_2023_PROCESSED.csv')


In [21]:
# Please DO NOT use this if you do not have the ZCTA data 
#data_processed.to_csv(r'C:\Users\Noah\PycharmProjects\BusinessIntelligence_Submission\data\AB_US_2023_TEMPLATE.csv')
