In [None]:
import pandas as pd
import numpy as np
import time
import sqlalchemy as db
import unidecode

# Download and inspect Dataset

In [None]:
# Load the Olist geolocation dataset into a DataFrame and summarise its columns.
# Alternative remote source (left disabled):
# geo_df = pd.read_csv('https://jde03store.blob.core.windows.net/olist/olist_geolocation_dataset.csv')
# The path uses '/' rather than '\': '\o' is not a valid escape sequence in a
# normal string literal, and '~/...' resolves on Windows as well as POSIX systems.
geo_df = pd.read_csv('~/olist_geolocation_dataset.csv')
geo_df.info()

In [None]:
# Preview the first five rows of the raw data
geo_df.head(n=5)

In [None]:
# Left-pad the zip code prefix with zeroes so every value is 5 characters wide
geo_df['geolocation_zip_code_prefix'] = geo_df['geolocation_zip_code_prefix'].apply('{0:0>5}'.format)

In [None]:
# Store both coordinate columns as strings before checking for duplicates
for coord_col in ('geolocation_lat', 'geolocation_lng'):
    geo_df[coord_col] = geo_df[coord_col].apply(str)
geo_df.info()

In [None]:
# List the distinct city spellings and how many there are (1)
city = geo_df['geolocation_city'].unique().tolist()
print('Unique city values: ', geo_df['geolocation_city'].nunique(), '\n', city)

In [None]:
# Share of rows that repeat an earlier row exactly
num_dupes = geo_df.duplicated()
print(num_dupes.sum() / geo_df.shape[0])

In [None]:
# Flag every member of each duplicate group and show them side by side
num_dupes = geo_df.duplicated(keep=False)
display(geo_df.loc[num_dupes].sort_values(by=['geolocation_lat', 'geolocation_lng']))

In [None]:
# Keep the first occurrence of every fully identical row
geo_df = geo_df.drop_duplicates()
geo_df.info()

In [None]:
# Display geo_df after the exact duplicate rows were dropped
display(geo_df)

# City names have many misspellings, which hide additional duplicates
# Check for duplicates based on location info only

In [None]:
# One city can span several zip code prefixes, and a prefix can repeat,
# so duplicates are identified by prefix + latitude + longitude only.
subset = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']
loc_dupe = geo_df.duplicated(subset=subset, keep=False)
geo_df.loc[loc_dupe].sort_values('geolocation_lat')

In [None]:
# Keep the first row for each prefix/lat/lng combination, then renumber the index
geo_df = geo_df.drop_duplicates(subset=subset)
geo_df = geo_df.reset_index(drop=True)
geo_df.info()

In [None]:
# Show the remaining rows ordered by zip code prefix
display(geo_df.sort_values(by='geolocation_zip_code_prefix'))

In [None]:
# Count the distinct city spellings again after de-duplication (2)
print('Unique city values: ', geo_df['geolocation_city'].nunique(dropna=True))

# Use city_names from customer table

In [None]:
# Load the customers CSV into a DataFrame and summarise its columns.
# A raw string is used so the Windows backslashes are taken literally; the
# previous literal relied on invalid escape sequences such as '\l' and '\D'.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory.
cust_df = pd.read_csv(r"C:\Users\leepi\Downloads\Final Project Dataset\data\olist_customers_dataset.csv")
cust_df.info()

In [None]:
# Convert the zip code prefix to a 5-character, zero-padded string so it matches
# geolocation_zip_code_prefix; a plain str() would drop leading zeroes
# (e.g. 1037 -> '1037' rather than '01037') and those rows would never join.
cust_df['customer_zip_code_prefix'] = cust_df['customer_zip_code_prefix'].apply(lambda x: '{0:0>5}'.format(x))

In [None]:
# Keep only the zip code prefix and city columns of the customer table
zip_df = cust_df.loc[:, ['customer_zip_code_prefix', 'customer_city']]

In [None]:
# Keep one row per zip code prefix. De-duplicating on the (zip, city) pair would
# leave several rows for a prefix that appears with more than one city value,
# and the later left join would then repeat the matching geolocation rows.
# NOTE: the name `zip` shadows the built-in; it is kept because later cells use it.
zip = zip_df.drop_duplicates(subset='customer_zip_code_prefix', keep='first')

In [None]:
# Show the zip/city lookup ordered by zip code prefix
display(zip.sort_values(by='customer_zip_code_prefix'))

In [None]:
# Left-join the customer zip/city lookup onto the geolocation rows
merged_df = geo_df.merge(
    zip,
    left_on='geolocation_zip_code_prefix',
    right_on='customer_zip_code_prefix',
    how='left',
)

In [None]:
# Prefer customer_city; fall back to the existing geolocation_city when it is missing
city_from_customers = merged_df['customer_city']
merged_df['geolocation_city'] = city_from_customers.combine_first(merged_df['geolocation_city'])

In [None]:
# The helper columns brought in by the join are no longer needed
merged_df.drop(['customer_city', 'customer_zip_code_prefix'], axis='columns', inplace=True)

In [None]:
# List the distinct city values after taking names from the customer table
city = merged_df['geolocation_city'].unique().tolist()
print('Unique city values: ', merged_df['geolocation_city'].nunique(), '\n', city)

In [None]:
# Replace special characters (Portuguese orthography) with ASCII equivalents
def replace_special_characters(df, column):
    """Transliterate the text in ``df[column]`` to plain ASCII, in place.

    Each value is passed through ``unidecode.unidecode``; missing values
    (NaN/None) are left untouched instead of being handed to ``unidecode``.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        column: Label of the text column to transliterate.

    Returns:
        The same DataFrame object that was passed in.
    """
    # na_action='ignore' propagates missing values without calling the function
    df[column] = df[column].map(unidecode.unidecode, na_action='ignore')
    return df

In [None]:
# Transliterate the city names of merged_df
replace_special_characters(df=merged_df, column='geolocation_city')

In [None]:
# List the distinct city values once more, after transliteration
city = merged_df['geolocation_city'].unique().tolist()
print('Unique city values: ', merged_df['geolocation_city'].nunique(), '\n', city)

In [None]:
# Write the cleaned, merged frame to a CSV file in the working directory
merged_df.to_csv(path_or_buf='geolocation_cleaned_merged.csv')

# Create table in db

In [None]:
# Connect to DB
# NOTE(review): 'url' looks like a placeholder for the real database URL — confirm,
# and supply it from configuration rather than hard-coding credentials here.
engine = db.create_engine('url')
# Raw DBAPI connection taken from the engine (used below via conn.cursor())
conn = engine.raw_connection()

In [None]:
# SQL that creates the (column-less) geolocation table when it is missing
commands = '''CREATE TABLE IF NOT EXISTS geolocation(
);'''

# Open a cursor on the raw connection, run the statement and commit it
cur = conn.cursor()
cur.execute(commands)
conn.commit()

In [None]:
# Load the cleaned data into the `geolocation` table, replacing any existing table.
# merged_df (not geo_df) is written because it carries the reconciled,
# transliterated city names produced by the cleaning steps — the same frame
# that was exported to CSV.
merged_df.to_sql(name='geolocation', con=engine, if_exists='replace')

In [None]:
# Close communication with server: release the cursor first, then the raw connection
cur.close()
conn.close()