# Load Replica Data and confrim

## Env

In [2]:
import pandas as pd
import numpy as np
import os

### Connections


In [3]:
# Postgres
from dotenv import load_dotenv

load_dotenv()

from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from sqlalchemy import text
# os.chdir('Documents/Grad School/Berkeley/Classes/CYP_204D - Multivariate Analysis in Planning/Project/204d_final_project/')
user = "postgres"
password = os.getenv('POSTGRES_PW')
host = "127.0.0.1"
port = "5432"
database = "gradschool"

pg_url = URL.create(
    drivername='postgresql',
    username='postgres',
    host='127.0.0.1',
    database='gradschool',
    password=os.getenv('POSTGRES_PW')
)

pg_engine = create_engine(pg_url)
# Test connection
try:
    with pg_engine.connect() as connection_str:
        print('Successfully connected to the PostgreSQL database')
except Exception as e:
    print(f'Sorry failed to connect: {e}')

pg_connection = pg_engine.connect()


Successfully connected to the PostgreSQL database


## Load Data

### Replica

In [4]:
# Columns
COLS_RENAME_DICT = {
    'origin_bgrp_fips_2020': 'O_bg_fips',
    'destination_bgrp_fips_2020': 'D_bg_fips',
    'primary_mode': 'primary_mode', 
    'origin_bgrp_lng_2020': 'O_bg_lng',
    'origin_bgrp_lat_2020': 'O_bg_lat',
    'destination_bgrp_lat_2020': 'D_bg_lat',
    'destination_bgrp_lng_2020': 'D_bg_lng'
}

In [16]:
df_counts = {}
DATA_PATHS = {
    'phi' : '../data/replica_exports/replica-phi_sat_spring_2024-01_14_25-trips_dataset.csv',
    'sfba' : '../data/replica_exports/replica-ca_sat_spring_2024-01_14_25-trips_dataset.csv',
    'chi' : '../data/replica_exports/replica-chi_sat_spring_2024-01_14_25-trips_dataset.csv',
    'nyc' : '../data/replica_exports/replica-ny_sat_spring_2024-01_14_25-trips_dataset.csv'
}

#COL_DATA_TYPES = {'O_bg'}

#for metro in DATA_PATHS.keys():
for metro in ['phi','chi','nyc','sfba']:
    print(f'Loading metro {metro}..')
    temp = pd.read_csv(DATA_PATHS[metro]).rename(columns=COLS_RENAME_DICT).groupby(list(COLS_RENAME_DICT.values())).agg({'trip_distance_meters' : ['count','sum'], 'trip_duration_minutes' : ['sum','mean']})
    temp.columns = ['n_trips','trip_dist_m', 'trip_duration_min_total', 'trip_duration_min_mean']
    temp['metro'] = metro
    temp = temp.reset_index()
    temp['O_bg_fips'] = temp['O_bg_fips'].astype('str').str.pad(12, 'left','0')
    temp['D_bg_fips'] = temp['D_bg_fips'].astype('str').str.pad(12, 'left','0')
    temp = temp.set_index(['O_bg_fips', 'D_bg_fips','primary_mode'])

    try:
        temp.to_sql(f'{metro}_replica_grpd', schema = 'cyp204d_final_project', con = pg_engine.connect(), if_exists='replace')
        print('\tSuccess!')
    except Exception as e:
        print(f'\t{metro} failed with error: {e}')
    
# Then Alter columns manually in SQL

Loading metro phi..
	Success!
Loading metro chi..
	Success!
Loading metro nyc..
	Success!
Loading metro sfba..


FileNotFoundError: [Errno 2] No such file or directory: '../data/replica_exports/replica-ca_sat_spring_2024-01_14_25-trips_dataset.csv'

In [None]:
# temp = pd.read_csv(DATA_PATHS['phi'])
# temp

In [35]:
df_counts_raw = {}
DATA_PATHS = {
    'phi' : '../data/replica_exports/replica-phi_sat_spring_2024-01_14_25-trips_dataset.csv',
    'sfba' : '../data/replica_exports/replica-ca_sat_spring_2024-01_14_25-trips_dataset.csv',
    'chi' : '../data/replica_exports/replica-chi_sat_spring_2024-01_14_25-trips_dataset.csv',
    'nyc' : '../data/replica_exports/replica-ny_sat_spring_2024-01_14_25-trips_dataset.csv'
}
df_counts = {}
for m in DATA_PATHS.keys():
    df_counts_raw[m] = pd.read_csv(DATA_PATHS[m])
    df_counts[m] = df_counts_raw[m]['origin_bgrp_fips_2020', 'destination_bgrp_fips_2020', 'trip_distance_meters']

# temp_nyc[temp_nyc['trip_distance_meters'] < 100]

# import seaborn as sns
# #sns.histplot(temp_nyc, x = 'trip_distance_meters')
# #sns.histplot(temp_nyc, x = 'trip_distance_meters')

KeyError: ('origin_bgrp_fips_2020', 'destination_bgrp_fips_2020', 'trip_distance_meters')

In [14]:
#os.listdir('../data/replica_exports/replica-phi_sat_spring_2024-01_14_25-trips_dataset')
os.getcwd()

'/Users/jon/Documents/Grad School/Berkeley/Classes/CYP_204D - Multivariate Analysis in Planning/final_project_redo/code'

# BG Centroids

In [8]:
phi_centroids = pd.read_sql('select distinct "O_bg_fips", "O_bg_lng", "O_bg_lat" from cyp204d_final_project.phi_replica_grpd;', con = pg_connection)
chi_centroids = pd.read_sql('select distinct "O_bg_fips", "O_bg_lng", "O_bg_lat" from cyp204d_final_project.chi_replica_grpd;', con = pg_connection)
nyc_centroids = pd.read_sql('select distinct "O_bg_fips", "O_bg_lng", "O_bg_lat" from cyp204d_final_project.nyc_replica_grpd;', con = pg_connection)
sfba_centroids = pd.read_sql('select distinct "O_bg_fips", "O_bg_lng", "O_bg_lat" from cyp204d_final_project.sfba_replica_grpd;', con = pg_connection)

BG_centroids = pd.concat([phi_centroids, chi_centroids, nyc_centroids, sfba_centroids])

In [9]:
import pickle

with open('../data/temp/bg_centroids', 'wb') as fp:
    pickle.dump(BG_centroids, fp)