# Load Replica Data and confirm

## Env

In [3]:
import pandas as pd
import numpy as np
import os

### Connections


In [12]:
# Postgres
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.engine import URL

# Load variables from a local .env file into the process environment so that
# POSTGRES_PW can be read below without hardcoding the password.
load_dotenv()

# Connection settings, defined once and reused when building the URL.
user = "postgres"
password = os.getenv('POSTGRES_PW')  # None if POSTGRES_PW is not set
host = "127.0.0.1"
port = "5432"
database = "gradschool"

pg_url = URL.create(
    drivername='postgresql',
    username=user,
    password=password,
    host=host,
    port=int(port),  # URL.create expects an integer port
    database=database,
)

pg_engine = create_engine(pg_url)

# Test connection: open and immediately close a connection so that a bad
# configuration is reported here with a readable message.
try:
    with pg_engine.connect():
        print('Successfully connected to the PostgreSQL database')
except Exception as e:
    print(f'Sorry failed to connect: {e}')

# Long-lived connection shared by the cells below (to_sql / read_sql).
pg_connection = pg_engine.connect()


Successfully connected to the PostgreSQL database


## Load Data

### Replica

In [5]:
# Columns
# Maps Replica export column names to the short names used in the grouped tables.
# Insertion order is significant: list(COLS_RENAME_DICT.values()) is used as the
# groupby key list when the trips are aggregated.
COLS_RENAME_DICT = dict([
    ('origin_bgrp_fips_2020', 'O_bg_fips'),
    ('destination_bgrp_fips_2020', 'D_bg_fips'),
    ('primary_mode', 'primary_mode'),
    ('origin_bgrp_lng_2020', 'O_bg_lng'),
    ('origin_bgrp_lat_2020', 'O_bg_lat'),
    ('destination_bgrp_lat_2020', 'D_bg_lat'),
    ('destination_bgrp_lng_2020', 'D_bg_lng'),
])

In [None]:
# One Replica trips export (CSV) per metro, keyed by the metro code that is also
# used as the prefix of the Postgres table name.
DATA_PATHS = {
    'phi' : '../data/replica_exports/replica-phi_sat_spring_2024-01_14_25-trips_dataset/replica-phi_sat_spring_2024-01_14_25-trips_dataset.csv',
    'sfba' : '../data/replica_exports/replica-sfba_sat_spring_2024-01_14_25-trips_dataset/replica-ca_sat_spring_2024-01_14_25-trips_dataset.csv',
    'chi' : '../data/replica_exports/replica-chi_sat_spring_2024-01_14_25-trips_dataset/replica-chi_sat_spring_2024-01_14_25-trips_dataset.csv',
    'nyc' : '../data/replica_exports/replica-ny_sat_spring_2024-01_14_25-trips_dataset/replica-ny_sat_spring_2024-01_14_25-trips_dataset.csv'
}

for metro, csv_path in DATA_PATHS.items():
    print(f'Loading metro {metro}..')
    df = pd.read_csv(csv_path)

    # Collapse individual trips to one row per combination of the renamed key
    # columns. Named aggregation gives the output columns their final names
    # directly, so nothing depends on positional column order.
    df_counts = (
        df.rename(columns=COLS_RENAME_DICT)
        .groupby(list(COLS_RENAME_DICT.values()))
        .agg(
            n_trips=('trip_distance_meters', 'count'),
            trip_dist_m=('trip_distance_meters', 'sum'),
            trip_duration_min=('trip_duration_minutes', 'sum'),
        )
        .assign(metro=metro)
    )

    try:
        # Only the O/D block group and mode stay in the index; the centroid
        # lat/lng keys become ordinary columns of the written table.
        (
            df_counts
            .reset_index()
            .set_index(['O_bg_fips', 'D_bg_fips', 'primary_mode'])
            .to_sql(
                f'{metro}_replica_grpd',
                schema='cyp204d_final_project',
                con=pg_connection,
                if_exists='replace',
            )
        )
        print('\tSuccess!')
    except Exception as e:
        # Best effort: report the failure and continue with the next metro.
        print(f'\t{metro} failed with error: {e}')

# NOTE: after the loop, df and df_counts hold only the last metro processed.
# Then alter columns manually in SQL

Loading metro phi..
	Success!
Loading metro sfba..
	Success!
Loading metro chi..
	Success!
Loading metro nyc..
	Success!


In [5]:
# Combine
# The load cell overwrites its per-metro frame on every iteration, so no
# df_<metro>_counts variables exist. Read the grouped tables back from Postgres
# instead, keeping the metro order of the original concat.
# NOTE(review): the frames come back with flat columns (no O/D/mode index) - confirm
# that this is the shape downstream code expects.
METRO_ORDER = ('nyc', 'sfba', 'chi', 'phi')
df_combined = pd.concat(
    [
        pd.read_sql(
            f'select * from cyp204d_final_project.{metro}_replica_grpd;',
            con=pg_connection,
        )
        for metro in METRO_ORDER
    ],
    ignore_index=True,
)

In [16]:
#df_chi_grpd = df_chi.groupby(['origin_bgrp_fips_2020','destination_bgrp_fips_2020','primary_mode']).agg({'trip_distance_meters' : ['count','sum'], 'trip_duration_minutes' : 'sum'}) # 3.8M
#df_chi[['origin_bgrp_lat_2020','origin_bgrp_lng_2020','destination_bgrp_lat_2020','destination_bgrp_lng_2020']]

Unnamed: 0,destination_bgrp_lat_2020
0,42.0176
1,42.0159
2,42.0015
3,41.9672
4,41.9444
...,...
16348482,41.7860
16348483,41.7860
16348484,41.7860
16348485,41.7860


NameError: name 'df_combined' is not defined

# BG Centroids

In [None]:
# Distinct origin block groups and their centroid coordinates, one query per
# metro table. The query text is identical for every metro apart from the
# table-name prefix, so it is built from a single template.
CENTROID_QUERY = 'select distinct "O_bg_fips", "O_bg_lng", "O_bg_lat" from cyp204d_final_project.{metro}_replica_grpd;'
CENTROID_METROS = ('phi', 'chi', 'nyc', 'sfba')

centroids_by_metro = {
    metro: pd.read_sql(CENTROID_QUERY.format(metro=metro), con=pg_connection)
    for metro in CENTROID_METROS
}

# Keep the per-metro names available for any later cells.
phi_centroids = centroids_by_metro['phi']
chi_centroids = centroids_by_metro['chi']
nyc_centroids = centroids_by_metro['nyc']
sfba_centroids = centroids_by_metro['sfba']

# NOTE(review): only origin block groups are selected, and rows are distinct per
# metro but not de-duplicated across metros - confirm that this is intended.
BG_centroids = pd.concat([centroids_by_metro[metro] for metro in CENTROID_METROS])

In [20]:
import pickle

# Persist the combined centroid table to the temp data folder as a binary pickle
# (the target file has no extension).
centroids_out = open('../data/temp/bg_centroids', 'wb')
with centroids_out:
    pickle.dump(BG_centroids, centroids_out)