# Generates the following files:
* See commented out `to_pickle` lines.

1. route_geoms_df.pkl
2. 1x1grids_davidson.pkl
3. XDSegIDs_for_all_trips.pkl

### Requires `inrix_grouped.pkl`  
    * This is an external file that lists all the road segment information covered by inrix.  

### These 2 are just variations of the `triplevel_df.parquet`, which is generated in the `day_ahead` and `any_day` files.
    1. triplevel_df_processed_time_window_with_IDs.pickle
    2. triplevel_df_processed_MAIN_NOTEBOOK.pickle

In [None]:
import os
# Move the working directory up two levels; the relative `data/...` paths
# used below are resolved against it.
# NOTE: re-running this cell moves the working directory up again.
os.chdir(os.path.join(os.pardir, os.pardir))
print(os.getcwd())
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import numpy as np

from tqdm import tqdm
from shapely.geometry import Polygon, LineString
import warnings
warnings.filterwarnings('ignore')
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark import SparkConf
import pandas as pd
import pickle
from tqdm import tqdm
# Local Spark session used to read the processed APC parquet data.
#
# The builder keeps a single value per config key, so setting
# `spark.driver.extraJavaOptions` / `spark.executor.extraJavaOptions` twice
# (once for the timezone flag, once for the netty flag) silently discarded the
# first value. Both JVM flags are therefore passed together in one
# space-separated option string.
jvm_options = "-Duser.timezone=UTC -Dio.netty.tryReflectionSetAccessible=true"
spark = (
    SparkSession.builder
    .master("local[26]")
    .appName("wego-daily")
    .config('spark.executor.cores', '8')
    .config('spark.executor.memory', '80g')
    .config('spark.driver.memory', '80g')
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.datetime.java8API.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.driver.maxResultSize", 0)
    .config("spark.shuffle.spill", "true")
    .config("spark.driver.extraJavaOptions", jvm_options)
    .config("spark.executor.extraJavaOptions", jvm_options)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
# Only surface errors from Spark's own logging.
spark.sparkContext.setLogLevel("ERROR")

# Generates 1x1 mile^2 grids across Nashville

In [None]:
# Load the Tennessee county boundaries and keep only Davidson County.
fp = os.path.join('data', 'shapefiles', "tncounty")
gdf_county = gpd.read_file(fp)
gdf_county.plot()
gdf_dav = gdf_county[gdf_county["NAME"] == "Davidson"]
# EPSG:4326 copy of the county outline (the grid itself is built from gdf_dav).
gdf_david = gdf_dav.to_crs("EPSG:4326")

# Bounding box of the county in the shapefile's own CRS.
xmin, ymin, xmax, ymax = gdf_dav.total_bounds
# Cell size in CRS units.
# NOTE(review): 5280 presumably means feet per mile, which matches the
# EPSG:2274 label applied to the grid below -- confirm the shapefile's CRS.
length = 5280
wide = 5280

# Cell origins; one extra step past the max edge so the bounding box is fully
# covered once the last origin is dropped below.
cols = list(np.arange(xmin, xmax + wide, wide))
rows = list(np.arange(ymin, ymax + length, length))

# One square polygon per (column, row) origin.
polygons = [
    Polygon([(x, y), (x + wide, y), (x + wide, y + length), (x, y + length)])
    for x in cols[:-1]
    for y in rows[:-1]
]

# Assign the CRS at construction so the shapefile written below carries its
# projection; previously the CRS was only set after `to_file`.
grid = gpd.GeoDataFrame({'geometry': polygons}, crs="EPSG:2274")
fp = os.path.join('data', 'shapefiles', 'grid_shapes.shp')
grid.to_file(fp)
grid.plot(ax=gdf_dav.plot(color='blue'), color='none', edgecolor='red')
# The CRS is already set; this call is kept so `grids` remains a separate copy.
grids = grid.set_crs("EPSG:2274")
# Clip the grid cells to the county outline.
dav_grids = gpd.overlay(gdf_dav, grids, how='intersection')


# Sequential id for every clipped grid cell.
dav_grids['row_num'] = np.arange(len(dav_grids))
dav_grids2 = dav_grids.to_crs("EPSG:4326")

# fp = os.path.join('data', '1x1grids_davidson.pkl')
# dav_grids2.to_pickle(fp)

# Gets all route geometries. This would probably be easier from GTFS, but it did not have all the files I needed (or I missed them).

In [None]:
# Read the processed APC data and build, per trip, the ordered lists of
# longitudes and latitudes that describe the route.
# NOTE(review): this path climbs two more directories, unlike the other
# `data/...` paths in this file -- confirm it is intentional.
f = os.path.join('..', '..', 'data', 'processed_parquet_JP_all')
apcdata = spark.read.load(f)

# `sort()` followed by `groupby().agg(collect_list(...))` does not guarantee the
# order of the collected elements after the shuffle, and two separate
# `collect_list` calls drop nulls independently, which can misalign lon/lat
# pairs. Instead, collect (arrival_time, lon, lat) structs and sort the array:
# `sort_array` orders structs by their fields, i.e. by arrival_time first.
# Rows missing either coordinate are dropped up front so pairs stay aligned;
# a trip with no complete coordinate pair no longer appears in the output.
has_coordinates = (
    F.col("map_longitude").isNotNull() & F.col("map_latitude").isNotNull()
)
apcdata = (
    apcdata
    .where(has_coordinates)
    .groupby('transit_date', 'trip_id')
    .agg(
        F.sort_array(
            F.collect_list(
                F.struct("arrival_time", "map_longitude", "map_latitude")
            )
        ).alias("stops")
    )
    .select(
        "trip_id",
        F.col("stops.map_longitude").alias("map_longitude"),
        F.col("stops.map_latitude").alias("map_latitude"),
    )
)

# Keep a single (arbitrary) service date per trip_id.
apcdata = apcdata.dropDuplicates(['trip_id'])
def create_lineString(x):
    """Build a LineString from a row's paired longitude/latitude sequences.

    `x` must expose `map_longitude` and `map_latitude`; the two are zipped
    into (longitude, latitude) points in their existing order.
    """
    points = list(zip(x.map_longitude, x.map_latitude))
    return LineString(points)

# Bring the per-trip coordinate lists to pandas and turn each row into a
# LineString.
apcdf = apcdata.toPandas()
apcdf['geometry'] = apcdf.apply(create_lineString, axis=1)
# `toPandas()` returns a plain pandas DataFrame, which has no `set_geometry`;
# wrap it in a GeoDataFrame and declare the geometry column explicitly.
# NOTE(review): no CRS is assigned; the coordinates look like lon/lat
# (EPSG:4326) -- confirm before setting one.
apcdf = gpd.GeoDataFrame(apcdf, geometry='geometry')
# The raw coordinate lists are now redundant with the geometry column.
apcdf = apcdf.drop(columns=['map_longitude', 'map_latitude'])
# fp = os.path.join('data', 'route_geoms_df.pkl')
# apcdf.to_pickle(fp)

# Gets all XDSegIDs (Inrix segments) used in the trips

In [None]:
# Load the INRIX segment table from its pickle.
# NOTE: unpickling runs arbitrary code; only load files from a trusted source.
fp = os.path.join('data', 'inrix_grouped.pkl')
with open(fp, "rb") as fh:
    inrix_segment_df = pickle.load(fh)

inrix_segment_df = inrix_segment_df.set_geometry('geometry')
# Restrict to segments labelled as Davidson County.
in_davidson = inrix_segment_df['County_inrix'] == 'davidson'
inrix_segment_df = inrix_segment_df[in_davidson]
# Unique segment ids of the remaining rows.
davidson_segs = inrix_segment_df['XDSegID'].unique().tolist()

In [None]:
# Rtree would be faster
def find_grids_intersecting(gdf, linestring):
    """Return the rows of `gdf` whose geometry intersects `linestring`.

    The spatial index of `gdf` first narrows the search to rows matching the
    linestring's bounding box; an exact `intersects` test is then applied to
    those candidates only.
    """
    candidate_positions = list(gdf.sindex.intersection(linestring.bounds))
    candidates = gdf.iloc[candidate_positions]
    hits = candidates.intersects(linestring)
    return candidates[hits]

In [None]:
# Load the pickled 1x1 mile grid cells and mark `geometry` as the active
# geometry column.
fp = os.path.join('data', '1x1grids_davidson.pkl')
grids_df = pd.read_pickle(fp).set_geometry('geometry')

In [None]:
# Get route linestring data
# Loads the table pickled as `route_geoms_df.pkl`; later code looks rows up by
# `trip_id` and reads their `geometry` value.
# NOTE: read_pickle unpickles the file; only load pickles from a trusted source.
fp = os.path.join('data', 'route_geoms_df.pkl')
trip_id_geom_data = pd.read_pickle(fp)
# Preview the first row (shown as the cell output in a notebook).
trip_id_geom_data.head(1)

In [None]:
# Getting segments in trips
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
# Load the trip-level table, discard rows with missing values, keep the first
# row per (trip_id, route_id_direction) and renumber the index from zero.
fp = os.path.join('data', 'triplevel_df_processed_MAIN_NOTEBOOK.pickle')
df = (
    pd.read_pickle(fp)
    .dropna()
    .drop_duplicates(subset=['trip_id', 'route_id_direction'], keep='first')
    .reset_index(drop=True)
)

# Number of workers / index chunks used for the fan-out below.
CORES = cpu_count()

def merge_cluster(idx):
    """Collect the XDSegIDs of the segments near the trips at positions `idx`.

    For every trip selected from `df` by position, the trip's route linestring
    is looked up in `trip_id_geom_data`, the grid cells of `grids_df` that the
    route intersects are found, and every segment of `inrix_segment_df` lying
    within the union of those cells is recorded.

    Trips with no geometry, or whose route touches no grid cell, are reported
    and skipped instead of aborting the whole chunk.

    Args:
        idx: positional indices into `df` (one chunk of the work split).

    Returns:
        list: unique XDSegID values found for the chunk (unordered).
    """
    used_segments = set()
    for trip_id in df.iloc[idx].trip_id.tolist():
        trip_geoms = trip_id_geom_data.loc[
            trip_id_geom_data['trip_id'] == trip_id, 'geometry'
        ]
        # An unknown trip id yields an empty selection; treat it like a
        # missing geometry rather than raising IndexError on `.values[0]`.
        route_linestring = trip_geoms.values[0] if len(trip_geoms) else None
        if route_linestring is None:
            print("trip id LS not found.")
            continue

        route_grids = find_grids_intersecting(grids_df, route_linestring)
        if route_grids.empty:
            print("route grids for trip not found.")
            continue

        # Segments fully contained in the cells the route passes through.
        inside_route_grids = inrix_segment_df['geometry'].within(
            route_grids.unary_union
        )
        route_segments = inrix_segment_df.loc[
            inside_route_grids, 'XDSegID'
        ].tolist()
        if not route_segments:
            print("route segments for trip not found.")

        # A set de-duplicates incrementally instead of rebuilding a list per
        # trip.
        used_segments.update(route_segments)
    return list(used_segments)
# Split the trip index into one chunk per worker.
o_index_group = np.array_split(df.index, CORES)

# Run merge_cluster on every chunk in a thread pool and gather the
# per-chunk segment lists.
with ThreadPoolExecutor(max_workers=CORES) as pool:
    results = list(pool.map(merge_cluster, o_index_group))

# Flatten the per-chunk lists, then de-duplicate across chunks.
out = []
for chunk_segments in results:
    out.extend(chunk_segments)
results = list(set(out))

# Persist the unique segment ids.
fp = os.path.join('data', 'XDSegIDs_for_all_trips.pkl')
with open(fp, 'wb') as f:
    pickle.dump(results, f)