# Refining data
Filtering and parsing the raw CSV files, then saving new, smaller, better-formatted CSV files that work better with QGIS and other tools.

Creating two sets of files:

### 1. Relevant Data Only
- Paring away anything not related to pedestrian and bike safety
- Matching neighborhood councils to each GPS coordinate
### 2. QGIS-friendly files
- Even smaller, restricted to just the Wilshire NC area and slightly beyond, for performance.
- Contains the geometry fields QGIS needs.


## Workbook setup
Load modules, identify file and directory locations, define useful methods

In [None]:
import pandas as pd
import os
import geopandas as gpd
from sklearn.cluster import DBSCAN

# When this runs as a plain script, switch to the directory containing the
# file so the relative data paths below resolve. Inside a notebook `__file__`
# is not defined (NameError); in that case, or if the directory cannot be
# entered (OSError), keep the current working directory.
try:
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
except (NameError, OSError):
    pass

# Directory holding the raw inputs, and the file names inside it
rawdatadir = '../Data/1-raw-data/'
rawroaddatacsv = 'raw-road-data.csv'
rawsigndatacsv = 'raw-sign-data.csv'
rawcollisiondatacsv = 'raw-collision-data.csv'
neighborhoodcouncilshapesgeojson = 'neighborhoodcouncils.geojson'

# Directory the refined and QGIS CSV files are written to
refineddatadir = '../Data/2-refined-data/'

### Geography Tools
Define methods to:

1. Identify which neighborhood council each data row falls under

2. Filter rows that are in or near the Wilshire NC.

#### Identify Councils

In [None]:
# Neighborhood council boundary shapes, read once and reused by every join
councilshapes = gpd.read_file(rawdatadir + neighborhoodcouncilshapesgeojson)


def identifyneighborhoodcouncil(geodataframe: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Add a 'council' column naming the neighborhood council for each row.

    The rows of ``geodataframe`` are spatially joined against
    ``councilshapes`` with a left join, so rows that match no council shape
    are kept. The council file's 'NAME' field becomes 'council'; its other
    fields, and the join's 'index_right' helper column, are dropped.
    """
    # Council fields (plus the join's bookkeeping column) we don't need.
    unneededfields = [
        'index_right',
        'OBJECTID',
        'WADDRESS',
        'DWEBSITE',
        'DEMAIL',
        'DPHONE',
        'NC_ID',
        'CERTIFIED',
        'TOOLTIP',
        'NLA_URL',
        'SERVICE_RE',
    ]
    joined = gpd.sjoin(geodataframe, councilshapes, how="left")
    # More descriptive name for the neighborhood council field
    named = joined.rename(columns={'NAME': 'council'})
    return named.drop(columns=unneededfields)

#### Filter for only data in Wilshire NC or neighboring NCs.

In [None]:
def wilshireandadjacent(
    geodata: gpd.GeoDataFrame, councilstoinclude=None
) -> gpd.GeoDataFrame:
    """Return a copy of the rows whose 'council' is one of the wanted councils.

    Args:
        geodata: Frame with a 'council' column (as added by
            identifyneighborhoodcouncil).
        councilstoinclude: Optional list-like of council names to keep.
            When omitted, Greater Wilshire NC and the councils listed
            below are used.

    Returns:
        An independent copy of the matching rows, so later edits to the
        result never touch ``geodata``. Rows with a missing council are
        excluded.
    """
    if councilstoinclude is None:
        councilstoinclude = [
            'GREATER WILSHIRE NC',
            'CENTRAL HOLLYWOOD NC',
            'HOLLYWOOD STUDIO DISTRICT NC',
            'MID CITY WEST CC',
            'OLYMPIC PARK NC',
            'P.I.C.O. NC',
            'WILSHIRE CENTER - KOREATOWN NC',
        ]
    indexesofwantedcouncils = geodata['council'].isin(councilstoinclude)
    return geodata[indexesofwantedcouncils].copy()

# Creating CSV files of most relevant data only

## Mapillary files
We keep only the fields relevant to us, and only the objects that Mapillary has tagged with one of the pedestrian- and bike-related values listed below.

In [None]:
# Columns to keep
relevantmapillarycolumns = [
    "last_seen_at",
    "key",
    "value",
    "image_keys",
    "latitude",
    "longitude",
]

# Physical road feature categories we care about
physicalfeaturecategories = [
    "object--traffic-light--pedestrians",
    "marking--discrete--crosswalk-zebra",
    "construction--flat--crosswalk-plain",
]

# Sign categories we care about (each value listed once)
# NOTE(review): 'regulatory--crosswalk-stop-on-red--g1' used to appear twice
# in this list; confirm no other variant of that sign was intended.
signcategories = [
    'regulatory--pedestrians-push-button--g1',
    'regulatory--pedestrians-push-button--g2',
    'regulatory--bicycles-push-button--g1',
    'regulatory--bicycles-push-button--g2',
    'regulatory--crosswalk-stop-on-red--g1',
    'regulatory--cross-only-on-pedestrian-signal--g1',
    'regulatory--cross-only-on-green--g1',
    'regulatory--use-crosswalk--g1',
    'regulatory--in-street-pedestrian-crossing--g1',
    'regulatory--turning-vehicles-yield-to-pedestrians--g1',
    'regulatory--stop-here-on-red-or-flashing-light--g1',
    'regulatory--stop-here-on-red-or-flashing-light--g2',
    'regulatory--pedestrians-priority-zone--g1',
    'warning--pedestrians-crossing--g4',
]

### Create filtered and reorganized files of Mapillary data

In [None]:
# Refine each raw Mapillary CSV using the category list that applies to it.
# Every pass writes two files: a "refined" CSV with all relevant rows, and a
# smaller "qgis" CSV limited to councils in or near Wilshire.
for mapillarycsv, categories in (
    (rawroaddatacsv, physicalfeaturecategories),
    (rawsigndatacsv, signcategories),
):
    # Filter only the relevant rows and columns.
    # Only columns named in usecols are loaded, so only their converters apply.
    rawdata = pd.read_csv(
        rawdatadir + mapillarycsv,
        usecols=relevantmapillarycolumns,
        converters={'last_seen_at': pd.to_datetime},
    )
    relevantrowindexes = rawdata['value'].isin(categories)
    # Copy so the geodataframe below is built from its own data, not a slice of rawdata
    filtereddata = rawdata[relevantrowindexes].copy()

    # Join with neighborhood council shapefile to add NC to rows
    filteredgeoframe = gpd.GeoDataFrame(
        filtereddata,
        geometry=gpd.points_from_xy(
            filtereddata['longitude'], filtereddata['latitude'], crs="epsg:4326"
        ),
        crs="epsg:4326"
    )
    filteredwcouncils = identifyneighborhoodcouncil(filteredgeoframe)

    # Save file of pedestrian-relevant data only
    filteredfilelocation = refineddatadir + mapillarycsv.replace('raw', 'refined')
    filteredwcouncils.to_csv(filteredfilelocation, index=False)
    print(f'Saved {len(filteredwcouncils)} filtered rows (of {len(rawdata)}) to {filteredfilelocation}')

    # Further filter for QGIS
    wilshireandadjacentonly = wilshireandadjacent(filteredwcouncils)

    # Save QGIS file
    qgisfilelocation = refineddatadir + mapillarycsv.replace('raw', 'qgis')
    wilshireandadjacentonly.to_csv(qgisfilelocation, index=False)
    print(f'Saved {len(wilshireandadjacentonly)} rows near Wilshire to {qgisfilelocation}')





## Collision Data
We're filtering for only collisions assigned the following MO Codes by LAPD:
- 3003 (Veh vs Ped)
- 3008 (Veh vs Bike)
- 3016 (Bike vs Veh)
- 3501 (Ped Actions)

In [None]:
# MO codes we keep, mapped to the short description that later becomes the
# name of a True/False column for that code.
relevantmocodes = {
    "3003": "Veh vs Ped",
    "3008": "Veh vs Bike",
    "3016": "Bike vs Veh",
    "3501": "Ped Actions",
}

# Columns to load from the raw collision CSV
relevantcolumns = [
    "dr_no",
    "date_rptd",
    "date_occ",
    "time_occ",
    "mocodes",
    "vict_age",
    "vict_sex",
    "premis_desc",
    "location",
    "cross_street",
    "location_1",
    ":id",
]

In [None]:
# Load only the columns we care about from the raw collision CSV.
# na_filter=False turns off pandas' missing-value detection for this read.
rawcollisiondata = pd.read_csv(
    rawdatadir + rawcollisiondatacsv,
    usecols=relevantcolumns,
    parse_dates=['date_rptd', 'date_occ'],
    na_filter=False,
)

# Keep only the rows whose MO codes mention at least one relevant code.
# Joining the codes with '|' gives a pattern matching any one of them.
anyrelevantmocode = '|'.join(relevantmocodes)
relevantMOcodesrowindexes = rawcollisiondata['mocodes'].str.contains(anyrelevantmocode)
filteredcollisiondata = rawcollisiondata[relevantMOcodesrowindexes].copy()

# One True/False column per relevant MO code, named after its description
for mo, modescription in relevantmocodes.items():
    filteredcollisiondata[modescription] = filteredcollisiondata['mocodes'].str.contains(mo)

# Fold time_occ into date_occ so a single datetime column remains.
# time_occ is treated as an HHMM number: the hundreds are hours, the rest minutes.
timeocc = filteredcollisiondata['time_occ']
hours = pd.to_timedelta((timeocc / 100).astype(int), unit="hours")
minutes = pd.to_timedelta(timeocc % 100, unit="minutes")
filteredcollisiondata['date_occ'] = filteredcollisiondata['date_occ'] + hours + minutes
filteredcollisiondata.drop(columns=['time_occ'], inplace=True)

# Replace the strangely formatted location column with separate latitude and
# longitude columns: trim the surrounding newline/space/comma/parenthesis
# characters, split on ', ', and convert both parts to numbers.
# NOTE(review): assumes the text is latitude first, then longitude -- confirm.
coordinatetext = filteredcollisiondata['location_1'].str.strip('\n ,()')
coordinateparts = coordinatetext.str.split(', ', expand=True)
filteredcollisiondata[['latitude', 'longitude']] = coordinateparts.apply(pd.to_numeric)
filteredcollisiondata.drop(columns=['location_1'], inplace=True)

# Create a geodataframe of collisions
collisionpoints = gpd.points_from_xy(
    filteredcollisiondata['longitude'],
    filteredcollisiondata['latitude'],
    crs="epsg:4326",
)
collisiongeoframe = gpd.GeoDataFrame(
    filteredcollisiondata,
    geometry=collisionpoints,
    crs="epsg:4326",
)

# Add the neighborhood council info to the collision dataframe
collisionswithcouncils = identifyneighborhoodcouncil(collisiongeoframe)

# Identify clusters of datapoints
# (the dataset is clustered around intersections, so this essentially allows
# grouping by nearest intersection.)
# eps is measured in the same units as the longitude/latitude columns, and
# min_samples=1 lets a single point form its own cluster.
clustering = DBSCAN(eps=.0006, min_samples=1)
clustering.fit(collisionswithcouncils[['longitude', 'latitude']])
collisionswithcouncils['cluster'] = clustering.labels_

# Save file of most relevant collision data only
refinedcollisiondatacsvlocation = refineddatadir + rawcollisiondatacsv.replace('raw', 'refined')
collisionswithcouncils.to_csv(refinedcollisiondatacsvlocation, index=False)
print(
    f'Saved {len(collisionswithcouncils)} rows (of {len(rawcollisiondata)}) to {refinedcollisiondatacsvlocation}'
)

# Filter further to only the data needed for QGIS
qgiscollisiondata = wilshireandadjacent(collisionswithcouncils)

# Save QGIS file
qgiscollisiondatacsvlocation = refineddatadir + rawcollisiondatacsv.replace('raw', 'qgis')
qgiscollisiondata.to_csv(qgiscollisiondatacsvlocation, index=False)
print(
    f'Saved {len(qgiscollisiondata)} rows (of {len(rawcollisiondata)}) to {qgiscollisiondatacsvlocation}'
)