In [1]:
import copy
import pathlib

import pandas as pd
from preprocessing.pipeline import (
    combine_data_NYPD,
    get_preprocessed_data,
    preprocess_311,
    process_crime_data,
    query_data,
)
from preprocessing.weather_parse import weather_parse


In [14]:
year = 2017
preprocessed_dir = pathlib.Path(f"../data/combined/year={year}")

try:
    df = pd.read_parquet(
        preprocessed_dir,
        )
    assert len(df) != 0
except AssertionError:
    df = get_preprocessed_data(
        start_date=year,
        end_date=year+1,
        sectors=False,
        opened_created_add=['borough', 'precinct', 'complaint_type'],
    ).convert_dtypes(dtype_backend='pyarrow')
    df.to_parquet(preprocessed_dir, partition_cols=['borough', 'precinct'])
except FileNotFoundError:
    preprocessed_dir.mkdir()
    df = get_preprocessed_data(
        start_date=year,
        end_date=year+1,
        sectors=False,
        opened_created_add=['borough', 'precinct', 'complaint_type'],
    ).convert_dtypes(dtype_backend='pyarrow')

    df.to_parquet(preprocessed_dir, partition_cols=['borough', 'precinct'])
except ValueError:
    boroughs = []
    for file_path in preprocessed_dir.glob('*'):

        if file_path.is_dir():
            try:
                boroughs.append(pd.read_parquet(file_path))
                # print(df)
            except Exception:
                print(file_path)

    df = pd.concat(boroughs)
df

Querying 311 data...


Unnamed: 0,borough,precinct,complaint_type,created_H,closed_H,created_date,closed_date,agency,descriptor,status,...,precip_period_hrs,precip_accumulation_mm,direction_deg,speed_mps,dew_temperature_c,year,date_H,FELONY,MISDEMEANOR,VIOLATION
0,BRONX,40.0,Noise - Residential,2018-01-01 00:00:00,2018-01-01 12:00:00,2018-01-01 00:53:55,2018-01-01 12:36:09,NYPD,Loud Music/Party,Closed,...,,,320,8.2,-18.3,2018,2018-01-01 00:00:00,7.0,8.0,2.0
1,BRONX,40.0,Noise - Residential,2018-01-01 00:00:00,2018-01-01 13:00:00,2018-01-01 00:23:08,2018-01-01 13:20:33,NYPD,Loud Music/Party,Closed,...,,,320,8.2,-18.3,2018,2018-01-01 00:00:00,7.0,8.0,2.0
2,BRONX,42.0,Noise - Residential,2018-01-01 00:00:00,2018-01-01 03:00:00,2018-01-01 00:11:41,2018-01-01 03:49:38,NYPD,Loud Music/Party,Closed,...,,,320,8.2,-18.3,2018,2018-01-01 00:00:00,1.0,8.0,1.0
3,BRONX,43.0,Blocked Driveway,2018-01-01 00:00:00,2018-01-01 01:00:00,2018-01-01 00:58:30,2018-01-01 01:52:26,NYPD,Partial Access,Closed,...,,,320,8.2,-18.3,2018,2018-01-01 00:00:00,6.0,15.0,5.0
4,BRONX,44.0,Noise - Residential,2018-01-01 00:00:00,2018-01-01 10:00:00,2018-01-01 00:37:19,2018-01-01 10:39:12,NYPD,Loud Music/Party,Closed,...,,,320,8.2,-18.3,2018,2018-01-01 00:00:00,11.0,13.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1576302,QUEENS,104.0,Noise - Residential,2019-12-31 23:00:00,2019-12-31 23:00:00,2019-12-31 23:11:03,2019-12-31 23:47:57,NYPD,Loud Music/Party,Closed,...,,,,,,,,,,
1576303,QUEENS,108.0,Noise - Residential,2019-12-31 23:00:00,2019-12-31 23:00:00,2019-12-31 23:09:15,2019-12-31 23:14:38,NYPD,Loud Music/Party,Closed,...,,,,,,,,,,
1576304,QUEENS,112.0,Illegal Parking,2019-12-31 23:00:00,2019-12-31 23:00:00,2019-12-31 23:01:19,2019-12-31 23:20:27,NYPD,Blocked Hydrant,Closed,...,,,,,,,,,,
1576305,QUEENS,112.0,Noise - Residential,2019-12-31 23:00:00,2019-12-31 23:00:00,2019-12-31 23:18:27,2019-12-31 23:20:06,NYPD,Loud Music/Party,Closed,...,,,,,,,,,,


In [None]:
import datetime
import pathlib

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from lifelines import CoxPHFitter
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sksurv.ensemble import (
    ComponentwiseGradientBoostingSurvivalAnalysis,
    GradientBoostingSurvivalAnalysis,
    RandomSurvivalForest,
)
from statsmodels.duration.hazard_regression import PHReg, PHRegResults
import multiprocessing as mp
from preprocessing.pre_survival import strata_threshold_remove, process_categorical

ImportError: cannot import name 'strata_threshold_remove' from 'preprocessing.pre_survival' (c:\Users\Morri\Documents\Notebooks\Capstone\src\preprocessing\pre_survival.py)

In [6]:
n_jobs = int(mp.cpu_count() * 0.9)
seed = 14
rsf = RandomSurvivalForest(max_depth=3, min_samples_leaf=50, n_jobs=n_jobs, random_state=14)

In [None]:
remove_cols = {
    'hours_to_complete', 'descriptor', 'resolution_description',
       'resolution_action_updated_date',
       'incident_zip', 'city', 'bbl','status',
       'closed_H','created_date',
       'closed_date','sector',
    #    'latitude', 'longitude',
       'due_date','created','date_H',
       'created', 'created_bo', 'created_ag', 'created_co',
       'created_bo_ag', 'open', 'open_bo', 'open_ag', 'open_co',
       'open_bo_ag','open_bo_co','precip_period_hrs', 'precip_accumulation_mm','direction_deg',
    #    'created_bo_co','temperature_c','speed_mps','dew_temperature_c'

}

In [8]:
X = df[[c for c in df.columns if c not in remove_cols]]
rsf.fit(X, df.hours_to_complete)

ValueError: could not convert string to float: 'BRONX'