In [3]:
""" Setting up libraries for project
    
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import folium as fm # Map Library: will only work with Jupyter since it needs a browser
from folium.plugins import FastMarkerCluster # To add map markers
from pyproj import Proj, transform


""" Settings for numpy and Pandas
"""
#Numpy:
np.set_printoptions(threshold = np.inf, linewidth = 500, suppress = True)

#Pandas
pd.set_option("display.max_rows"	, 	3000)
pd.set_option("display.max_columns"	, 	3000)
pd.set_option("display.width"		,	1000)
pd.set_option("display.float_format", 	lambda x: "%.3f' % x")




In [8]:
"""Parsing for the CSV, read in all data from CSV 
"""
def ParseFile(file_in, file_out, batchsize):
    """ 
    Recieve CSV file, parse data, then output new CSV organized with wanted data
    Parameter: file_in, file_out, batchsize (for larger CSVs if wanted >1mill entries)
    Return: Outputs new CSV with desired data, nothing returned from function
    """
    # Datafram for CSV
    d = pd.read_csv(file_in, delimiter = ',', header = 0, nrows = 0, dtype = str)
    dat_recent = pd.DataFrame(data = d)

    # The columns that we won't be using can be dropped
    dat_recent = dat_recent.drop(['School Zone', 'Construction Zone', 'Case Closed', 'Agency', 'Officer Code'], axis=1)

    i = 0
    not_done = True
    # Do in batches, if more than 1 million samples
    while not_done:

        print('processing batch', i, ', samples processed: ', i * batchsize)

        # load in batches of 1-million entries for processing per pass
        dat = pd.read_csv(file_in, delimiter = ',', header = 0, nrows = batchsize, skiprows = range(1, i*batchsize),
                          dtype = str)

        # Drop the columns that we don't need to save on space
        dat = dat.drop(['School Zone', 'Construction Zone', 'Case Closed', 'Agency', 'Officer Code'], axis=1)


        # if the batch has less than 1-million entries then we know that this is the last pass
        i+=1
        if len(dat) < batchsize:
            not_done = False

        # replace the emply fields with "0"
        dat = dat.replace(np.nan, '0')

        # Extract 2018 stuff
        dat = dat[dat['Offense Date'].str.contains('2018')]
        dat_recent = dat_recent.append(dat)

    # print(dat_recent)
    dat_recent.to_csv(file_out)
    print('done, new .csv saved as', file_out)
    



In [15]:
"""Main Program
    Runs mapping library
"""

# To keep track of how long it takes
start_time = time.time()

# No need to run either if the csv's are already made
batchsize = 150000
trim_data = True

if trim_data == True:
    trimData('Municipal_Court_Caseload_Information_FY_2018.csv', 'Output2018.csv', batchsize)

# Load in the cleaned data as its own dataframe to work with
print('Loading data...')
working_data = pd.read_csv('Output2018.csv', delimiter=',', header=0, dtype = object)

# Rename the first column which is duplicated upon loading the csv (csv saves the index and loading it in adds an index)
working_data = working_data.rename(columns = {'Unnamed: 0' : 'Index'})

print('Data loaded, producing figures and maps...')

# - - - - - - - - - - - - - - - - For the map: - - - - - - - - - - - - - - - -

# Take the coorinates as their own dataframe so it can be manipulated, no need to throw out the rows with bad
# coordinate data from the full data set since they may contain other useful info

cbatch = 40000 # Number of previous incidents to plot, anything over this will bog down the map
coords = (working_data.loc[(len(working_data) - cbatch):, Offense Street Name + 'Austin']).astype(string)


# coords are in x/y and we want lat/long, this is from the pyproj documentation
pm = '+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 ' \
     '+y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs'

# convert to lat/long
x_in,y_in = coords['Latitude'].values, coords['Longitude'].values
lat,long = transform(Proj(pm, preserve_units = True), Proj("+init=epsg:4326"), x_in,y_in)

LA_coords = [34.05 , -118.24]
m = fm.Map(location=LA_coords, zoom_start=10.5)

# add map markers, plots as "long/lat" rather than "lat/long"
FastMarkerCluster(data=list(zip(long, lat))).add_to(m)
display(m) 

SyntaxError: invalid syntax (<ipython-input-15-7ccbb1f4396d>, line 39)

processing batch 0 , samples processed:  0


ValueError: invalid literal for int() with base 10: '/2'