In [1]:
#import necessary dependencies 
import pandas as pd
import numpy as np
import requests
from sodapy import Socrata


In [17]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.austintexas.gov", None)

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
#Runs fast if you just get a sample of 2000, I tried using "get_all" instead of "get" and removing the limit,
#but after 30 minutes it still had not finished. Probably best to just download the crime data. 
results = client.get("fdj4-gpfu", limit=2000)

# Convert to pandas DataFrame
data = pd.DataFrame.from_records(results)



In [12]:
#read in csv 
data.shape

(2000, 31)

In [None]:
#code to view all rows
pd.set_option("display.max_rows", None)
#removing spaces in column names 
data.columns = data.columns.str.replace(' ','_')
#preview data
data.sample(5)

Unnamed: 0,Incident Number,Highest Offense Description,Highest Offense Code,Family Violence,Occurred Date Time,Occurred Date,Occurred Time,Report Date Time,Report Date,Report Time,...,Census Tract,Clearance Status,Clearance Date,UCR Category,Category Description,X-coordinate,Y-coordinate,Latitude,Longitude,Location
658251,20063610581,THEFT,600,N,12/27/2006 10:20:00 AM,12/27/2006,1020.0,12/27/2006 10:27:00 AM,12/27/2006,1027.0,...,8.0,,05/24/2007,23H,Theft,3117639.0,3117639.0,30.271098,-97.732186,"(30.27109764, -97.73218643)"
1590250,2007210432,THEFT,600,N,01/21/2007 04:23:00 AM,01/21/2007,423.0,01/21/2007 04:23:00 AM,01/21/2007,423.0,...,13.0,C,01/27/2007,23H,Theft,3105949.0,3105949.0,30.227483,-97.770385,"(30.22748277, -97.77038546)"
993231,2017311292,CRIMINAL TRESPASS,2716,N,01/31/2017 04:11:00 PM,01/31/2017,1611.0,01/31/2017 04:11:00 PM,01/31/2017,1611.0,...,12.0,,,,,3109626.0,3109626.0,30.275222,-97.757467,"(30.27522226, -97.75746653)"
2089381,2019671420,CRIMINAL TRESPASS,2716,N,03/08/2019 04:22:00 PM,03/08/2019,1622.0,03/08/2019 04:22:00 PM,03/08/2019,1622.0,...,21.12,C,03/12/2019,,,3133344.0,3133344.0,30.305167,-97.681487,"(30.30516695, -97.68148699)"
1968859,20065007037,BURGLARY OF VEHICLE,601,N,02/07/2006 07:30:00 PM,02/07/2006,1930.0,02/07/2006 10:39:00 PM,02/07/2006,2239.0,...,11.0,,05/12/2006,23F,Theft,3112462.0,3112462.0,30.265594,-97.748738,"(30.26559361, -97.74873775)"
200289,20102531740,RUNAWAY CHILD,4100,N,09/10/2010 05:45:00 PM,09/10/2010,1745.0,09/10/2010 05:45:00 PM,09/10/2010,1745.0,...,24.12,N,09/13/2010,,,3113763.0,3113763.0,30.18749,-97.746708,"(30.18749017, -97.74670817)"
623271,20052361086,AUTO THEFT,700,N,08/14/2005 01:00:00 PM,08/14/2005,1300.0,08/24/2005 01:56:00 PM,08/24/2005,1356.0,...,3.0,C,09/02/2005,240,Auto Theft,3124212.0,3124212.0,30.311194,-97.71027,"(30.3111939, -97.71026952)"
629681,20185024708,THEFT,600,N,06/30/2018 11:10:00 AM,06/30/2018,1110.0,06/30/2018 11:10:00 AM,06/30/2018,1110.0,...,23.0,N,07/10/2018,23H,Theft,3117278.0,3117278.0,30.248501,-97.733939,"(30.24850066, -97.73393875)"
498599,20212610502,AUTO THEFT,700,N,09/17/2021 11:30:00 PM,09/17/2021,2330.0,09/18/2021 08:51:00 AM,09/18/2021,851.0,...,330.0,N,09/21/2021,240,Auto Theft,3068306.0,3068306.0,30.230774,-97.889523,"(30.23077391, -97.88952263)"
891524,20192451722,BURGLARY OF VEHICLE,601,N,09/02/2019 10:37:00 PM,09/02/2019,2237.0,09/02/2019 10:37:00 PM,09/02/2019,2237.0,...,16.0,N,09/23/2019,23F,Theft,3111366.0,3111366.0,30.303317,-97.751201,"(30.30331736, -97.75120142)"


In [None]:
#check data types
data.dtypes

Incident Number                  int64
Highest Offense Description     object
Highest Offense Code             int64
Family Violence                 object
Occurred Date Time              object
Occurred Date                   object
Occurred Time                  float64
Report Date Time                object
Report Date                     object
Report Time                    float64
Location Type                   object
Address                         object
Zip Code                       float64
Council District               float64
APD Sector                      object
APD District                    object
PRA                             object
Census Tract                   float64
Clearance Status                object
Clearance Date                  object
UCR Category                    object
Category Description            object
X-coordinate                   float64
Y-coordinate                   float64
Latitude                       float64
Longitude                

In [None]:
#create a year column for occurred date time 
data['year'] = pd.DatetimeIndex(data['Occurred Date Time']).year

In [None]:
#filter dataframe to be between 2018-2021
data = data[data['year']>=2018]
data = data[data['year']<=2021]

433

In [None]:
#how many unique values are there in highest offense description?
data['Highest Offense Description'].nunique()

In [None]:
#check value counts for each column
data['Highest Offense Description'].value_counts()

In [None]:
#check null values
#do we want to drop null values or do we want to drop the columns? 
data.isnull().sum(axis = 0)

Incident Number                     0
Highest Offense Description         0
Highest Offense Code                0
Family Violence                     0
Occurred Date Time                  0
Occurred Date                       0
Occurred Time                       0
Report Date Time                    0
Report Date                         0
Report Time                         0
Location Type                    1127
Address                             0
Zip Code                         2682
Council District                 4472
APD Sector                       1692
APD District                     1775
PRA                              2187
Census Tract                     3747
Clearance Status                47729
Clearance Date                  47739
UCR Category                   249545
Category Description           249545
X-coordinate                     2080
Y-coordinate                     2080
Latitude                         7493
Longitude                        7493
Location    

In [None]:
#drop unneccessary columns for our analysis 
clean_df= data.drop(["Council_District","APD_Sector","APD_District","PRA"],axis = 1)

In [None]:
#checking null valules of latitude -- why does rape disapear when latitude null values are dropped? 
lat_lookup = clean_df.loc[clean_df["Latitude"].isnull()].copy()

In [None]:
#looking at occurances of rape within null latitude values 
lat_lookup["Category_Description"].value_counts()

Rape                  2215
Aggravated Assault     128
Theft                   65
Burglary                54
Robbery                 20
Auto Theft               4
Name: Category_Description, dtype: int64

In [None]:
#dropping nulls of important columns -- dropping latitude will remove rape occurances 
clean_df = clean_df.dropna(axis=0, subset=["Location_Type","Zip_Code","Census_Tract"
                                           ,"Latitude"])


In [None]:
clean_df["Category_Description"].unique()

array(['Theft', nan, 'Burglary', 'Aggravated Assault', 'Rape', 'Robbery',
       'Auto Theft', 'Murder'], dtype=object)

In [None]:
(clean_df["Category_Description"] == "Rape").count()

400299

In [None]:
#replacing nulls for UCR_Category with zero 
clean_df[["UCR_Category"]] = clean_df[["UCR_Category"]].fillna('0')

In [None]:
#replacing nulls with "None"
clean_df[["Category_Description"]] = clean_df[["Category_Description"]].fillna('None')
#replacing nulls with "U" for unkown 
clean_df[["Clearance_Status"]] = clean_df[["Clearance_Status"]].fillna('U')
#replacing nulls with a random date so its obvious on visualizaitons 
clean_df[["Clearance_Date"]] = clean_df[["Clearance_Date"]].fillna('01/01/1888')

In [None]:
#check nulls
clean_df.isnull().sum(axis = 0)

In [None]:
#convert occurred date time to datetime dtype
df['Occurred_Date_Time'] =  pd.to_datetime(df['Occurred_Date_Time'])
#create an hour column for the ocurred time
df['hour'] = pd.DatetimeIndex(df['Occurred_Date_Time']).hour
df['hour'].head()


In [None]:
clean_df["Category_Description"].unique()

array(['Theft', 'None', 'Burglary', 'Aggravated Assault', 'Rape',
       'Robbery', 'Auto Theft', 'Murder'], dtype=object)

In [None]:
len(clean_df)

400299

In [None]:
#convert clean_df to csv 
clean_df.to_csv(r'Resources/2018To2021CrimeData_clean.csv')