# Advanced Coding Final Project

Written by Jiyun Tsai  
Data imported as of November 22nd, 2022

## Import Data

In [5]:
# Import data using pandas

import pandas as pd
import requests

In [6]:
# Optional: sodapy is only imported by the commented-out Socrata API example in the next cell.
# pip install sodapy

In [7]:
# Alternative (not used): fetch the data through the Socrata API with sodapy.
# API code instructions: https://dev.socrata.com/foundry/data.sfgov.org/ab4h-6ztd
#
# SECURITY: never hardcode an app token, username or password in a notebook.
# Read them from environment variables instead. The credentials that were
# previously written in this cell should be treated as exposed and rotated.

# import os
# import pandas as pd
# from sodapy import Socrata


# client = Socrata("data.sfgov.org",
#                  os.environ["SOCRATA_APP_TOKEN"],
#                  username=os.environ["SOCRATA_USERNAME"],
#                  password=os.environ["SOCRATA_PASSWORD"])

# Results are returned as JSON from the API / converted to a Python list of
# dictionaries by sodapy.
# TODO: limit="" is a placeholder -- open question: how to get the full data without limits?
# results = client.get("ab4h-6ztd", limit="")

# Convert to pandas DataFrame
# citation = pd.DataFrame.from_records(results)

In [8]:
# Download the full parking-citation CSV export from DataSF and save it locally
# as 'parking_citation.csv'.
url = 'https://data.sfgov.org/api/views/ab4h-6ztd/rows.csv?accessType=DOWNLOAD'

# Stream the body to disk in chunks: the export is about 2.8 GB, so reading it
# through r.content would hold the whole file in memory at once.
with requests.get(url, allow_redirects=False, stream=True) as r:
    # Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
    r.raise_for_status()

    # The context manager guarantees the file is flushed and closed, even on error.
    with open('parking_citation.csv', 'wb') as csv_file:
        bytes_written = 0
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            bytes_written += csv_file.write(chunk)

# Show the number of bytes written (same value the original one-liner displayed).
bytes_written

2792873651

In [9]:
# Load the downloaded CSV into a DataFrame called "citation"
citation = pd.read_csv(filepath_or_buffer='parking_citation.csv')

In [10]:
# Preview the loaded DataFrame; the recorded output shows only the first and last rows.
citation

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
0,771857995,10/27/2009 03:36:00 PM,T32A.1,TWAWY ZN#1,669 MISSION ST,CA,6ANP484,83.0,10/27/2009 12:00:00 AM,POINT (-122.40131583199997 37.78670342300006),32.0,32.0,1.0,10.0,8.0
1,771104246,09/14/2009 11:05:00 AM,V5204A,REG TABS,7 GROVE ST,CA,5FEK647,63.0,09/14/2009 12:00:00 AM,POINT (-122.41511664599994 37.77876024500006),21.0,21.0,5.0,10.0,36.0
2,770561131,09/10/2009 03:30:00 PM,T202,PRK METER,2851 24TH ST,CA,6DEG966,53.0,09/10/2009 12:00:00 AM,POINT (-122.40919778399996 37.752743407000025),53.0,53.0,3.0,2.0,20.0
3,770024076,09/07/2009 03:21:00 PM,T202,PRK METER,417 BAY ST,CA,6DKY758,53.0,09/07/2009 12:00:00 AM,POINT (-122.41413599899994 37.80540600300003),106.0,106.0,6.0,3.0,23.0
4,770025093,09/09/2009 01:23:00 PM,T58A,BLK WHEELS,136 LAWTON ST,CA,4JTL645,48.0,09/09/2009 12:00:00 AM,POINT (-122.46446999799997 37.75869600400006),109.0,109.0,10.0,8.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19037870,957069643,11/09/2022 01:20:00 PM,V5200,NO PLATES,2417 CLAY ST,CA,8AZF222,121.0,11/21/2022 12:00:00 AM,,,,,,
19037871,957026814,11/09/2022 07:10:00 AM,TRC7.2.22,STR CLEAN,899 TURK ST,FL,23AUUE,87.0,11/21/2022 12:00:00 AM,,,,,,
19037872,PD32608030,10/26/2022 01:52:00 PM,TRC7.2.42,RESTRICTED,LARKIN STREET/TURK STREET,CA,8SMY175,108.0,11/21/2022 12:00:00 AM,,,,,,
19037873,957037502,11/09/2022 01:04:00 PM,V5200,NO PLATES,1468 VALENCIA ST,CA,OVR0PEC,121.0,11/21/2022 12:00:00 AM,,,,,,


In [11]:
# Summarize the DataFrame: number of rows, column names, dtypes and memory usage.
citation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19037875 entries, 0 to 19037874
Data columns (total 15 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   Citation Number               object 
 1   Citation Issued DateTime      object 
 2   Violation                     object 
 3   Violation Description         object 
 4   Citation Location             object 
 5   Vehicle Plate State           object 
 6   Vehicle Plate                 object 
 7   Fine Amount                   float64
 8   Date Added                    object 
 9   geom                          object 
 10  Neighborhoods                 float64
 11  SF Find Neighborhoods         float64
 12  Current Police Districts      float64
 13  Current Supervisor Districts  float64
 14  Analysis Neighborhoods        float64
dtypes: float64(6), object(9)
memory usage: 2.1+ GB


In [12]:
# Tally how many citations were issued under each 'Violation' code
citation.loc[:, 'Violation'].value_counts()

TRC7.2.22     4740572
T37C          2397076
TRC7.2.23B    1315531
T202          1271179
TRC7.2.20     1125595
               ...   
V21107.8A           1
GO1.H.4             1
T315D               1
000181              1
T32.6.7             1
Name: Violation, Length: 305, dtype: int64

## Street Cleaning Data in 5 Years (2018-2022)
For street cleaning, the dataframe contains two codes in the 'Violation' column: TRC7.2.22 and T37C. All data from 2018 onward use TRC7.2.22.  
The dataframe is filtered before cleaning because the whole dataframe is too large to clean at once (19M rows of data).

*Code descriptions and more information can be found [here](https://www.sfmta.com/sites/default/files/reports-and-documents/2022/10/fy_2023_fees_and_fines_effective_7.1.22_1.pdf).

### Filter and Clean

In [13]:
# Keep only the street cleaning citations (violation code 'TRC7.2.22').
# .copy() gives an independent DataFrame so later column assignments do not touch `citation`.
citation_st_cleaning = citation.loc[
    lambda frame: frame['Violation'] == 'TRC7.2.22'
].copy()

In [14]:
# Convert 'Citation Issued DateTime' from text to datetime64.
# The raw values look like '10/27/2009 03:36:00 PM' (month/day/year, 12-hour clock with
# AM/PM), so the format is given explicitly. This avoids format guessing, which is slow
# on millions of rows, and removes any month/day ambiguity.
citation_st_cleaning['Citation Issued DateTime'] = pd.to_datetime(
    citation_st_cleaning['Citation Issued DateTime'],
    format='%m/%d/%Y %I:%M:%S %p',
)

In [15]:
# Keep street cleaning citations issued on or after 2018-01-01 (i.e. 2018 through the
# latest data available), as an independent copy
citation_st_cleaning_5yr = citation_st_cleaning.loc[
    lambda frame: frame['Citation Issued DateTime'] >= '2018-01-01'
].copy()

In [16]:
# Display the rows ordered by issue date and time.
# NOTE: the sorted result is only shown here, it is not assigned back, so
# citation_st_cleaning_5yr itself keeps its original row order.
citation_st_cleaning_5yr.sort_values('Citation Issued DateTime')

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
13854842,892597786,2018-01-02 00:03:00,TRC7.2.22,STR CLEAN,7 04TH ST,CA,8AGE461,73.0,10/15/2021 12:00:00 AM,POINT (-122.46585481399995 37.788244665000036),4.0,4.0,8.0,6.0,11.0
13862408,892536632,2018-01-02 00:08:00,TRC7.2.22,STR CLEAN,333 08TH ST,CA,7XSH956,73.0,09/24/2018 12:00:00 AM,POINT (-122.03654390799994 38.24835797800006),,,,,
13897742,892597790,2018-01-02 00:08:00,TRC7.2.22,STR CLEAN,869 FOLSOM ST,CA,6TZE100,73.0,04/30/2020 12:00:00 AM,POINT (-122.40263889499994 37.78079616600007),32.0,32.0,1.0,10.0,34.0
13871600,892597801,2018-01-02 00:09:00,TRC7.2.22,STR CLEAN,839 FOLSOM ST,CA,7XBP248,73.0,06/16/2018 12:00:00 AM,POINT (-122.40196506399997 37.781329424000035),32.0,32.0,1.0,10.0,34.0
13647894,892257855,2018-01-02 00:10:00,TRC7.2.22,STR CLEAN,225 TOWNSEND ST,CA,6CWY869,73.0,04/12/2021 12:00:00 AM,POINT (-122.39345305299997 37.77822802800006),34.0,34.0,1.0,10.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2835369,957285862,2022-11-16 12:44:00,TRC7.2.22,STR CLEAN,1601 BRODERICK STREET,CA,6XCJ079,87.0,11/22/2022 12:00:00 AM,,,,,,
2843303,957212222,2022-11-16 12:45:00,TRC7.2.22,STR CLEAN,1501 BRODERICK ST,CA,7JUR685,87.0,11/22/2022 12:00:00 AM,,,,,,
2819924,957148264,2022-11-16 13:12:00,TRC7.2.22,STR CLEAN,1160 REVERE AVENUE,CA,5P65273,87.0,11/22/2022 12:00:00 AM,,,,,,
2837276,957240712,2022-11-16 13:18:00,TRC7.2.22,STR CLEAN,2444 15TH AVE,CO,FLT388S,87.0,11/22/2022 12:00:00 AM,,,,,,


In [17]:
# Check duplicates: count the distinct values in every column. The count for
# 'Citation Number' can then be compared with the total number of rows.
citation_st_cleaning_5yr.nunique()

Citation Number                 2376891
Citation Issued DateTime         619908
Violation                             1
Violation Description                 1
Citation Location                599825
Vehicle Plate State                 100
Vehicle Plate                    970294
Fine Amount                          13
Date Added                         1241
geom                             245357
Neighborhoods                       117
SF Find Neighborhoods               117
Current Police Districts             10
Current Supervisor Districts         11
Analysis Neighborhoods               41
dtype: int64

In [18]:
# The row count equals the number of unique 'Citation Number' values reported by nunique(),
# so there are no duplicated rows for 'Citation Number'
citation_st_cleaning_5yr.shape[0]

2376891

In [19]:
# Save the filtered and cleaned street cleaning citations as .csv (index column omitted)
# for future analysis; data as of November 22, 2022
citation_st_cleaning_5yr.to_csv(path_or_buf='citation_st_cleaning_5yr.csv', index=False)

Cleaned data (.csv file) can be accessed at [Google Drive](https://drive.google.com/drive/folders/1SJ0jPphZWaPy7h72a2X3uj4rI_nGH8hH?usp=share_link)