# 311 Data Dive

In [1]:
#------------------------------------------------#
# Dependencies
#------------------------------------------------#
import os
import csv
import pandas as pd
import numpy as np
import requests
import random

In [2]:
#------------------------------------------------#
# CSV Time
#------------------------------------------------#

# Store the CSV filepath in a variable
the311 = "Resources/311SI2018.csv"

# Now, read the data from the 311 CSV into a DataFrame
the311_pd = pd.read_csv(the311, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')

# Preview the DataFrame, but only 5 rows
the311_pd.head(5)

# Oh, look! There's some NaN's →→→→→→→→→→→→
#                                                                                                  ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
0,41280106,12/27/2018 8:00,12/31/2018 12:57,DSNY,Department of Sanitation,Electronics Waste,missed collection for E-waste,Sidewalk,10306,176 BURBANK AVENUE,...,,,,,,,,40.5746716,-74.11288975,"(40.57467159968207, -74.11288975334158)"
1,41280107,12/27/2018 5:43,12/31/2018 12:53,DSNY,Department of Sanitation,Electronics Waste,missed collection for E-waste,Sidewalk,10314,80 ELMIRA AVENUE,...,,,,,,,,40.61987812,-74.1288768,"(40.61987811855055, -74.12887680202681)"
2,41280177,12/27/2018 19:08,12/31/2018 13:02,DSNY,Department of Sanitation,Electronics Waste,missed collection for E-waste,Sidewalk,10312,279 GRANTWOOD AVENUE,...,,,,,,,,40.55223947,-74.1824105,"(40.552239472961354, -74.18241050471549)"
3,38733801,3/19/2018 11:46,4/21/2018 8:50,DOF,Personal Exemption Unit,DOF Property - Reduction Issue,Personal SCHE Exemption,"1-, 2- and 3- Family Home",10308,,...,,,,,,,,,,
4,40155737,8/31/2018 11:52,9/4/2018 13:04,DSNY,Department of Sanitation,Electronics Waste,Recycling Electronics,Sidewalk,10305,70 FATHER CAPODANNO BOULEVARD,...,,,,,,,,40.59438732,-74.06292307,"(40.59438732080405, -74.06292307439475)"


In [3]:
#------------------------------------------------#
# Time to check out the data!
#------------------------------------------------#

# Get the counts
the311_pd.count()

Unique Key                        151177
Created Date                      151177
Closed Date                       144696
Agency                            151177
Agency Name                       151177
Complaint Type                    151177
Descriptor                        150642
Location Type                     112199
Incident Zip                      149352
Incident Address                  123623
Street Name                       123622
Cross Street 1                    108072
Cross Street 2                    107842
Intersection Street 1              24160
Intersection Street 2              24160
Address Type                      149220
City                              149357
Landmark                              35
Facility Type                      41221
Status                            151177
Due Date                           57955
Resolution Description            135826
Resolution Action Updated Date    148855
Community Board                   151177
BBL             

In [4]:
# Check out the NaNs so you can remove them!
the311_pd.isnull().sum()

Unique Key                             0
Created Date                           0
Closed Date                         6481
Agency                                 0
Agency Name                            0
Complaint Type                         0
Descriptor                           535
Location Type                      38978
Incident Zip                        1825
Incident Address                   27554
Street Name                        27555
Cross Street 1                     43105
Cross Street 2                     43335
Intersection Street 1             127017
Intersection Street 2             127017
Address Type                        1957
City                                1820
Landmark                          151142
Facility Type                     109956
Status                                 0
Due Date                           93222
Resolution Description             15351
Resolution Action Updated Date      2322
Community Board                        0
BBL             

In [5]:
# Check out the data types, too!
the311_pd.dtypes

Unique Key                        object
Created Date                      object
Closed Date                       object
Agency                            object
Agency Name                       object
Complaint Type                    object
Descriptor                        object
Location Type                     object
Incident Zip                      object
Incident Address                  object
Street Name                       object
Cross Street 1                    object
Cross Street 2                    object
Intersection Street 1             object
Intersection Street 2             object
Address Type                      object
City                              object
Landmark                          object
Facility Type                     object
Status                            object
Due Date                          object
Resolution Description            object
Resolution Action Updated Date    object
Community Board                   object
BBL             

In [6]:
#------------------------------------------------#
# It's Cleaning Time, Coderella ♣♣♣
#------------------------------------------------#

# Make a DataFrame and only include the columns you need to use.

clean311_pd = the311_pd[['Unique Key', 
                          'Created Date', 
                          'Agency Name',
                          'Complaint Type',
                          'Descriptor',
                          'Location Type',
                          'Landmark',
                          'Borough',
                          'Incident Zip', 
                          'Latitude', 
                          'Longitude',
                          'Location']]

clean311_df = pd.DataFrame(clean311_pd)

# Check it out
clean311_df.isnull().sum()

Unique Key             0
Created Date           0
Agency Name            0
Complaint Type         0
Descriptor           535
Location Type      38978
Landmark          151142
Borough                0
Incident Zip        1825
Latitude            6950
Longitude           6950
Location            6950
dtype: int64

In [15]:
# K, now exit all the NaNs and preview
clean311_df['Location'].dropna()
clean311_df = clean311_df.dropna()
clean311_df.head()

Unnamed: 0,Unique Key,Created Date,Agency Name,Complaint Type,Descriptor,Location Type,Landmark,Borough,Incident Zip,Latitude,Longitude,Location
1965,39982795,8/11/2018 9:35,Department of Parks and Recreation,DPR Internal,Aided/Injury,Beach,MIDLAND BEACH,STATEN ISLAND,10306,40.5680522,-74.08991124,"(40.5680521959103, -74.08991123736692)"
13141,40542228,10/14/2018 18:41,New York City Police Department,Noise - Street/Sidewalk,Loud Music/Party,Street/Sidewalk,FERRY TERMINAL,STATEN ISLAND,10301,40.6441118,-74.0725587,"(40.644111798496276, -74.07255869954668)"
13193,40545192,10/14/2018 16:40,New York City Police Department,Noise - Street/Sidewalk,Loud Music/Party,Street/Sidewalk,FERRY TERMINAL,STATEN ISLAND,10301,40.6441118,-74.0725587,"(40.644111798496276, -74.07255869954668)"
19203,39060647,4/29/2018 10:19,New York City Police Department,Illegal Parking,Posted Parking Sign Violation,Street/Sidewalk,GREAT KILLS,STATEN ISLAND,10306,40.540936,-74.13497407,"(40.54093600401107, -74.13497406598722)"
50610,39302063,5/27/2018 14:09,Department of Parks and Recreation,Animal in a Park,Removing Wildlife,Street/Curbside,BLUE HERON PARK PRESERVE,STATEN ISLAND,10312,40.53183007,-74.17466719,"(40.53183007271259, -74.1746671915421)"


In [16]:
# # Yay! It worked! Now, let's write the info into a new CSV
clean311_df.to_csv("311SI2018Data.csv", encoding="utf-8", index=False)