# 311 Data Dive

In [1]:
#------------------------------------------------#
# Dependencies
#------------------------------------------------#
import os
import csv
import pandas as pd
import numpy as np
import json 
import requests
import random
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from matplotlib.font_manager import FontProperties


In [2]:
#------------------------------------------------#
# CSV Time
#------------------------------------------------#

# Location of the raw 311 service-request export, relative to the notebook
the311 = "Resources/311_Service_Requests_from_2010_to_Present (1).csv"

# Read the 311 CSV into a DataFrame.
#   * dtype=str keeps every column as text, so nothing is type-inferred on load.
#   * on_bad_lines='warn' skips malformed rows and emits a warning for each one.
#     It replaces error_bad_lines=False, which was deprecated in pandas 1.3 and
#     removed in pandas 2.0 (requires pandas >= 1.3).
#   * index_col=False stops pandas from using the first column as the index.
the311_pd = pd.read_csv(
    the311,
    sep=',',
    on_bad_lines='warn',
    index_col=False,
    dtype=str,
)

# Preview only the first 5 rows
the311_pd.head(5)

# Note: several of the right-hand columns in the preview (for example
# 'Vehicle Type' and 'Bridge Highway Name') are empty, i.e. NaN.

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
0,38077835,1/1/2018 0:00,10/19/2017 0:00,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,Other (Explain Below),11206,148 TOMPKINS AVENUE,...,,,,,,,,40.69400562,-73.94606675,"(40.69400562370198, -73.94606674967356)"
1,38076542,1/1/2018 0:00,11/27/2017 0:00,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,3+ Family Apt. Building,10463,3446 FORT INDEPENDENCE STREET,...,,,,,,,,40.88158191,-73.89898944,"(40.88158190996572, -73.89898943540017)"
2,38072873,1/1/2018 0:00,12/14/2017 0:00,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,3+ Family Apt. Building,11225,410 EASTERN PARKWAY,...,,,,,,,,40.67008593,-73.95476653,"(40.670085931603296, -73.95476653454561)"
3,38081444,1/1/2018 0:00,1/17/2018 17:59,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,3+ Family Apt. Building,10456,309 EAST 170 STREET,...,,,,,,,,40.83740983,-73.91033549,"(40.83740983152909, -73.91033549285669)"
4,38081433,1/1/2018 0:00,1/18/2018 19:56,DOHMH,Department of Health and Mental Hygiene,Rodent,Mouse Sighting,3+ Family Apt. Building,10455,610 TRINITY AVENUE,...,,,,,,,,40.81455949,-73.90960401,"(40.814559493267296, -73.90960400717243)"


In [3]:
#------------------------------------------------#
# First look at the loaded data
#------------------------------------------------#

# Number of non-null entries in each column
# (axis=0 is count()'s default; it is spelled out here for clarity)
the311_pd.count(axis=0)

Unique Key                        1048575
Created Date                      1048575
Closed Date                       1025984
Agency                            1048575
Agency Name                       1048575
Complaint Type                    1048575
Descriptor                        1037660
Location Type                      819131
Incident Zip                      1005834
Incident Address                   884184
Street Name                        884138
Cross Street 1                     578430
Cross Street 2                     575506
Intersection Street 1              126485
Intersection Street 2              125949
Address Type                      1021594
City                              1005945
Landmark                              202
Facility Type                      305683
Status                            1048575
Due Date                           431498
Resolution Description             984380
Resolution Action Updated Date    1036130
Community Board                   

In [4]:
# Tally the missing values per column to see which ones need cleaning
the311_pd.isna().sum()

Unique Key                              0
Created Date                            0
Closed Date                         22591
Agency                                  0
Agency Name                             0
Complaint Type                          0
Descriptor                          10915
Location Type                      229444
Incident Zip                        42741
Incident Address                   164391
Street Name                        164437
Cross Street 1                     470145
Cross Street 2                     473069
Intersection Street 1              922090
Intersection Street 2              922626
Address Type                        26981
City                                42630
Landmark                          1048373
Facility Type                      742892
Status                                  0
Due Date                           617077
Resolution Description              64195
Resolution Action Updated Date      12445
Community Board                   

In [5]:
# Check out the data types, too!
# The columns report `object` because the CSV was loaded with a string dtype
# for all columns instead of letting pandas infer numeric/date types.
the311_pd.dtypes

Unique Key                        object
Created Date                      object
Closed Date                       object
Agency                            object
Agency Name                       object
Complaint Type                    object
Descriptor                        object
Location Type                     object
Incident Zip                      object
Incident Address                  object
Street Name                       object
Cross Street 1                    object
Cross Street 2                    object
Intersection Street 1             object
Intersection Street 2             object
Address Type                      object
City                              object
Landmark                          object
Facility Type                     object
Status                            object
Due Date                          object
Resolution Description            object
Resolution Action Updated Date    object
Community Board                   object
BBL             

In [6]:
#------------------------------------------------#
# It's Cleaning Time, Coderella ♣♣♣
#------------------------------------------------#

# Narrow the raw frame down to the columns used in the rest of the analysis:
# identifiers, complaint details, and location fields.
clean311_pd = the311_pd[[
    'Unique Key', 'Created Date', 'Agency Name',
    'Complaint Type', 'Descriptor',
    'Location Type', 'Landmark', 'Borough', 'Incident Zip',
    'Latitude', 'Longitude', 'Location',
]]

# Wrap the selection in a DataFrame under the name the cleaning steps use
clean311_df = pd.DataFrame(clean311_pd)

# Missing values remaining in each kept column
clean311_df.isna().sum()

Unique Key              0
Created Date            0
Agency Name             0
Complaint Type          0
Descriptor          10915
Location Type      229444
Landmark          1048373
Borough                 0
Incident Zip        42741
Latitude            71224
Longitude           71224
Location            71224
dtype: int64

In [7]:
# Drop every row that has a NaN in ANY of the kept columns, then preview.
#
# NOTE(review): 'Landmark' is null in 1,048,373 of the 1,048,575 rows, so this
# keeps at most 202 rows. If that is not the intent, limit the check to the
# columns that truly must be present, e.g. clean311_df.dropna(subset=[...]).
#
# The standalone `clean311_df['Descriptor'].dropna()` statement that used to be
# here was removed: its result was discarded, so it had no effect.
clean311_df = clean311_df.dropna()
clean311_df.head()

Unnamed: 0,Unique Key,Created Date,Agency Name,Complaint Type,Descriptor,Location Type,Landmark,Borough,Incident Zip,Latitude,Longitude,Location
11648,38092317,1/2/2018 9:20,Department of Parks and Recreation,Maintenance or Facility,Structure - Indoors,Park,CENTRAL PARK,MANHATTAN,10000,40.78229521,-73.96519444,"(40.782295208794295, -73.96519444362545)"
25826,38096256,1/3/2018 13:47,Taxi and Limousine Commission,Taxi Complaint,Driver Complaint,Street,LA GUARDIA AIRPORT,QUEENS,11369,40.77442087,-73.87729411,"(40.77442086598845, -73.87729410513894)"
32297,38108295,1/4/2018 10:17,Taxi and Limousine Commission,Taxi Complaint,Driver Complaint,Street,LA GUARDIA AIRPORT,QUEENS,11369,40.77442087,-73.87729411,"(40.77442086598845, -73.87729410513894)"
63881,38139574,1/7/2018 11:17,Department of Parks and Recreation,Maintenance or Facility,Snow or Ice,Street/Curbside,ISHAM PARK,MANHATTAN,10034,40.87023791,-73.91972817,"(40.87023791438702, -73.91972816874608)"
72989,38151954,1/8/2018 10:16,Department of Transportation,Bus Stop Shelter Complaint,Snow Removal Requested,Bus Stop Shelter,KINGSBOROUGH COMMUNITY COLLEGE,BROOKLYN,11235,40.57790383,-73.93535012,"(40.577903827726615, -73.93535012227942)"


In [8]:
# Yay! It worked! To write the cleaned data to a new CSV, uncomment the line below.
# clean311_df.to_csv("311Data.csv", encoding="utf-8", index=False)