IMPORT PACKAGES

In [10]:
import requests
import pandas as pd

PULL DATA FROM SOCRATA API

In [2]:
offset = 0
LIMIT = 1000000 #max rows which can be pulled from the api with a single call
arr_of_df = [] #array which will hold multiple batches of data
URL_CONSTANT = "https://data.cityofnewyork.us/resource/fhrw-4uyv.json?$WHERE=created_date between '2017-01-01T00:00:00.000' and '2017-12-31T23:59:59.999' &$limit=" + str({}) + " &$offset=" + str({})

print("loading in data...")

#pull data from api in batches of 1000000 (limit) until theres no more rows to pull
while True:
    apiData = requests.get(URL_CONSTANT.format(LIMIT, offset)).json()
    dfTemp = pd.DataFrame(data=apiData) #convert each batch of data into a dataframe so we can easily use the pandas concat method later
    arr_of_df.append(dfTemp)
    offset += LIMIT
    if (len(apiData)-LIMIT) >= 0: #check if the max limit was pulled from the api
        continue
    else: #near end of dataset, no need to send another request
        break
        
#concatenate all of the batches of data into one DataFrame
df_cityData = pd.concat(arr_of_df)

#confirm that all 2017 records are in the dataframe
#df_cityData['created_date'].min() = '2017-01-01T00:00:00.000'
#df_cityData['created_date'].max() = '2017-12-31T23:59:35.000'
print("COMPLETE!")
print("#rows in dataframe: " + str(len(df_cityData)))

loading in data...
COMPLETE!
#rows in dataframe: 2461158


##Consider only the 10 most common overall complaint types. For each borough, how many of each of those 10 types were there in 2017?

FILTER OUT THE TOP 10 COMPLAINT TYPES

In [12]:
df_topTenComplaintTypes = df_cityData['complaint_type'].value_counts().head(10)
df_topTenComplaintTypes

Noise - Residential        230152
HEAT/HOT WATER             213521
Illegal Parking            146122
Blocked Driveway           136097
Street Condition            93265
Street Light Condition      84195
UNSANITARY CONDITION        79282
Noise - Street/Sidewalk     73085
Water System                65101
Noise                       60171
Name: complaint_type, dtype: int64

FILTER THE CITY DATA FURTHER USING THE ABOVE RESULTS

In [14]:
#filter the records for which the complaint type matches one found in 'df_topTenComplaintTypes'
df_TopTenData = df_cityData.loc[df_cityData['complaint_type'].isin(df_topTenComplaintTypes.index)]
#filter the records to include only the 'borough' & 'complaint_type' (multiple for each borough), and the number of occurences for each complaint within that borough
df_TopTenData[['borough','complaint_type']].groupby(['borough','complaint_type'])['complaint_type'].count()

borough        complaint_type         
BRONX          Blocked Driveway           24574
               HEAT/HOT WATER             68718
               Illegal Parking            16122
               Noise                       3134
               Noise - Residential        57663
               Noise - Street/Sidewalk    14025
               Street Condition           11761
               Street Light Condition     18410
               UNSANITARY CONDITION       24561
               Water System               10221
BROOKLYN       Blocked Driveway           49302
               HEAT/HOT WATER             66984
               Illegal Parking            55380
               Noise                      15421
               Noise - Residential        67629
               Noise - Street/Sidewalk    21313
               Street Condition           25432
               Street Light Condition     22458
               UNSANITARY CONDITION       26659
               Water System               19809
M

##Consider only the 10 most common overall complaint types.  For the 10 most populous zip codes, how many of each of those 10 types were there in 2017?

READ IN THE NEW DATASET & MERGE THE POPULATION DATA WITH THE CITY DATA

In [24]:
#read in the dataset containing the population data
df_populationData = pd.read_csv('PopulationData.csv')

In [25]:
#merge the two dataframes
df_populationData.columns = ['incident_zip','population'] #rename columns to match other df
df_populationData['incident_zip'] = df_populationData['incident_zip'].apply(str)#change column type to string to handle 'NaN' in df_zipAndComplaintType
df_zipAndComplaintType = df_cityData[['incident_zip','complaint_type']] #grab necessary data from df_cityData
df_mergedData = pd.merge(df_populationData, df_zipAndComplaintType, on='incident_zip')

APPLY FILTERING AND GRAB THE NECESSARY DATA

In [27]:
df_overlappingZips = df_mergedData.loc[df_mergedData['incident_zip'].isin(df_cityData['incident_zip'])] #filter for overlapping zip codes
df_overlappingZips = df_overlappingZips.loc[df_overlappingZips['complaint_type'].isin(df_topTenComplaintTypes.index)] #filter for top ten complaints
df_overlappingZips = df_overlappingZips.groupby(['incident_zip','population','complaint_type'])['complaint_type'].count()
df_overlappingZips = df_overlappingZips.to_frame('').sort_values(by='population', ascending=False).head(100) #sort and grab first 100 (10zips x 10complaints = 100rows)
print(df_overlappingZips.to_string()) #print ALL data rows 

                                                     
incident_zip population complaint_type               
11368        109931     UNSANITARY CONDITION      639
                        Street Light Condition    444
                        Blocked Driveway         4384
                        HEAT/HOT WATER           1620
                        Illegal Parking          1251
                        Noise                     158
                        Noise - Residential      2460
                        Street Condition          561
                        Noise - Street/Sidewalk   684
                        Water System              617
11226        101572     Noise - Residential      4854
                        Blocked Driveway         2203
                        Water System              406
                        Illegal Parking          1076
                        Noise                     440
                        HEAT/HOT WATER           7569
                        Nois

##Considering all complaint types. Which boroughs are the biggest "complainers" relative to the size of the population in 2017? Meaning, calculate a complaint-index that adjusts for population of the borough.

GRAB THE TOTAL # OF COMPLAINTS FOR EACH BOROUGH

In [28]:
df_complaintCountPerBorough = df_cityData[['borough','complaint_type']].groupby(['borough'])['complaint_type'].count().reset_index()
df_complaintCountPerBorough.rename(columns = {'complaint_type': 'complaint_count'}, inplace=True)
df_complaintCountPerBorough

Unnamed: 0,borough,complaint_count
0,BRONX,450933
1,BROOKLYN,771322
2,MANHATTAN,480314
3,QUEENS,589971
4,STATEN ISLAND,127136
5,Unspecified,41482


GRAB THE TOTAL POPULATION OF EACH BOROUGH

In [29]:
#Grab necessary data from df_cityData
df_zipBorough = df_cityData[['borough','incident_zip']]
df_zipBorough = df_zipBorough.replace(['0', 'N/A', 'UNKNOWN', 'NA','.',''], pd.np.nan).dropna().drop_duplicates() #filter out 'bad' rows

#Grab necessary data from df_populationData
df_zipPop = df_populationData[['incident_zip','population']]

#Merge and manipulate data to get the total population value for each borough
df_populationOfBoroughs = pd.merge(df_zipBorough, df_zipPop, on='incident_zip')
df_populationOfBoroughs = df_populationOfBoroughs.sort_values(by='borough')
df_populationOfBoroughs = df_populationOfBoroughs.groupby(['borough'])['population'].sum().reset_index()
df_populationOfBoroughs

Unnamed: 0,borough,population
0,BRONX,1592084
1,BROOKLYN,2732303
2,MANHATTAN,2073659
3,QUEENS,2755424
4,STATEN ISLAND,468730
5,Unspecified,13762538


MERGE THE TWO TABLES SHOWN ABOVE

In [30]:
df_biggestComplainers = pd.merge(df_complaintCountPerBorough, df_populationOfBoroughs, on='borough')
#calculate complaint index (complaint_count/population) for each borough
df_biggestComplainers['complaint_index'] = df_biggestComplainers['complaint_count']/df_biggestComplainers['population']
df_biggestComplainers.sort_values(by='complaint_index', ascending=False) #sort by biggest "complainers"

Unnamed: 0,borough,complaint_count,population,complaint_index
0,BRONX,450933,1592084,0.283234
1,BROOKLYN,771322,2732303,0.282297
4,STATEN ISLAND,127136,468730,0.271235
2,MANHATTAN,480314,2073659,0.231626
3,QUEENS,589971,2755424,0.214113
5,Unspecified,41482,13762538,0.003014


RESULT: Based on the above data, we can easily see that people from the Bronx are the biggest "complainers", with an average of around 1 complaint per 3.5 people. 