## Haris Sumra
## Housing Violation Dataset
## 10/27/2019

### In this Project:
* Using pandas to ingest and select some data from my final project dataset
* Clean up the data: drop any unnecessary columns and rename columns.
* Data was collected from (https://data.cityofnewyork.us/resource/wvxf-dwi5.csv)

### Importing all the libraries that I will be using in this module

In [20]:
#Dependencies
import pandas as pd
import numpy as np

### Extracting the data from the website listed below, only using up to 5000 rows for now.

In [21]:
# Fetch the dataset as CSV; the "$limit" query parameter caps the download at 5000 rows.
source_url = "https://data.cityofnewyork.us/resource/wvxf-dwi5.csv?$limit=5000"
df = pd.read_csv(source_url)
df.shape

(5000, 40)

In [22]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,violationid,buildingid,registrationid,boroid,boro,housenumber,lowhousenumber,highhousenumber,streetname,streetcode,...,novtype,violationstatus,latitude,longitude,communityboard,councildistrict,censustract,bin,bbl,nta
0,10000009,265980,301467,3,BROOKLYN,355,355,355,EAST 48 STREET,36930,...,Original,Close,40.653217,-73.93248,17,41,870,3102071.0,3046740000.0,East Flatbush-Farragut
1,10000011,73852,226626,2,BRONX,1123,1123,1123,EAST TREMONT AVENUE,29620,...,Original,Close,40.839954,-73.876599,6,15,220,2042428.0,2040040000.0,East Tremont
2,10000012,80102,211704,2,BRONX,751,751,751,GERARD AVENUE,35020,...,Original,Close,40.824692,-73.926605,4,8,63,2002971.0,2024820000.0,West Concourse
3,10000014,268639,350942,3,BROOKLYN,54,54,54,EAST 52 STREET,37080,...,Original,Close,40.659731,-73.929295,17,41,878,3099672.0,3046050000.0,Prospect Lefferts Gardens-Wingate
4,10000017,268167,300713,3,BROOKLYN,146,146,146,EAST 52 STREET,37080,...,Original,Close,40.657615,-73.929067,17,41,878,3100214.0,3046210000.0,Prospect Lefferts Gardens-Wingate


### Checking which columns are present, and which columns need to be "renamed" or "dropped"

In [23]:
# List every column label in the raw frame so we can decide what to rename or drop.
df.columns

Index(['violationid', 'buildingid', 'registrationid', 'boroid', 'boro',
       'housenumber', 'lowhousenumber', 'highhousenumber', 'streetname',
       'streetcode', 'zip', 'apartment', 'story', 'block', 'lot', 'class',
       'inspectiondate', 'approveddate', 'originalcertifybydate',
       'originalcorrectbydate', 'newcertifybydate', 'newcorrectbydate',
       'certifieddate', 'ordernumber', 'novid', 'novdescription',
       'novissueddate', 'currentstatusid', 'currentstatus',
       'currentstatusdate', 'novtype', 'violationstatus', 'latitude',
       'longitude', 'communityboard', 'councildistrict', 'censustract', 'bin',
       'bbl', 'nta'],
      dtype='object')

### Dropping a few columns that are not needed, as part of cleaning up the data; in this scenario we are dropping entire columns (not rows).

In [24]:
# Remove columns that are not needed for the analysis.
# The columns are listed by name rather than by position so that the right
# ones are removed even if the source adds or reorders fields; a name that
# is missing from the data raises KeyError instead of silently dropping
# the wrong column.
unneeded_columns = [
    "buildingid",
    "registrationid",
    "lowhousenumber",
    "highhousenumber",
    "streetcode",
    "story",
]
df_2 = df.drop(columns=unneeded_columns)
df_2.head()

Unnamed: 0,violationid,boroid,boro,housenumber,streetname,zip,apartment,block,lot,class,...,novtype,violationstatus,latitude,longitude,communityboard,councildistrict,censustract,bin,bbl,nta
0,10000009,3,BROOKLYN,355,EAST 48 STREET,11203,,4674,64,B,...,Original,Close,40.653217,-73.93248,17,41,870,3102071.0,3046740000.0,East Flatbush-Farragut
1,10000011,2,BRONX,1123,EAST TREMONT AVENUE,10460,,4004,9,B,...,Original,Close,40.839954,-73.876599,6,15,220,2042428.0,2040040000.0,East Tremont
2,10000012,2,BRONX,751,GERARD AVENUE,10451,3L,2482,30,B,...,Original,Close,40.824692,-73.926605,4,8,63,2002971.0,2024820000.0,West Concourse
3,10000014,3,BROOKLYN,54,EAST 52 STREET,11203,,4605,19,B,...,Original,Close,40.659731,-73.929295,17,41,878,3099672.0,3046050000.0,Prospect Lefferts Gardens-Wingate
4,10000017,3,BROOKLYN,146,EAST 52 STREET,11203,,4621,21,B,...,Original,Close,40.657615,-73.929067,17,41,878,3100214.0,3046210000.0,Prospect Lefferts Gardens-Wingate


### Let's use the "dropna" command to drop every row that has any "na" value

In [25]:
# dropna() with no arguments removes every row that contains at least one
# missing value. It returns a new frame; df_2 itself is left unchanged.
# NOTE(review): despite the variable name, nothing is dropped "by name" here.
drop_by_name = df_2.dropna()
drop_by_name.head()

Unnamed: 0,violationid,boroid,boro,housenumber,streetname,zip,apartment,block,lot,class,...,novtype,violationstatus,latitude,longitude,communityboard,councildistrict,censustract,bin,bbl,nta
76,10000394,2,BRONX,2320,BRONX PARK EAST,10467,2F,4340,25,B,...,Original,Close,40.860036,-73.870734,11,15,33201,2049747.0,2043400000.0,Bronxdale
1069,10003633,3,BROOKLYN,170,NEW YORK AVENUE,11216,2,1227,44,C,...,Original,Close,40.674488,-73.947294,8,36,31701,3031005.0,3012270000.0,Crown Heights North
1070,10003634,3,BROOKLYN,170,NEW YORK AVENUE,11216,2,1227,44,C,...,Original,Close,40.674488,-73.947294,8,36,31701,3031005.0,3012270000.0,Crown Heights North
1071,10003635,3,BROOKLYN,170,NEW YORK AVENUE,11216,2,1227,44,C,...,Original,Close,40.674488,-73.947294,8,36,31701,3031005.0,3012270000.0,Crown Heights North
1072,10003636,3,BROOKLYN,170,NEW YORK AVENUE,11216,2,1227,44,C,...,Original,Close,40.674488,-73.947294,8,36,31701,3031005.0,3012270000.0,Crown Heights North


### Renaming columns to clean up the column names for better appearance

In [26]:
# Give the leading identifier/address columns cleaner, human-readable labels.
# Labels carry no leading or trailing whitespace, so each column can be
# selected by exactly the text shown in the table header.
column_labels = {
    "violationid": "ViolationID",
    "boroid": "BoroughID",
    "boro": "Borough",
    "housenumber": "House Number",
    "streetname": "Street Name",
    "zip": "Zipcode",
}
df_3 = df_2.rename(columns=column_labels)
df_3.head()

Unnamed: 0,ViolationID,BoroughID,Borough,House Number,Street Name,Zipcode,apartment,block,lot,class,...,novtype,violationstatus,latitude,longitude,communityboard,councildistrict,censustract,bin,bbl,nta
0,10000009,3,BROOKLYN,355,EAST 48 STREET,11203,,4674,64,B,...,Original,Close,40.653217,-73.93248,17,41,870,3102071.0,3046740000.0,East Flatbush-Farragut
1,10000011,2,BRONX,1123,EAST TREMONT AVENUE,10460,,4004,9,B,...,Original,Close,40.839954,-73.876599,6,15,220,2042428.0,2040040000.0,East Tremont
2,10000012,2,BRONX,751,GERARD AVENUE,10451,3L,2482,30,B,...,Original,Close,40.824692,-73.926605,4,8,63,2002971.0,2024820000.0,West Concourse
3,10000014,3,BROOKLYN,54,EAST 52 STREET,11203,,4605,19,B,...,Original,Close,40.659731,-73.929295,17,41,878,3099672.0,3046050000.0,Prospect Lefferts Gardens-Wingate
4,10000017,3,BROOKLYN,146,EAST 52 STREET,11203,,4621,21,B,...,Original,Close,40.657615,-73.929067,17,41,878,3100214.0,3046210000.0,Prospect Lefferts Gardens-Wingate


In [27]:
# Select the rows of the raw frame whose 'bin' value is missing.
is_null = df.loc[df['bin'].isna()]