# Análisis de datos de alquileres en Nueva York


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read in the data
df = pd.read_csv('./ConjuntosDeDatos/AirBnB2clean.csv')

Ahora tengo los datos en df,hago un primer approach de visualización

In [22]:
# shape of df
print(df.shape)

# head of df
print(df.head())


(48895, 16)
      id                                               name  host_id  \
0  20739  Perfect 2-bedroom in the best part of Williams...    24864   
1  65164                                    Ditmas park lot    62595   
2  23090            Doorman Gym 2 Beds Luxury Building!5211    52328   
3  36673       seagate pravite house 5 mins away from beach     3184   
4  52600  Sunny & Clean Apt  ideal location E.Williamsburg!    43677   

        host_name neighbourhood_group neighbourhood  latitude  longitude  \
0        Angelina            BROOKLYN             -  40.71983  -73.96465   
1           Brian            Brooklyn      Flatbush  40.63809  -73.96445   
2  Jeremy & Laura           Manhattan   Murray Hill  40.74404  -73.97217   
3           Tural            BROOKLYN      Sea Gate  40.57531  -74.00518   
4          Shahar            BROOKLYN  Williamsburg  40.70684  -73.93854   

         room_type        price  minimum_nights  number_of_reviews  \
0  Entire home/apt        $ 

Ya con ver el head hay varios issues:
- Brooklyn aparece en minúscula y en mayúscula. 
- Los precios aparecen como $, Dollars o USD
- Hay datos sin el Neighborhood marcado
- Si no tiene reviews, reviews_per_month es NaN en vez de 0

In [23]:
#list all unique values in the column 'neighbourhood_group'
print(df.neighbourhood_group.unique())

['BROOKLYN' 'Brooklyn' 'Manhattan' 'MANHATTAN' 'BROKLYN/NY' 'QUEENS'
 'Manhattan/NY' 'Broo.' 'Manh.' 'Queens' 'BRONX' 'Bronx' 'Queens/NYC'
 'MANHATAN' 'STATEN ISLAND' 'Staten Island' 'ST. ISL.' 'Staten Isl.']


In [24]:
# 'BROOKLYN' 'Brooklyn' 'BROKLYN/NY' 'Broo.' should be fused
# 'Manhattan' 'MANHATTAN' 'Manhattan/NY' 'Manh.' 'MANHATAN' should be fused
# 'QUEENS' 'Queens' 'Queens/NYC' should be fused
# 'BRONX' 'Bronx' should be fused
# 'STATEN ISLAND' 'Staten Island' 'ST. ISL.' 'Staten Isl.' should be fused

# Replace the values in the column 'neighbourhood_group'
df.neighbourhood_group.replace(['BROOKLYN', 'Brooklyn', 'BROKLYN/NY', 'Broo.'], 'Brooklyn', inplace=True)
df.neighbourhood_group.replace(['Manhattan', 'MANHATTAN', 'Manhattan/NY', 'Manh.', 'MANHATAN'], 'Manhattan', inplace=True)
df.neighbourhood_group.replace(['QUEENS', 'Queens', 'Queens/NYC'], 'Queens', inplace=True)
df.neighbourhood_group.replace(['BRONX', 'Bronx'], 'Bronx', inplace=True)
df.neighbourhood_group.replace(['STATEN ISLAND', 'Staten Island', 'ST. ISL.', 'Staten Isl.'], 'Staten Island', inplace=True)

#list all unique values in the column 'neighbourhood_group'
print(df.neighbourhood_group.unique()) #sanity check

['Brooklyn' 'Manhattan' 'Queens' 'Bronx' 'Staten Island']


In [32]:
#list all unique values in the column 'neighbourhood' ordered alphabetically
print(df.neighbourhood.unique())

['-' 'Flatbush' 'Murray Hill' 'Sea Gate' 'Williamsburg'
 'Bedford-Stuyvesant' 'Chelsea' 'Park Slope' 'Fresh Meadows'
 "Hell's Kitchen" 'Upper West Side' 'West Village' 'SoHo'
 'Prospect-Lefferts Gardens' 'Chinatown' 'Bushwick' 'Ridgewood' 'Kips Bay'
 'Crown Heights' 'Gramercy' 'Upper East Side' 'South Slope'
 'Financial District' 'Prospect Heights' 'Elmhurst' 'Lower East Side'
 'Midtown' 'East New York' 'Greenpoint' 'Harlem' 'Gravesend'
 'Sheepshead Bay' 'East Village' 'Long Island City' 'NoHo' 'Fort Greene'
 'Midwood' 'Morris Park' 'Boerum Hill' 'South Ozone Park' 'Flushing'
 'East Harlem' 'Inwood' 'Sunset Park' 'Pelham Gardens' 'Astoria'
 'Sunnyside' 'Morris Heights' 'Brownsville' 'East Elmhurst' 'Civic Center'
 'Washington Heights' 'Bay Ridge' 'Clinton Hill' 'Wakefield'
 'East Flatbush' 'Brooklyn Heights' 'Morningside Heights'
 'Theater District' 'Tribeca' 'Middle Village' 'Canarsie' 'Kingsbridge'
 'Bellerose' 'Kensington' 'Cypress Hills' 'Windsor Terrace' 'Flatlands'
 'Jamaica' 'Li

There seem to be no coincidences in the 221 neighborhoods.

In [25]:
#print some unique values in the price column
print(df.price.unique()[40:100])

['230' '$ 41' 'USD 85' '200 Do.' '$ 38' '59 Dollars' 'USD 119' '74'
 'USD 198' 'USD 129' '125' 'USD 35' '50 Dollars' '$274' 'USD 125' '175'
 '220 Dollars' '140' '241 Dollars' '$ 35' 'USD 123' '350 Dollars'
 '195 Do.' '199 Do.' 'USD 200' '$225' 'USD 325' '225 Dollars' '70 Do.'
 'USD 890' '$475' '250' '96 Dollars' '46 Dollars' '$ 90' '375 Dollars'
 '$165' '55 Dollars' '190 Dollars' '$155' '$40' '105 Dollars' '$ 255'
 '$280' 'USD 130' '53' '20' 'USD 300' '75 Dollars' '299' '$ 737' '$72'
 'USD 134' '70 Dollars' 'USD 195' '65 Do.' '269' '$99' '200' '$ 95']


In [26]:
# now fix the issue with dollar signs, retaining only the numeric values
df.price = df.price.str.replace('[^\d.]', '', regex=True) #remove all non-numeric characters
df.price = df.price.astype(float)

# sanity check
print(df.price.unique()[40:100])


[  74.  198.  129.   35.  274.  175.  220.  140.  241.  123.  350.  195.
  325.  890.  475.   96.   46.  375.  165.   55.  190.  155.  105.  255.
  280.  130.   53.   20.  737.   72.  134.   65.  269.   95.  379.  287.
   51.  100.   36.   12.   49.  170.  160.  172.  399.  189.   52.  112.
  124.  117.  187.  289.  136.  159. 6419.  550.  151.  347.  227.  239.]


In [28]:
# change all NaN in reviews_per_month to 0
df.reviews_per_month.fillna(0, inplace=True)

# sanity check
print(df.reviews_per_month.unique()[:10])

[0.   0.07 0.15 3.88 4.69 0.22 0.21 0.95 1.22 0.16]


With this the database is correctly formatted and we can start processing