In [1]:
import pandas as pd
import numpy as np

In [35]:
# Show every column when displaying wide dataframes.
# The fully qualified option name is used on purpose: the short pattern
# 'max_columns' can match more than one registered option in recent pandas
# versions, in which case set_option raises an OptionError.
pd.set_option('display.max_columns', None)

In [2]:
# Read the Airbnb listings file of each city into its own dataframe
boston_csv = "boston_airbnb_data/listings.csv"
seattle_csv = "seattle_airbnb_data/listings.csv"

boston_listings = pd.read_csv(boston_csv)
seattle_listings = pd.read_csv(seattle_csv)

In [3]:
# Report how many columns and rows each city's dataframe has.
# NOTE(review): the message says "calendar data" although the frames printed
# here are boston_listings / seattle_listings; the text is left unchanged so it
# keeps matching the recorded output.
boston_row_count, boston_col_count = boston_listings.shape
seattle_row_count, seattle_col_count = seattle_listings.shape
shape_message = "Boston calendar data have {} columns and {} rows and Seattle calendar data have {} columns and {} rows"
print(shape_message.format(boston_col_count, boston_row_count, seattle_col_count, seattle_row_count))

Boston calendar data have 95 columns and 3585 rows and Seattle calendar data have 92 columns and 3818 rows


#### Compare both dataframes to detect differences and inform decisions

In [17]:
# Map every column name to its dtype for the Boston and Seattle listings
boston_col_type = boston_listings.dtypes.to_dict()
seattle_col_type = seattle_listings.dtypes.to_dict()

# Compare the Boston columns against the Seattle ones:
#   - nonshared_columns: Boston columns with no Seattle column of the same name
#   - nonshared_types:   columns present in both frames whose dtypes differ
# Columns that exist only in the Seattle dataframe are not detected by this pass.

nonshared_columns = []
nonshared_types = {}

for boston_col, boston_type in boston_col_type.items():
    # A direct dictionary lookup replaces rescanning every Seattle column
    if boston_col not in seattle_col_type:
        nonshared_columns.append(boston_col)
        continue

    seattle_type = seattle_col_type[boston_col]
    if boston_type != seattle_type:
        nonshared_types[boston_col] = {"in_df_boston":boston_type,"in_df_seattle":seattle_type}
    
            

##### Explore non-shared columns

In [18]:
# Display the names of the columns that exist in the Boston dataframe
# but have no column of the same name in the Seattle dataframe
nonshared_columns

['access', 'interaction', 'house_rules']

In [29]:
# Summary statistics for the three columns that only the Boston dataframe has
boston_only_columns = ['access', 'interaction', 'house_rules']
boston_listings[boston_only_columns].describe()

Unnamed: 0,access,interaction,house_rules
count,2096,2031,2393
unique,1762,1617,1928
top,"You have access to the entire apartment, and a...",Need more towels? A restaurant recommendation?...,House Rules 1. Check-in is 4 pm local time. If...
freq,48,58,44


##### Explore columns that don't share a dtype

In [30]:
# Display the columns that exist in both dataframes but have a different dtype
# in each; every entry maps the column name to its Boston and Seattle dtypes
nonshared_types

{'host_listings_count': {'in_df_boston': dtype('int64'),
  'in_df_seattle': dtype('float64')},
 'host_total_listings_count': {'in_df_boston': dtype('int64'),
  'in_df_seattle': dtype('float64')},
 'neighbourhood_group_cleansed': {'in_df_boston': dtype('float64'),
  'in_df_seattle': dtype('O')},
 'has_availability': {'in_df_boston': dtype('float64'),
  'in_df_seattle': dtype('O')},
 'jurisdiction_names': {'in_df_boston': dtype('float64'),
  'in_df_seattle': dtype('O')}}

In [27]:
# Summary statistics, on the Boston side, of three of the columns whose dtype
# differs between the two dataframes
boston_mismatch_columns = ["neighbourhood_group_cleansed", "has_availability", "jurisdiction_names"]
boston_listings[boston_mismatch_columns].describe()

Unnamed: 0,neighbourhood_group_cleansed,has_availability,jurisdiction_names
count,0.0,0.0,0.0
mean,,,
std,,,
min,,,
25%,,,
50%,,,
75%,,,
max,,,


In [28]:
# Summary statistics, on the Seattle side, of three of the columns whose dtype
# differs between the two dataframes
seattle_mismatch_columns = ["neighbourhood_group_cleansed", "has_availability", "jurisdiction_names"]
seattle_listings[seattle_mismatch_columns].describe()

Unnamed: 0,neighbourhood_group_cleansed,has_availability,jurisdiction_names
count,3818,3818,3818
unique,17,1,1
top,Other neighborhoods,t,WASHINGTON
freq,794,3818,3818


##### Working with data from Boston and Seattle

By erasing the columns ['access', 'interaction', 'house_rules'] from the Boston listings dataframe, both dataframes end up with the same columns.

In [44]:
# Build a copy of the Boston listings without the three columns that the
# Seattle dataframe does not have (the original dataframe is left untouched)
udpate_boston_listings = boston_listings.drop(columns=['access', 'interaction', 'house_rules'])

In [59]:
# Collect the column names of each dataframe as a plain Python list
boston_cols = udpate_boston_listings.columns.tolist()
seattle_cols = seattle_listings.columns.tolist()

In [74]:
# Confirm that both dataframes now have the same number of columns
boston_col_total = len(boston_cols)
seattle_col_total = len(seattle_cols)
count_message = "Number of columns in boston df: {} and number of columns in seattle df: {}"
print(count_message.format(boston_col_total, seattle_col_total))

Number of columns in boston df: 92 and number of columns in seattle df: 92


In [73]:
# Check that both dataframes list the same columns in the same order.
# The two lists are compared directly so that lists of different lengths are
# handled too: an index-based loop over boston_cols would raise IndexError if
# seattle_cols were shorter, and would ignore extra trailing Seattle columns.
all_same_position = "Yes" if boston_cols == seattle_cols else "No"

print("¿Columns have the same position in both df? {}".format(all_same_position))

¿Columns have the same position in both df? Yes


After the previous analysis we can conclude that it is useful to concat both dataframes for the following reasons:

1. The dataframes differ in only 3 columns, none of which contains useful information for our case
2. Dropping the non-shared columns leaves us with two dataframes that have the same columns in the same position
3. Five columns do not share a dtype: two because of a difference in interpretation between int and float, which is easily solved, and three because the Boston dataframe has only null values in those columns
4. Concatenating the data from Boston and Seattle gives us a larger sample in which the city becomes another variable to analyze.