In [1]:
import pandas as pd
import numpy as np

## Amenities table for US data

#### Goals for this step

- Using the same process as in the amenities table notebook, we will create a table for listings in the US with one column per amenity, where 1 indicates that the amenity is present and 0 that it is not.

In [2]:
# Location of the pickled US listings (machine-specific absolute path).
listings_path = "C:/Users/Admin/Documents/ironhack/AirBnB_data/airbnb_usa_cities.pkl"
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle(listings_path)

In [3]:
# Preview the raw "Amenities" column: one semicolon-separated string per listing.
df["Amenities"]

0         Internet;Wireless Internet;Air conditioning;Wh...
1         Internet;Wireless Internet;Air conditioning;He...
2                    Wireless Internet;Heating;Washer;Dryer
3         TV;Internet;Wireless Internet;Air conditioning...
4         Wireless Internet;Kitchen;Free Parking on Prem...
                                ...                        
105275    TV;Cable TV;Internet;Wireless Internet;Air con...
105276    TV;Cable TV;Internet;Wireless Internet;Air con...
105277    TV;Cable TV;Internet;Wireless Internet;Air con...
105278    TV;Cable TV;Internet;Wireless Internet;Air con...
105279    TV;Cable TV;Internet;Wireless Internet;Air con...
Name: Amenities, Length: 105280, dtype: object

In [4]:
# Break each listing's semicolon-separated amenities string into a list of labels.
amenities = df["Amenities"].str.split(pat=";")

In [5]:
# Gather every distinct raw amenity label across all listings (case variants
# still count as separate labels here) and show them in sorted order.
amenities_total = sorted({label for listing in amenities for label in listing})
amenities_total

[' smooth pathway to front door',
 '24-Hour Check-in',
 '24-hour check-in',
 'Accessible-height bed',
 'Accessible-height toilet',
 'Air Conditioning',
 'Air conditioning',
 'Air purifier',
 'BBQ grill',
 'Baby bath',
 'Baby monitor',
 'Babysitter recommendations',
 'Bath towel',
 'Bathtub',
 'Bathtub with shower chair',
 'Beach essentials',
 'Beachfront',
 'Bed linens',
 'Body soap',
 'Breakfast',
 'Buzzer/Wireless Intercom',
 'Buzzer/wireless intercom',
 'Cable TV',
 'Carbon Monoxide Detector',
 'Carbon monoxide detector',
 'Cat(s)',
 'Changing table',
 'Children�s books and toys',
 'Children�s dinnerware',
 'Cleaning before checkout',
 'Coffee maker',
 'Cooking basics',
 'Crib',
 'Disabled parking spot',
 'Dishes and silverware',
 'Dishwasher',
 'Dog(s)',
 'Doorman',
 'Doorman Entry',
 'Dryer',
 'EV charger',
 'Elevator',
 'Elevator in Building',
 'Elevator in building',
 'Essentials',
 'Ethernet connection',
 'Extra pillows and blankets',
 'Family/Kid Friendly',
 'Family/kid friend

We will normalize the amenities by converting every label to lower case, so that case variants such as 'Air Conditioning' and 'Air conditioning' are treated as the same amenity.

In [6]:
# Lower-case every label, keeping one list per listing in the original listing order.
amenities_lower = [[label.lower() for label in listing] for listing in amenities]

In [7]:
# Flatten the per-listing lists into one long list of labels so that
# occurrences can be counted across all listings.
amenities_count = [label for listing in amenities_lower for label in listing]

We will create a data frame in order to count the values and get the top 100.

In [8]:
# Wrap the flat label list in a one-column frame, then tally how often each label occurs.
# Note: despite the `_df` suffix, the result is a Series of counts indexed by label.
label_column = pd.DataFrame(amenities_count)[0]
amenities_df = label_column.value_counts()

In [9]:
# Show the per-amenity counts, most frequent first.
amenities_df

wireless internet                            102904
heating                                       98500
kitchen                                       96692
essentials                                    93793
smoke detector                                91393
                                              ...  
hand or paper towel                               1
roll-in shower with shower bench or chair         1
bath towel                                        1
hand soap                                         1
toilet paper                                      1
Name: 0, Length: 132, dtype: int64

Placeholder and catch-all amenities (untranslated labels, "other" and "other pet(s)") will be removed.

In [10]:
# Labels that carry no usable meaning: untranslated placeholders and catch-all buckets.
labels_to_drop = [
    "translation missing: en.hosting_amenity_49",
    "translation missing: en.hosting_amenity_50",
    "other",
    "other pet(s)",
]
amenities_df = amenities_df.drop(index=labels_to_drop)

In [11]:
# Keep only the most frequent amenities; the counts are already ordered
# from most to least common, so the first entries are the top ones.
top_n_amenities = 100
amenities_df = amenities_df.head(top_n_amenities)

In [12]:
# Take the amenity labels (the index of the counts Series) as a Series named "amenity"
# with a fresh 0..n-1 index.
# This is built directly from the index instead of reset_index() + rename(): the name of
# the column produced by reset_index() depends on the pandas version. Since pandas 2.0,
# value_counts() names its index after the source Series, so there is no "index" column
# to rename and the subsequent lookup of "amenity" raises KeyError.
amenities_top = pd.Series(amenities_df.index, name="amenity")

The amenities table for the US will now be created.

In [13]:
# Allocate an empty (all-NaN) table: one row per listing in df, one column per top amenity.
listing_count = len(df)
amenities_df = pd.DataFrame(index=range(listing_count), columns=amenities_top)

In [14]:
# Inspect the freshly allocated table; every cell is still empty (NaN) at this point.
amenities_df

amenity,wireless internet,heating,kitchen,essentials,smoke detector,air conditioning,tv,shampoo,hangers,carbon monoxide detector,...,cleaning before checkout,fireplace guards,pocket wifi,changing table,hot water kettle,handheld shower head,wide clearance to shower & toilet,baby monitor,firm mattress,private bathroom
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105275,,,,,,,,,,,...,,,,,,,,,,
105276,,,,,,,,,,,...,,,,,,,,,,
105277,,,,,,,,,,,...,,,,,,,,,,
105278,,,,,,,,,,,...,,,,,,,,,,


We will apply the following function to write a 1 wherever a listing has a given amenity; cells left empty are converted to 0 afterwards.

In [15]:
# Row labels 0 .. n-1, one for each listing in df.
rows = range(len(df))

def amenities_func(row, df, amenities_lower):
    """Flag the amenities offered by one listing in the amenities table.

    Parameters
    ----------
    row : hashable
        Row label in ``df`` and, at the same time, the position of the
        listing in ``amenities_lower``.
    df : pandas.DataFrame
        Table with one column per tracked amenity. It is modified in place.
    amenities_lower : sequence of list of str
        Amenity labels per listing, indexable by ``row``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 1 written into every tracked amenity
        column that the listing offers. All other cells are left untouched.
    """
    # Keep only labels that have a column. dict.fromkeys drops repeated labels
    # (a listing may repeat one) while preserving their order.
    offered = [label for label in dict.fromkeys(amenities_lower[row]) if label in df.columns]
    if offered:
        # One write per listing instead of one .loc write per amenity.
        df.loc[row, offered] = 1
    return df

In [16]:
# Fill the table listing by listing; amenities_func writes the 1s for the
# given row and hands the table back.
for listing_row in rows:
    amenities_df = amenities_func(listing_row, amenities_df, amenities_lower)

In [17]:
# Inspect the table after flagging: 1 where a listing has the amenity, NaN elsewhere.
amenities_df

amenity,wireless internet,heating,kitchen,essentials,smoke detector,air conditioning,tv,shampoo,hangers,carbon monoxide detector,...,cleaning before checkout,fireplace guards,pocket wifi,changing table,hot water kettle,handheld shower head,wide clearance to shower & toilet,baby monitor,firm mattress,private bathroom
0,1,1,1,1,1,1,,1,1,1,...,,,,,,,,,,
1,1,1,,,1,1,,,1,1,...,,,,,,,,,,
2,1,1,,,,,,,,,...,,,,,,,,,,
3,1,1,1,1,1,1,1,1,1,,...,,,,,,,,,,
4,1,1,1,1,1,,,1,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105275,1,1,1,1,1,1,1,1,1,,...,,,,,,,,,,
105276,1,1,1,1,1,1,1,1,1,1,...,,,,,,,,,,
105277,1,1,1,1,1,1,1,1,1,1,...,,,,,,,,,,
105278,1,1,1,1,1,1,1,1,1,1,...,,,,,,,,,,


We will change null values to 0.

In [18]:
# Every cell that was never flagged is still NaN; turn those into 0 ("amenity absent").
amenities_df = amenities_df.fillna(0)

In [19]:
# Preview the first rows of the 1/0 amenities table.
amenities_df.head()

amenity,wireless internet,heating,kitchen,essentials,smoke detector,air conditioning,tv,shampoo,hangers,carbon monoxide detector,...,cleaning before checkout,fireplace guards,pocket wifi,changing table,hot water kettle,handheld shower head,wide clearance to shower & toilet,baby monitor,firm mattress,private bathroom
0,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


We will add city and review score columns.

In [20]:
# Attach each listing's city; values are matched to rows by index label.
amenities_df = amenities_df.assign(city=df["City"])

In [27]:
# Attach each listing's review score rating; values are matched to rows by index label.
amenities_df = amenities_df.assign(review_score=df["Review Scores Rating"])

In [28]:
# Preview the final table: amenity flags plus the city and review_score columns.
amenities_df.head()

amenity,wireless internet,heating,kitchen,essentials,smoke detector,air conditioning,tv,shampoo,hangers,carbon monoxide detector,...,pocket wifi,changing table,hot water kettle,handheld shower head,wide clearance to shower & toilet,baby monitor,firm mattress,private bathroom,city,review_score
0,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,San Francisco,92.0
1,1,1,0,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,Los Angeles,95.0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,San Francisco,100.0
3,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,Los Angeles,90.0
4,1,1,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,San Diego,95.0


In [29]:
# amenities_df.to_pickle("C:/Users/Admin/Documents/ironhack/AirBnB_data/airbnb_amenities_usa.pkl")