In [1]:
import pandas as pd
import numpy as np

## Data Cleaning Step 3
#### Goals for this step
- Verify data types and change them if necessary
- Verify list of countries has unique values for each country
- Create individual data frames for countries, USA, and one with amenities as columns, 1 indicating True and 0 indicating False

In [2]:
df = pd.read_csv("C:/Users/Admin/Documents/ironhack/AirBnB_data/airbnb_listings_amenity_count.csv", low_memory=False)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Listing ID,Name,Host ID,Host Name,Host Response Rate,Host Is Superhost,Host total listings count,Street,City,...,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Reviews per month,Amenities_List,Amenity_Count
0,0,5534229,A 2 Passi da San Pietro,28697142,Veronica,100%,False,5.0,00165| Rm 00165| Italy,165,...,90,9.0,10.0,8.0,8.0,9.0,9.0,0.08,"['TV', 'Internet', 'Wireless Internet', 'Air c...",11
1,1,5903406,cosy small apartment,1853799,Veronika,88%,False,2.0,1190| Wien| Austria,1190,...,87,9.0,10.0,10.0,10.0,10.0,8.0,0.27,"['Internet', 'Wireless Internet', 'Kitchen', '...",12
2,2,5203533,Rummelig lejl i hjertet af KBH,24801534,Marianne,100%,False,1.0,Indre By| 1366| Danmark K�benhavn K| Denmark,1366,...,100,10.0,10.0,10.0,10.0,10.0,10.0,0.28,"['TV', 'Wireless Internet', 'Kitchen', 'Buzzer...",8
3,3,8858475,Modern; spacious and warm; with its own balcony,3000397,Lasse,100%,True,1.0,2150| Danmark Nordhavn| Denmark,2150,...,100,10.0,10.0,10.0,10.0,9.0,10.0,0.79,"['TV', 'Cable TV', 'Internet', 'Wireless Inter...",23
4,4,4215511,Nice room; Bohemian Cph; N�rrebro,126020,Lea,100%,True,2.0,N�rrebro| 2200| K�benhavn N 2200| Denmark,2200,...,94,10.0,10.0,9.0,10.0,10.0,10.0,1.4,"['Internet', 'Wireless Internet', 'Kitchen', '...",10


In [4]:
df.dtypes

Unnamed: 0                       int64
Listing ID                       int64
Name                            object
Host ID                          int64
Host Name                       object
Host Response Rate              object
Host Is Superhost               object
Host total listings count      float64
Street                          object
City                            object
Neighbourhood cleansed          object
State                           object
Country                         object
latitude                       float64
longitude                      float64
Property type                   object
Room type                       object
Accommodates                     int64
Bathrooms                      float64
Bedrooms                       float64
Amenities                       object
Price                          float64
Minimum nights                   int64
Maximum nights                   int64
Availability 365                 int64
Calendar last scraped    

In [5]:
df = df.drop(columns="Unnamed: 0")

In [6]:
df.shape

(401816, 37)

### Transforming Listing ID and Host ID to string

In [7]:
df["Listing ID"] = df["Listing ID"].astype("str")

In [8]:
df["Host ID"] = df["Host ID"].astype("str")

### Transforming Host Response Rate and Review Scores Ratings to float


In [9]:
df["Host Response Rate"] = df["Host Response Rate"].str.strip("%").astype("float")

In [10]:
df = df.loc[df["Review Scores Rating"].notna()]

In [11]:
df = df.loc[df["Review Scores Rating"] != "3/14/05"]


In [12]:
df["Review Scores Rating"] = df["Review Scores Rating"].astype("float")

### Transforming Host is Superhost to boolean

In [13]:
df["Host Is Superhost"] = df["Host Is Superhost"].astype("bool")

### Transforming Calendar last scraped and Last Review Date to datetime

In [14]:
df["Calendar last scraped"] = pd.to_datetime(df["Calendar last scraped"])

In [15]:
df["Last Review Date"] = pd.to_datetime(df["Last Review Date"])

In [16]:
df.dtypes

Listing ID                             object
Name                                   object
Host ID                                object
Host Name                              object
Host Response Rate                    float64
Host Is Superhost                        bool
Host total listings count             float64
Street                                 object
City                                   object
Neighbourhood cleansed                 object
State                                  object
Country                                object
latitude                              float64
longitude                             float64
Property type                          object
Room type                              object
Accommodates                            int64
Bathrooms                             float64
Bedrooms                              float64
Amenities                              object
Price                                 float64
Minimum nights                    

### Verifying Countries' Column

In [17]:
set(df["Country"])

{'Australia',
 'Austria',
 'Belgium',
 'Canada',
 'China',
 'Denmark',
 'France',
 'Germany',
 'Greece',
 'Hong Kong',
 'Ireland',
 'Italy',
 'Mexico',
 'Netherlands',
 'Spain',
 'Switzerland',
 'United Kingdom',
 'United States',
 'Uruguay',
 'Vanuatu',
 'Vatican City'}

### Exporting final clean datas to a pickle

1. Countries data set

In [18]:
df.columns

Index(['Listing ID', 'Name', 'Host ID', 'Host Name', 'Host Response Rate',
       'Host Is Superhost', 'Host total listings count', 'Street', 'City',
       'Neighbourhood cleansed', 'State', 'Country', 'latitude', 'longitude',
       'Property type', 'Room type', 'Accommodates', 'Bathrooms', 'Bedrooms',
       'Amenities', 'Price', 'Minimum nights', 'Maximum nights',
       'Availability 365', 'Calendar last scraped', 'Number of reviews',
       'Last Review Date', 'Review Scores Rating', 'Review Scores Accuracy',
       'Review Scores Cleanliness', 'Review Scores Checkin',
       'Review Scores Communication', 'Review Scores Location',
       'Review Scores Value', 'Reviews per month', 'Amenities_List',
       'Amenity_Count'],
      dtype='object')

In [19]:
df_countries = df[['Listing ID', 'Name', 'Host ID', 'Host Name', 'Host Response Rate'
                  ,'Host Is Superhost', 'Host total listings count', 'Country', 'latitude'
                  , 'longitude','Property type', 'Room type', 'Accommodates', 'Bathrooms', 'Bedrooms'
                  , 'Amenities', 'Price', 'Minimum nights', 'Maximum nights', 'Availability 365'
                  , 'Calendar last scraped', 'Number of reviews', 'Last Review Date', 'Review Scores Rating'
                  , 'Review Scores Accuracy', 'Review Scores Cleanliness', 'Review Scores Checkin'
                  , 'Review Scores Communication', 'Review Scores Location', 'Review Scores Value'
                  , 'Reviews per month', 'Amenity_Count']]

In [20]:
df_countries.head()

Unnamed: 0,Listing ID,Name,Host ID,Host Name,Host Response Rate,Host Is Superhost,Host total listings count,Country,latitude,longitude,...,Last Review Date,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Reviews per month,Amenity_Count
0,5534229,A 2 Passi da San Pietro,28697142,Veronica,100.0,False,5.0,Italy,41.895878,12.45443,...,2015-08-29,90.0,9.0,10.0,8.0,8.0,9.0,9.0,0.08,11
1,5903406,cosy small apartment,1853799,Veronika,88.0,False,2.0,Austria,48.246033,16.340743,...,2017-09-09,87.0,9.0,10.0,10.0,10.0,10.0,8.0,0.27,12
2,5203533,Rummelig lejl i hjertet af KBH,24801534,Marianne,100.0,False,1.0,Denmark,55.681579,12.56277,...,2016-07-26,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.28,8
3,8858475,Modern; spacious and warm; with its own balcony,3000397,Lasse,100.0,True,1.0,Denmark,55.708413,12.596737,...,2017-05-07,100.0,10.0,10.0,10.0,10.0,9.0,10.0,0.79,23
4,4215511,Nice room; Bohemian Cph; N�rrebro,126020,Lea,100.0,True,2.0,Denmark,55.687506,12.558365,...,2017-05-08,94.0,10.0,10.0,9.0,10.0,10.0,10.0,1.4,10


In [21]:
df_countries.to_pickle("C:/Users/Admin/Documents/ironhack/AirBnB_data/airbnb_listings_countries.pkl")

2. US data set

In [22]:
df_usa = df[df["Country"] == "United States"].reset_index(drop=True)

In [24]:
df_usa = df_usa[['Listing ID', 'Name', 'Host ID', 'Host Name', 'Host Response Rate',
               'Host Is Superhost', 'Host total listings count', 'Street', 'City',
               'Neighbourhood cleansed', 'State', 'Country', 'latitude', 'longitude',
               'Property type', 'Room type', 'Accommodates', 'Bathrooms', 'Bedrooms',
               'Amenities', 'Price', 'Minimum nights', 'Maximum nights',
               'Availability 365', 'Calendar last scraped', 'Number of reviews',
               'Last Review Date', 'Review Scores Rating', 'Review Scores Accuracy',
               'Review Scores Cleanliness', 'Review Scores Checkin',
               'Review Scores Communication', 'Review Scores Location',
               'Review Scores Value', 'Reviews per month',
               'Amenity_Count']]

In [25]:
df_usa.to_pickle("C:/Users/Admin/Documents/ironhack/AirBnB_data/airbnb_listings_usa.pkl")

3. Amenities data set