#Import dataset


## Import Library

In [7]:
import pandas as pd
import numpy as np
import zipfile
import os

In [4]:
!pip install kaggle



## Load Dataset from Kaggle

You will need to upload your Kaggle API key (kaggle.json) to download the dataset. You can obtain this from your Kaggle account settings.

In [11]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"kaylaaisya","key":"b92c1f6e772ea6b87b19c608e7b8c4b6"}'}

Now I'll download the dataset using the Kaggle API

In [13]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Replace 'kaggle_dataset_name' with the actual dataset name
!kaggle datasets download -d arianazmoudeh/airbnbopendata

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Dataset URL: https://www.kaggle.com/datasets/arianazmoudeh/airbnbopendata
License(s): ODbL-1.0
airbnbopendata.zip: Skipping, found more recently modified local copy (use --force to force download)


In [17]:
# Unzip the dataset
with zipfile.ZipFile('airbnbopendata.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Find the CSV file in the extracted files (assuming there's only one or it's named predictably)
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
if csv_files:
    csv_file_path = csv_files[0]
    df = pd.read_csv(csv_file_path)
    display(df.head())
else:
    print("No CSV file found in the extracted data.")

  df = pd.read_csv(csv_file_path)


Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,...,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


# Data Cleaning Steps

1. Deleting Redundant Columns
2. Renaming Columns
3. Dropping Duplicates
4. Removing Missing Values
5. Clean Each Columns
6. Create Cleaned Dataset

# 1. Deleting Redundant Columns

*   'country' and 'country code' are the same in all rows
*   'id' and 'host id' are both unique identifiers, but we will keep both of themm for easier searching process



In [18]:
df_drop = df.drop(['country', 'country code'], axis=1)

# 2. Renaming Columns

Modify each columns to snake case for consistency

In [19]:
def to_snake_case(name):
    """Converts a string to snake_case."""
    name = name.replace(' ', '_').replace('-', '_').lower()
    name = ''.join(c for c in name if c.isalnum() or c == '_')
    return name

# Rename columns to snake case
df_drop.columns = [to_snake_case(col) for col in df_drop.columns]

# Display the new column names
print("New column names:")
print(df_drop.columns)

New column names:
Index(['id', 'name', 'host_id', 'host_identity_verified', 'host_name',
       'neighbourhood_group', 'neighbourhood', 'lat', 'long',
       'instant_bookable', 'cancellation_policy', 'room_type',
       'construction_year', 'price', 'service_fee', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'review_rate_number', 'calculated_host_listings_count',
       'availability_365', 'house_rules', 'license'],
      dtype='object')


# 3. Dropping Duplicates

In [20]:
df_drop.duplicated().value_counts()

Unnamed: 0,count
False,102058
True,541


In [21]:
df_drop.drop_duplicates(inplace = True)
df_drop.duplicated().value_counts()

Unnamed: 0,count
False,102058


# 4. Removing Missing Values

In [22]:
df_drop.isna().sum()

Unnamed: 0,0
id,0
name,250
host_id,0
host_identity_verified,289
host_name,404
neighbourhood_group,29
neighbourhood,16
lat,8
long,8
instant_bookable,105


In [23]:
# Calculate the percentage of missing values for each column
missing_values_percentage = (df.isnull().sum() / len(df)) * 100

print("Percentage of missing values per column:")
print(missing_values_percentage)

Percentage of missing values per column:
id                                 0.000000
NAME                               0.243667
host id                            0.000000
host_identity_verified             0.281679
host name                          0.395715
neighbourhood group                0.028265
neighbourhood                      0.015595
lat                                0.007797
long                               0.007797
country                            0.518524
country code                       0.127682
instant_bookable                   0.102340
cancellation_policy                0.074075
room type                          0.000000
Construction year                  0.208579
price                              0.240743
service fee                        0.266084
minimum nights                     0.398639
number of reviews                  0.178364
last review                       15.490404
reviews per month                 15.476759
review rate number                 

'house_rules' and 'license' have more than half of the row missing. If we remove every rows with their values missing, we will lose many information. Instead we will drop 'house_rules' and 'license' instead.

In [24]:
df_drop = df_drop.drop(['house_rules', 'license'], axis=1)
print(df_drop.columns)

Index(['id', 'name', 'host_id', 'host_identity_verified', 'host_name',
       'neighbourhood_group', 'neighbourhood', 'lat', 'long',
       'instant_bookable', 'cancellation_policy', 'room_type',
       'construction_year', 'price', 'service_fee', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'review_rate_number', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')


Drop missing values

In [25]:
df_clean = df_drop.dropna()
display(df_clean.head())

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,...,2020.0,$966,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,False,...,2007.0,$142,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,False,...,2009.0,$204,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0
5,1004098,Large Cozy 1 BR Apartment In Midtown East,45498551794,verified,Michelle,Manhattan,Murray Hill,40.74767,-73.975,True,...,2013.0,$577,$115,3.0,74.0,6/22/2019,0.59,3.0,1.0,374.0
7,1005202,BlissArtsSpace!,90821839709,unconfirmed,Emma,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,False,...,2009.0,"$1,060",$212,45.0,49.0,10/5/2017,0.4,5.0,1.0,219.0


In [26]:
df_clean.isna().sum()

Unnamed: 0,0
id,0
name,0
host_id,0
host_identity_verified,0
host_name,0
neighbourhood_group,0
neighbourhood,0
lat,0
long,0
instant_bookable,0


# 5. Clean Each Columns

In [27]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83847 entries, 0 to 102040
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              83847 non-null  int64  
 1   name                            83847 non-null  object 
 2   host_id                         83847 non-null  int64  
 3   host_identity_verified          83847 non-null  object 
 4   host_name                       83847 non-null  object 
 5   neighbourhood_group             83847 non-null  object 
 6   neighbourhood                   83847 non-null  object 
 7   lat                             83847 non-null  float64
 8   long                            83847 non-null  float64
 9   instant_bookable                83847 non-null  object 
 10  cancellation_policy             83847 non-null  object 
 11  room_type                       83847 non-null  object 
 12  construction_year               8384

##Change Column Type

* 'price' and 'service fee' is in a dollar, but they still have a object type. we will convert these columns into float

In [29]:
# Remove '$' and ',' from 'price' and 'service fee' and convert to float
df_clean['price'] = df_clean['price'].astype(str).str.replace('[$,]', '', regex=True)
df_clean['price'] = pd.to_numeric(df_clean['price'])

df_clean['service_fee'] = df_clean['service_fee'].astype(str).str.replace('[$,]', '', regex=True)
df_clean['service_fee'] = pd.to_numeric(df_clean['service_fee'])

# Display the data types to confirm the change
print(df_clean[['price', 'service_fee']].dtypes)
display(df_clean.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['price'] = df_clean['price'].astype(str).str.replace('[$,]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['price'] = pd.to_numeric(df_clean['price'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['service_fee'] = df_clean['service_fee'].astype(str).str.

price          int64
service_fee    int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['service_fee'] = pd.to_numeric(df_clean['service_fee'])


Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,...,2020.0,966,193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,False,...,2007.0,142,28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,False,...,2009.0,204,41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0
5,1004098,Large Cozy 1 BR Apartment In Midtown East,45498551794,verified,Michelle,Manhattan,Murray Hill,40.74767,-73.975,True,...,2013.0,577,115,3.0,74.0,6/22/2019,0.59,3.0,1.0,374.0
7,1005202,BlissArtsSpace!,90821839709,unconfirmed,Emma,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,False,...,2009.0,1060,212,45.0,49.0,10/5/2017,0.4,5.0,1.0,219.0


* Convert 'last_review' to datetime

In [30]:
df_clean['last_review'] = pd.to_datetime(df_clean['last_review'], errors='coerce')
display(df_clean.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['last_review'] = pd.to_datetime(df_clean['last_review'], errors='coerce')


Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,...,2020.0,966,193,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,False,...,2007.0,142,28,30.0,45.0,2022-05-21,0.38,4.0,2.0,228.0
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,False,...,2009.0,204,41,10.0,9.0,2018-11-19,0.1,3.0,1.0,289.0
5,1004098,Large Cozy 1 BR Apartment In Midtown East,45498551794,verified,Michelle,Manhattan,Murray Hill,40.74767,-73.975,True,...,2013.0,577,115,3.0,74.0,2019-06-22,0.59,3.0,1.0,374.0
7,1005202,BlissArtsSpace!,90821839709,unconfirmed,Emma,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,False,...,2009.0,1060,212,45.0,49.0,2017-10-05,0.4,5.0,1.0,219.0


* Remove white spaces in 'name'

In [31]:
df_clean['name'] = df_clean['name'].str.strip().str.replace('\s+', ' ', regex=True)

display(df_clean['name'].head())

  df_clean['name'] = df_clean['name'].str.strip().str.replace('\s+', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['name'] = df_clean['name'].str.strip().str.replace('\s+', ' ', regex=True)


Unnamed: 0,name
0,Clean & quiet apt home by the park
1,Skylit Midtown Castle
4,Entire Apt: Spacious Studio/Loft by central park
5,Large Cozy 1 BR Apartment In Midtown East
7,BlissArtsSpace!


* Change 'lat' and 'long' to text

In [32]:
df_clean['lat'] = df_clean['lat'].astype(str)
df_clean['long'] = df_clean['long'].astype(str)

print(df_clean[['lat', 'long']].dtypes)
display(df_clean.head())

lat     object
long    object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['lat'] = df_clean['lat'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['long'] = df_clean['long'].astype(str)


Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,...,2020.0,966,193,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,False,...,2007.0,142,28,30.0,45.0,2022-05-21,0.38,4.0,2.0,228.0
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,False,...,2009.0,204,41,10.0,9.0,2018-11-19,0.1,3.0,1.0,289.0
5,1004098,Large Cozy 1 BR Apartment In Midtown East,45498551794,verified,Michelle,Manhattan,Murray Hill,40.74767,-73.975,True,...,2013.0,577,115,3.0,74.0,2019-06-22,0.59,3.0,1.0,374.0
7,1005202,BlissArtsSpace!,90821839709,unconfirmed,Emma,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,False,...,2009.0,1060,212,45.0,49.0,2017-10-05,0.4,5.0,1.0,219.0


##Check Data Validity

In [33]:
check_id = df_clean[(df_clean['id'] > 9999999) &
                    (df_clean['id'] < 1000000)]

check_host_id = df_clean[(df_clean['host_id'] > 99999999999) &
                        (df_clean['host_id'] < 10000000000)]

check_host_identity = df_clean[(df_clean['host_identity_verified'] != 'unconfirmed' ) &
                              (df_clean['host_identity_verified'] != 'verified')]

check_neighborhood_group = df_clean[(df_clean['neighbourhood_group'] != 'Brooklyn') &
                                    (df_clean['neighbourhood_group'] != 'Manhattan') &
                                    (df_clean['neighbourhood_group'] != 'Queens') &
                                    (df_clean['neighbourhood_group'] != 'Bronx') &
                                    (df_clean['neighbourhood_group'] != 'Staten Island')]

check_instant_book = df_clean[(df_clean['instant_bookable'] != True) &
                              (df_clean['instant_bookable'] != False)]

check_cancellation = df_clean[(df_clean['cancellation_policy'] != 'strict') &
                              (df_clean['cancellation_policy'] != 'moderate') &
                              (df_clean['cancellation_policy'] != 'flexible')]

check_room_type = df_clean[(df_clean['room_type'] != 'Entire home/apt') &
                          (df_clean['room_type'] != 'Private room') &
                           (df_clean['room_type'] != 'Shared room') &
                           (df_clean['room_type'] != 'Hotel room')]

check_year = df_clean[(df_clean['construction_year'] > 2022)]

check_night = df_clean[df_clean['minimum_nights'] < 1]

check_last_review = df_clean[df_clean['last_review'].dt.year > 2022]

check_month_review = df_clean[df_clean['reviews_per_month'] < 0]

check_rate_review = df_clean[(df_clean['review_rate_number'] < 1) |
                            (df_clean['review_rate_number'] > 5)]

check_listing = df_clean[df_clean['calculated_host_listings_count'] < 0]

check_availability = df_clean[(df_clean['availability_365'] < 0)]

print('invalid values in each columns')
print('ID:',len(check_id))
print('host ID:',len(check_host_id))
print('host identity:',len(check_host_identity))
print('neighborhood group:',len(check_neighborhood_group))
print('instant bookable:',len(check_instant_book))
print('cancellation policy:',len(check_cancellation))
print('room type:',len(check_room_type))
print('construction year:',len(check_year))
print('minimum nights:',len(check_night))
print('last review:',len(check_last_review))
print('reviews per month:',len(check_month_review))
print('review rate number:',len(check_rate_review))
print('calculated host listings count:',len(check_listing))
print('availability:',len(check_availability))

invalid values in each columns
ID: 0
host ID: 0
host identity: 0
neighborhood group: 1
instant bookable: 0
cancellation policy: 0
room type: 0
construction year: 0
minimum nights: 10
last review: 0
reviews per month: 0
review rate number: 0
calculated host listings count: 0
availability: 330


There are several rows of data that have invalid values in 'neighborhood_group', 'minimum nights', and 'availability' columns.

* Neighborhood Columns

In [34]:
check_neighborhood_group

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
13,1008516,"Lovely Room 1, Garden, Best Area, Legal rental",26802410424,verified,Darcy,brookln,South Slope,40.66829,-73.98779,True,...,2010.0,580,116,4.0,167.0,2019-06-24,1.34,4.0,3.0,47.0


This is a typo. We will replace neighborhood value in this row of data into the correct value, which is 'Brooklyn'

In [35]:
if not check_neighborhood_group.empty:
    row_index_to_correct = check_neighborhood_group.index[0]
    df_clean.loc[row_index_to_correct, 'neighbourhood_group'] = 'Brooklyn'

In [36]:
check_neighborhood_group = df_clean[(df_clean['neighbourhood_group'] != 'Brooklyn') &
                                    (df_clean['neighbourhood_group'] != 'Manhattan') &
                                     (df_clean['neighbourhood_group'] != 'Queens') &
                                      (df_clean['neighbourhood_group'] != 'Bronx') &
                                       (df_clean['neighbourhood_group'] != 'Staten Island')]

check_neighborhood_group

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365


* Minimum Nights

In [37]:
print(len(check_night))
check_night

10


Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
176,1098541,BROOKLYN VICTORIAN STYLE SUITE.....,83631499592,unconfirmed,Frederick,Brooklyn,Fort Greene,40.69098,-73.97113,False,...,2008.0,1155,231,-10.0,213.0,2019-06-24,2.0,5.0,2.0,19.0
398,1221151,SUPER BIG AND COZY PRIVATE BEDROOM,50336791874,verified,Tucker,Brooklyn,Kensington,40.64302,-73.97255,False,...,2015.0,779,156,-1.0,82.0,2019-05-19,0.94,2.0,2.0,131.0
421,1233854,Charming Nolita Apartment!!,7389895192,verified,Belinda,Manhattan,Nolita,40.72094,-73.99706,False,...,2008.0,874,175,-10.0,68.0,2019-06-10,0.69,5.0,1.0,13.0
441,1244900,Cozy apartment in a brownstone,81186886194,verified,Adelaide,Manhattan,Harlem,40.80497,-73.95016,False,...,2021.0,920,184,-12.0,203.0,2019-07-06,2.14,5.0,3.0,77.0
478,1265335,Charming upper west side apartment,89878315253,unconfirmed,Alen,Manhattan,Upper West Side,40.77886,-73.98042,True,...,2022.0,410,82,-2.0,129.0,2019-06-07,1.33,2.0,1.0,381.0
525,1291294,Chateau Style Brooklyn Loft for Singles or Cou...,2631536622,verified,Carlos,Brooklyn,Bedford-Stuyvesant,40.68967,-73.95445,False,...,2022.0,413,83,-3.0,42.0,2019-05-18,0.44,5.0,1.0,292.0
42500,24474086,2bd BOUTIQUE Apartament in the heart of MANHATTAN,2679070022,unconfirmed,Tom,Manhattan,Hell's Kitchen,40.76694,-73.98773,True,...,2009.0,711,142,-365.0,13.0,2019-07-07,5.91,4.0,4.0,0.0
42538,24495073,Newly Renovated Garden Apartment,98469733112,verified,Margie,Brooklyn,Bedford-Stuyvesant,40.6847,-73.9435,True,...,2022.0,85,17,-200.0,3.0,2019-04-23,1.06,2.0,1.0,157.0
69749,39523709,Amazing location! 10ft from L train,62132542936,verified,Giorgia & Benjamin,Brooklyn,Williamsburg,40.71534,-73.94906,False,...,2012.0,328,66,-125.0,146.0,2019-06-23,1.78,1.0,1.0,46.0
91357,51457807,Cozy Brooklyn Apartment,63988893317,verified,Daniella,Brooklyn,Gowanus,40.6707,-73.99118,True,...,2014.0,1177,235,-10.0,9.0,2017-05-08,0.31,2.0,1.0,0.0


These rows of data have invalid minimum nights because of their negative values. Since there are only 10 invalid rows, we wont be missing a lot of information if we drop them.

In [38]:
drop_condition = df_clean['minimum_nights'] < 1
df_clean = df_clean[~drop_condition]

check_night = df_clean[df_clean['minimum_nights'] < 1]
print(len(check_night))

0


* Availability 365

In [39]:
print(len(check_availability))
check_availability

330


Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,...,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365
490,1271963,Gorgeous 1 bdrm in huge duplex!,80917711958,verified,Julian,Manhattan,Harlem,40.80224,-73.94558,True,...,2016.0,424,85,2.0,17.0,2015-10-01,0.18,3.0,2.0,-9.0
529,1293503,Columbus Circle Luxury Bldg - Private Room&Bath,86417259099,verified,Derek,Manhattan,Hell's Kitchen,40.7709,-73.99181,False,...,2021.0,619,124,28.0,43.0,2019-06-29,0.45,5.0,2.0,-4.0
561,1311176,Bright Modern Charming Housebarge,97069047757,verified,Luke,Brooklyn,Sheepshead Bay,40.58422,-73.94079,True,...,2007.0,441,88,4.0,128.0,2016-11-15,1.34,2.0,2.0,-8.0
621,1344314,Private Room With GREAT Location,41389421098,unconfirmed,Hunt,Queens,Long Island City,40.74581,-73.95295,True,...,2016.0,1112,222,30.0,65.0,2017-07-31,0.74,1.0,3.0,-6.0
683,1378557,Superior @ Box House,17112437857,verified,Murray,Brooklyn,Greenpoint,40.73731,-73.9545,False,...,2016.0,232,46,3.0,4.0,2015-12-04,0.05,2.0,28.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86504,48777495,Upscale 1 Bedroom Hell's Kitchen Apartment,50442314156,verified,Kara,Manhattan,Hell's Kitchen,40.76045,-73.9981,True,...,2017.0,471,94,30.0,1.0,2017-04-20,0.04,4.0,121.0,-2.0
86555,48805663,"Big, airy room in beautiful apartment",46518861360,verified,Sylvia,Brooklyn,Flatbush,40.65251,-73.96325,True,...,2007.0,158,32,4.0,1.0,2017-07-03,0.04,5.0,1.0,-6.0
86675,48871939,Private Condo Room w/ Patio,35368820987,verified,Agustina,Bronx,Longwood,40.81937,-73.90978,False,...,2005.0,614,123,2.0,68.0,2019-05-30,1.83,2.0,1.0,-7.0
86735,48905077,Comfortable 2 BR in East Village/Cooper Square,66319244844,unconfirmed,Cooper,Manhattan,East Village,40.72627,-73.99145,True,...,2005.0,335,67,3.0,27.0,2019-04-23,0.76,5.0,1.0,-4.0


In [40]:
drop_condition = df_clean['availability_365'] < 1
df_clean = df_clean[~drop_condition]

check_availability = df_clean[(df_clean['availability_365'] < 0)]
print(len(check_availability))

0


#Create Cleaned Dataset

In [41]:
df_clean.to_csv('clean_dataset.csv', index=False)