># TECHNICAL PAPER AIML CA1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')
sns.set()

from sklearn.base import BaseEstimator, TransformerMixin

# Model evaluation
from sklearn.model_selection import train_test_split, cross_validate, KFold, learning_curve, cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import SCORERS,make_scorer,mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error

# preprocessing
from sklearn.pipeline import Pipeline,make_pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer, PolynomialFeatures, OneHotEncoder, FunctionTransformer,LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import MissingIndicator, SimpleImputer, IterativeImputer

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.cluster import AgglomerativeClustering
# Feature Selection
from sklearn.feature_selection import SelectFromModel, RFECV, SelectKBest, VarianceThreshold, mutual_info_regression, RFE

Source: [Data from here](http://insideairbnb.com/get-the-data/)

For EDA purposes (branca/folium)

In [2]:
# pip install folium
# pip install branca

In [3]:
import folium
import branca.colormap as cm

In [4]:
listings_df = pd.read_csv('datasets/listings.csv')
calendar_df = pd.read_csv('datasets/calendar.csv')
# Merge both
display(listings_df.head(3))
display(calendar_df.head(3))


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15


Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,


In [10]:
print(set(listings_df['id']) - set(calendar_df['listing_id']))
print(set(calendar_df['listing_id']) - set(listings_df['id']))

set()
set()


In [6]:
listings_cnt = listings_df['id'].count()
hosts = len(listings_df['host_id'].unique())

print(f'Dataset contains information on {listings_cnt} listings provided by {hosts} hosts.'  )

Dataset contains information on 3818 listings provided by 2751 hosts.


Drop columns we are not using

In [7]:
listings_df = listings_df.rename(columns={"id": "listing_id"})
df = pd.merge(calendar_df, listings_df, on = 'listing_id')

drops = ['availability_30', 'availability_365', 'availability_60',
    'availability_90', 'available', 'calendar_last_scraped',
    'calendar_updated', 'city', 'cleaning_fee', 'country',
    'country_code', 'description', 'experiences_offered',
    'first_review', 'has_availability', 'host_about',
    'host_acceptance_rate', 'host_id', 'host_location', 'host_name',
    'host_neighbourhood', 'host_picture_url', 'host_thumbnail_url',
    'host_total_listings_count', 'host_url', 'is_location_exact',
    'jurisdiction_names', 'last_review', 'last_scraped',
    'license', 'listing_url', 'market', 'medium_url',
    'monthly_price', 'name', 'neighborhood_overview', 'neighbourhood',
    'neighbourhood_cleansed', 'notes', 'picture_url', 'price_y',
    'requires_license', 'reviews_per_month', 'scrape_id',
    'security_deposit', 'smart_location', 'space', 'square_feet',
    'state', 'street', 'summary', 'thumbnail_url', 'transit',
    'weekly_price', 'xl_picture_url','host_since','extra_people',
     'review_scores_accuracy', 'review_scores_cleanliness',
                         'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                        'review_scores_value','zipcode']
df = df.drop(columns = drops)

In [8]:
df['Month'] = df.apply(lambda i: int(i['date'].split('-')[1]),axis=1)
df['Year'] = df.apply(lambda o: int(o['date'].split('-')[0]),axis=1)
df = df.drop(columns = ['date'])
# string containing dates are converted into numbers

Dropping values from target variable

In [9]:
df = df.dropna(subset=['price_x'])

In [10]:
df['price'] = df['price_x'].astype(str)
df['price'] = df['price'].str.replace("[$, ]", "").astype("float")
df = df.drop(columns = ['price_x'])

In [11]:
def extract(row, column_name, value):
    val = 0.0
    try:
        vals = row[column_name].replace('[', '').replace("'", '').replace("]", '').replace('"', '').replace('{', '').replace('}', '').split(',')
        if value in vals:
            val = 1.0
    except:
        val = 0.0
    return val

In [12]:
amenities_list = list(df.amenities)
amenities_list_string = " ".join(amenities_list)
amenities_list_string = amenities_list_string.replace('{', '')
amenities_list_string = amenities_list_string.replace('}', ',')
amenities_list_string = amenities_list_string.replace('"', '')
amenities_set = [x.strip() for x in amenities_list_string.split(',') if x.strip()]
amenities_set = np.array(set(amenities_set))
amenities_set

array({'Elevator in Building', 'Breakfast', 'Wireless Internet', 'Dog(s)', 'Family/Kid Friendly', 'Hot Tub', 'Internet', 'Carbon Monoxide Detector', 'Laptop Friendly Workspace', 'Buzzer/Wireless Intercom', 'Hair Dryer', 'Fire Extinguisher', 'Lock on Bedroom Door', 'Iron', 'Gym', 'Pool', 'Doorman', 'Smoking Allowed', 'Essentials', 'Air Conditioning', 'Suitable for Events', 'Pets live on this property', 'Dryer', 'Other pet(s)', 'TV', 'Kitchen', 'Heating', 'Free Parking on Premises', '24-Hour Check-in', 'Washer', 'Shampoo', 'Hangers', 'Smoke Detector', 'Wheelchair Accessible', 'Safety Card', 'Pets Allowed', 'First Aid Kit', 'Indoor Fireplace', 'Washer / Dryer', 'Cat(s)', 'Cable TV'},
      dtype=object)

In [13]:
def OHE(df, column_name):
    d = {}
    for unique_value in df[column_name].unique(): 
        for value in unique_value.replace('[', '').replace("'", '').replace("]", '').replace('"', '').replace('{', '').replace('}', '').split(','):
            if value in d:
                d[value] = d[value] + 1
            else:
                d[value] = 0
    values_sorted = sorted(d.items(), key=lambda pair: pair[1], reverse = True)
      
    for value in values_sorted[: 10]:
        df[column_name + value[0]] = df.apply(lambda row: extract(row, column_name, value[0]),axis=1)
        
    return
OHE(df, 'amenities')
OHE(df,'host_verifications')
df = df.drop(columns = ['amenities', 'host_verifications'])

In [14]:
mapdf =df.groupby(['longitude', 'latitude'])[['price']].mean().reset_index()
start = (mapdf['latitude'].max() + mapdf['latitude'].min()) / 2
y_start = (mapdf['longitude'].max() + mapdf['longitude'].min()) / 2

begind = (start, y_start)
cmapz = cm.LinearColormap(colors=["#fde725", '#21918c', '#440154'],vmin=mapdf.price.min(),vmax=mapdf.price.max())
map = folium.Map(location=begind, zoom_start=11,tiles="cartodbpositron")

lat = list(mapdf.latitude)
lon = list(mapdf.longitude)
price = list(mapdf.price)

for loc,p in zip(zip(lat, lon), price):
    folium.Circle(
        location=loc,
        radius=10,
        fill=True,
        color= cmapz(p),
        fill_opacity=0.7
    ).add_to(map)

display(map)

In [15]:
df['host_response_rate_no'] = df['host_response_rate'].astype(str)
df['host_response_rate_no'] = df['host_response_rate_no'].str.replace("%", "").astype("float")
df['host_response_rate_no'].fillna(df['host_response_rate_no'].mean(), inplace = True)
df['host_response_rate_buckets'] = pd.qcut(df['host_response_rate_no'], 5, labels=False, duplicates = 'drop')

In [16]:
df = df.drop(columns = ['host_response_rate', 'host_response_rate_no'])
df['review_scores_rating'].fillna(df['review_scores_rating'].mean(), inplace = True)
df['host_listings_count'].fillna(df['host_listings_count'].mean(), inplace = True)
df['bathrooms'] = df['bathrooms'].fillna(df['bathrooms'].mode()[0])
df['bedrooms'] = df['bedrooms'].fillna(df['bedrooms'].mode()[0])
df['beds'] = df['beds'].fillna(df['beds'].mode()[0])

In [18]:
df.drop(['latitude', 'longitude','listing_id'],axis=1,inplace=True)

In [19]:
X,y = df.drop(columns = 'price'),df[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [20]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,feature):
        self.feature = feature
        self.le_dic = {}
        for el in self.feature:
            self.le_dic[el] = LabelEncoder()

    def fit(self,x,y=None):
        x[self.feature] = x[self.feature].fillna('NaN')
        for el in self.feature:
            a = x[el][x[el]!='NaN']
            self.le_dic[el].fit(a)
        return self
    def transform(self,x,y=None):
        x[self.feature] = x[self.feature].fillna('NaN')
        for el in self.feature:
            a = x[el][x[el]!='NaN']
            b = x[el].to_numpy()
            b[b!='NaN'] = self.le_dic[el].transform(a)
            x[el]=b
        return x

In [21]:
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X[self.names]
cat_cols = df.columns[np.where(pd.DataFrame(df.dtypes == 'object')[0]== True)]
numeric_cols = [i for i in list(X_train.columns) if i not in list(cat_cols)]
preprocess = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric_cols),RobustScaler() )),
        ('categorical', make_pipeline(Columns(names=cat_cols),CustomLabelEncoder(cat_cols)))
    ]))
])
model = Pipeline([
    ('Prep',preprocess),
    ('Impute', IterativeImputer()),
    ('model', TransformedTargetRegressor(RandomForestRegressor(random_state=42),func=np.log,inverse_func=np.expm1))
])
# OneHotEncoder(handle_unknown='ignore',sparse=False)

In [22]:
model.fit(X_train,y_train)

Pipeline(steps=[('Prep',
                 Pipeline(steps=[('features',
                                  FeatureUnion(transformer_list=[('numeric',
                                                                  Pipeline(steps=[('columns',
                                                                                   Columns(names=['host_listings_count',
                                                                                                  'accommodates',
                                                                                                  'bathrooms',
                                                                                                  'bedrooms',
                                                                                                  'beds',
                                                                                                  'guests_included',
                                                                               

In [11]:

preds = model.predict(X_test)
print(mean_absolute_error(y_test,preds))
print(mean_absolute_percentage_error(y_test,preds))
print(mean_squared_error(y_test,preds,squared=False))

NameError: name 'model' is not defined

In [None]:
# numerical_features = ["accommodates", "beds", "bedrooms", "bathrooms_cleaned", "minimum_nights", "availability_365", "review_scores_rating"]
numerical_features = ["accommodates", "beds", "bathrooms_cleaned", "minimum_nights"]
categorical_not_encoded = ["neighbourhood_cleansed", "room_type"]
remainder = [x for x in X_train.columns if (x not in numerical_features and x not in categorical_not_encoded)]
ColumnTransformer(
    [
     ("Scaling", StandardScaler)
    ]
)
scale_impute = Pipeline([
                         ("Scaling", PowerTransformer()),
                         ("Impute Numerical", IterativeImputer(random_state=random_state, initial_strategy="median"))
])

# data_preprocess = Pipeline(
#     [("Standardize and OHE", ColumnTransformer(
#         [
#          ("Scaling and Impute", scale_impute, numerical_features),
#          ("One Hot Encode and Impute", ohe_impute, categorical_not_encoded)
#         ]
#     , remainder="passthrough")),
#      ("To Dense Array", FunctionTransformer(lambda x : x.todense(), accept_sparse=True))
#     ]
# )

data_preprocess = Pipeline(
    [("Imputation and Scaling", ColumnTransformer([
                                                  ("Scaling and Impute Numerical", scale_impute, numerical_features),
                                                  ("Impute Categorical", SimpleImputer(strategy="most_frequent"), categorical_not_encoded)
    ], remainder="passthrough")),
    ("To DataFrame", FunctionTransformer(func=pd.DataFrame, kw_args={
    "columns" : numerical_features + categorical_not_encoded + remainder
})),
    ("OHE", ColumnTransformer(
        [
         ("OHE", OneHotEncoder(handle_unknown="ignore"), categorical_not_encoded)
        ]
    , remainder="passthrough"))]
)
X_train_processed = data_preprocess.fit_transform(X_train)
feature_names = data_preprocess[-1].get_feature_names_out(numerical_features + categorical_not_encoded + remainder) 

In [None]:
# def outlierDetection(df,col):
#     col = df[col].apply(np.log1p)
#     q1= col.quantile(0.25)
#     q3 = col.quantile(0.75)
#     IQR = q3-q1
#     upperFence = q3 + 1.5 * IQR
#     lowerFence = q1 - 1.5 * IQR
#     mask = (col > upperFence) | (col < lowerFence)
#     return mask.values
# extreme_outlier = outlierDetection(airbnb_df, "price")
# airbnb_df['outlier'] = outlierDetection(airbnb_df, "price")
# extreme_outlier = airbnb_df[extreme_outlier]
# airbnb_df = airbnb_df[airbnb_df["outlier"] == 0]
# airbnb_df.reset_index(inplace=True)
# airbnb_df.drop('outlier',axis=1,inplace=True)