# Feature Engineering/Preprocessing

In [67]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

In [None]:
eda_df = pd.read_csv('data/processed/EDA.csv') # Add ../ prefix if running notebook directly

In [71]:
eda_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  float64
 10  minimum_nights                  48895 non-null  float64
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [73]:
# Function to identify categorical and numeric variables
def identify_columns(df, categorical_threshold=10, cardinal_threshold=20):
    categorical_variables = [col for col in df.columns if df[col].dtypes == "O"]
    numeric_as_categorical = [
        col for col in df.columns if df[col].nunique() < categorical_threshold and df[col].dtypes != "O"
    ]
    cardinal_variables = [
        col for col in df.columns if df[col].nunique() > cardinal_threshold and df[col].dtypes == "O"
    ]
    categorical_variables = [
        col for col in categorical_variables + numeric_as_categorical if col not in cardinal_variables
    ]
    numeric_variables = [
        col for col in df.columns if df[col].dtypes != "O" and col not in numeric_as_categorical
    ]
    return categorical_variables, numeric_variables

categorical_variables, numeric_variables = identify_columns(eda_df)

In [75]:
# Calculate the total cost for the minimum stay
eda_df['total_cost_min_stay'] = eda_df['price'] * eda_df['minimum_nights']

# Estimate the listing duration based on reviews
eda_df['listing_duration_months'] = eda_df['number_of_reviews'] / eda_df['reviews_per_month']

# Calculate the availability ratio
eda_df['availability_ratio'] = eda_df['availability_365'] / 365

# Compute the daily average reviews
eda_df['daily_reviews_avg'] = eda_df['reviews_per_month'] / 30

# Estimate annual potential earnings
eda_df['annual_potential_income'] = eda_df['price'] * eda_df['availability_365']

# Calculate average stay duration
eda_df['avg_stay_duration'] = eda_df['number_of_reviews'] / eda_df['reviews_per_month']

# Compute the occupancy rate
eda_df['occupancy_rate'] = 365 - eda_df['availability_365']

# Determine the minimum potential earnings
eda_df['min_potential_income'] = eda_df['price'] * eda_df['minimum_nights']

In [77]:
eda_df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'total_cost_min_stay', 'listing_duration_months',
       'availability_ratio', 'daily_reviews_avg', 'annual_potential_income',
       'avg_stay_duration', 'occupancy_rate', 'min_potential_income'],
      dtype='object')

In [79]:
# One Hot Encode variables
df = pd.get_dummies(eda_df, columns=categorical_variables, drop_first=True)
df = df.replace({True: 1, False: 0})
df.head()

  df = df.replace({True: 1, False: 0})


Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,...,last_review_2019-06-28,last_review_2019-06-29,last_review_2019-06-30,last_review_2019-07-01,last_review_2019-07-02,last_review_2019-07-03,last_review_2019-07-05,last_review_2019-07-06,last_review_2019-07-07,last_review_Rare
0,2539,2787,Rare,40.64749,-73.97237,149.0,1.0,9,0.21,6,...,0,0,0,0,0,0,0,0,0,1
1,2595,2845,Midtown,40.75362,-73.98377,225.0,1.0,45,0.38,2,...,0,0,0,0,0,0,0,0,0,1
2,3647,4632,Harlem,40.80902,-73.9419,150.0,3.0,0,1.373221,1,...,0,0,0,0,0,0,0,0,0,0
3,3831,4869,Clinton Hill,40.68514,-73.95976,89.0,1.0,270,4.64,1,...,0,0,0,0,0,0,1,0,0,0
4,5022,7192,East Harlem,40.79851,-73.94399,80.0,10.0,9,0.1,1,...,0,0,0,0,0,0,0,0,0,1


In [80]:
# Scale Numeric Data
numeric_variables = [col for col in numeric_variables if col in df.columns and col not in ["price"]]
scaler = RobustScaler()
df[numeric_variables] = scaler.fit_transform(df[numeric_variables])
df.head()

Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,...,last_review_2019-06-28,last_review_2019-06-29,last_review_2019-06-30,last_review_2019-07-01,last_review_2019-07-02,last_review_2019-07-03,last_review_2019-07-05,last_review_2019-07-06,last_review_2019-07-07,last_review_Rare
0,-0.999721,-0.309108,Rare,-1.03513,-0.356662,149.0,-0.5,0.173913,-0.776923,5.0,...,0,0,0,0,0,0,0,0,0,1
1,-0.999718,-0.309108,Midtown,0.418407,-0.600278,225.0,-0.5,1.73913,-0.646154,1.0,...,0,0,0,0,0,0,0,0,0,1
2,-0.999665,-0.30909,Harlem,1.177155,0.294476,150.0,0.0,-0.217391,0.117863,0.0,...,0,0,0,0,0,0,0,0,0,0
3,-0.999655,-0.309088,Clinton Hill,-0.519482,-0.087189,89.0,-0.5,11.521739,2.630769,0.0,...,0,0,0,0,0,0,1,0,0,0
4,-0.999595,-0.309064,East Harlem,1.033212,0.249813,80.0,1.75,0.173913,-0.861538,0.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
df.to_csv('data/processed/preprocessed_data.csv', index=False) # Add ../ prefix if running notebook directly