In [17]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from autoviz import AutoViz_Class
from feature_engine.encoding import RareLabelEncoder

In [18]:
DATA = '../data/data.csv'

df0 = pd.read_csv(DATA, index_col='id')
df1 = pd.read_csv(DATA, index_col='id')
use_cols = ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood',
       'latitude', 'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm', 'license']
df2 = pd.read_csv(DATA, usecols=use_cols, index_col='id')
df = pd.concat([df0, df1, df2], axis=0).drop_duplicates()
del df0, df1, df2
print(df.shape)

df.describe().T
df[df['price']>50000].T

(41516, 21)


id,17160286,605115521796576121,17160286.1,605115521796576121.1
name,Rental unit in Brooklyn · ★4.48 · 1 bedroom · 1 bed · 1 shared bath,Rental unit in Brooklyn · ★4.33 · 1 bedroom · 1 bed · 1 shared bath,Rental unit in Brooklyn · ★4.48 · 1 bedroom · 1 bed · 1 shared bath,Rental unit in Brooklyn · ★4.33 · 1 bedroom · 1 bed · 1 shared bath
host_id,110361431,110361431,110361431,110361431
host_name,Bobbi,Bobbi,Bobbi,Bobbi
neighbourhood_group,Brooklyn,Brooklyn,Brooklyn,Brooklyn
neighbourhood,Bedford-Stuyvesant,Bedford-Stuyvesant,Bedford-Stuyvesant,Bedford-Stuyvesant
latitude,40.69085,40.69254,40.69085,40.69254
longitude,-73.93806,-73.93636,-73.93806,-73.93636
room_type,Private room,Private room,Private room,Private room
price,100000.0,100000.0,100000.0,100000.0
minimum_nights,30,30,30,30


In [19]:
AV = AutoViz_Class()

# Initialize variables
filename = ""  # Specify the filename of the dataset (empty in this case)
target_variable = 'price'  # Specify the target variable for analysis
custom_plot_dir = "custom_plot_directory"  # Specify the directory to save custom plots

# Perform automated EDA using the AutoViz library
# The following parameters are used:
# - filename: Empty in this case as the data is provided directly as 'df'
# - sep: Delimiter used in the data (comma in this case)
# - depVar: Target variable for analysis ('rating' in this case)
# - dfte: DataFrame to be analyzed ('df' is assumed to be defined earlier)
# - header: Indicates that the first row contains column names (0 for True)
# - verbose: Verbosity level (1 for verbose output)
# - lowess: Smoothing using Lowess algorithm (False for no smoothing)
# - chart_format: Format in which charts will be generated (HTML format in this case)
# - max_rows_analyzed: Maximum number of rows to analyze (up to 10,000 rows)
# - max_cols_analyzed: Maximum number of columns to analyze (up to 50 columns)
# - save_plot_dir: Directory to save the generated plots ('custom_plot_directory' in this case)
try:
    dft = AV.AutoViz(
        filename,
        sep=",",
        depVar=target_variable,
        dfte=df,
        header=0,
        lowess=False,
        chart_format="html",
        max_rows_analyzed=min([df.shape[0], 10**3]),
        max_cols_analyzed=min([df.shape[1], 50]),
        save_plot_dir=custom_plot_dir
    )
except Exception as e:
    print(f"Exception: {e}")

    Since nrows is smaller than dataset, loading random sample of 1000 rows into pandas...
Shape of your Data Set loaded: (1000, 21)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  4
    Number of Integer-Categorical Columns =  6
    Number of String-Categorical Columns =  6
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  4
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    20 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
Si

Saving scatterplots in HTML format
                                     

Saving pair_scatters in HTML format
                                               





Saving distplots_cats in HTML format
                                               

Saving distplots_nums in HTML format
                                     

Saving kde_plots in HTML format


Saving violinplots in HTML format


No date vars could be found in data set


Saving heatmaps in HTML format


Saving cat_var_plots in HTML format
                                               



Time to run AutoViz (in seconds) = 5


In [20]:
# Import the necessary library for displaying HTML content
from IPython.core.display import display, HTML

# Import the pathlib library to work with file paths
from pathlib import Path

# Initialize an empty list to store file names
file_names = []

# Use pathlib to iterate through HTML files in a specific directory
for file in Path(f'../src/{custom_plot_dir}/{target_variable}/').glob('*.html'):
    
    # Extract the filename from the full path and add it to the list
    filename = str(file).split('/')[-1]
    file_names.append(filename)

# Iterate through the list of file names and display each HTML file
for file_name in file_names:
    
    # Construct the full file path for each HTML file
    file_path = f'../src/{custom_plot_dir}/{target_variable}/{file_name}'
    # Open the HTML file for reading
    with open(file_path, 'r') as file:
        
        # Read the content of the HTML file
        html_content = file.read()
        
        # Display the HTML content using IPython
        display(HTML(html_content))

In [21]:
main_label = 'price'
# Exclude 1% of smallest and 1% of highest prices
P = np.percentile(df[main_label], [1, 99])
df = df[(df[main_label] > P[0]) & (df[main_label] < P[1])]
# combine neighbourbood and neighbourhood_group
df['neighbourhood'] = df['neighbourhood'] + ', ' + df['neighbourhood_group']
# log10-transform columns and group for larger bins
for col in ['minimum_nights', 'number_of_reviews', 'calculated_host_listings_count', 'availability_365']:
    df[f'log10_{col}'] = df[col].apply(lambda x: 1/5*round(5*np.log10(1+x)))
    df = df.drop([col], axis=1)
# set up the rare label encoder limiting number of categories to max_n_categories
for col in ['neighbourhood', 'room_type']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=120, replace_with='Other', tol=20/df.shape[0])
    df[col] = encoder.fit_transform(df[[col]])
# drop unused columns
cols2drop = ['name', 'host_id', 'host_name', 'latitude', 'longitude', 'license', 'neighbourhood_group', 
             'last_review', 'reviews_per_month', 'number_of_reviews_ltm']
df = df.drop(cols2drop, axis=1)
df = df.dropna()
print(df.shape)
df.sample(5).T

(20300, 11)


id,1134365,21596559,978944089921352176,292800,41425624
neighbourhood,"Red Hook, Brooklyn","Flushing, Queens","Midtown, Manhattan","West Village, Manhattan","Midtown, Manhattan"
room_type,Entire home/apt,Private room,Private room,Entire home/apt,Private room
price,210.0,70.0,41.0,160.0,815.0
rating,5.0,4.83,4.0,4.87,No rating
bedrooms,2,1,1,1,1
beds,2.0,1.0,1.0,1.0,1.0
baths,1,1,2,1,1
log10_minimum_nights,1.4,1.4,0.4,1.8,0.4
log10_number_of_reviews,1.0,1.8,1.0,2.0,0.4
log10_calculated_host_listings_count,0.4,0.6,1.8,0.4,1.0


In [36]:
y = df[main_label].values.reshape(-1,)
X = df.drop([main_label], axis=1)
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# initialize Pool
train_pool = Pool(X_train, 
                  y_train, 
                  cat_features=cat_cols_idx)
test_pool = Pool(X_test,
                 y_test,
                 cat_features=cat_cols_idx)
# specify the training parameters 
model = CatBoostRegressor(iterations=5000, 
                          depth=5,
                          verbose=0,
                          learning_rate=0.001, 
                          loss_function='RMSE')
# train the model
model.fit(train_pool)
# make the prediction using the resulting model
y_train_pred = model.predict(train_pool)
y_test_pred = model.predict(test_pool)

rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
print(f"RMSE score for train {round(rmse_train,2)} USD, and for test {round(rmse_test,2)} USD")

print(pd.DataFrame(y_train_pred))
print(pd.DataFrame(y_test_pred))

RMSE score for train 89.86 USD, and for test 87.68 USD
            0    
0      108.452027
1      178.515943
2      114.351797
3      100.815472
4      181.948561
...           ...
16235  128.059781
16236  423.089024
16237   86.692881
16238  198.525532
16239  197.790493

[16240 rows x 1 columns]
           0    
0     125.004015
1     218.373070
2     170.897000
3     118.460198
4     152.106155
...          ...
4055  145.465856
4056  203.247697
4057  157.716364
4058  224.685809
4059   77.669867

[4060 rows x 1 columns]


In [23]:
rmse_bs_train = mean_squared_error(y_train, [np.mean(y_train)]*len(y_train), squared=False)
rmse_bs_test = mean_squared_error(y_test, [np.mean(y_train)]*len(y_test), squared=False)
print(f"RMSE baseline score for train {round(rmse_bs_train, 2)} USD, and for test {round(rmse_bs_test, 2)} USD")

RMSE baseline score for train 126.98 USD, and for test 125.67 USD
