In [1]:
import argparse
import logging
import os
import shutil
import matplotlib.pyplot as plt

import mlflow
import json

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline


* 'schema_extra' has been renamed to 'json_schema_extra'


In [167]:
# Read the listings sample from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="sample1.csv")

In [168]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20000 non-null  int64  
 1   name                            19993 non-null  object 
 2   host_id                         20000 non-null  int64  
 3   host_name                       19992 non-null  object 
 4   neighbourhood_group             20000 non-null  object 
 5   neighbourhood                   20000 non-null  object 
 6   latitude                        20000 non-null  float64
 7   longitude                       20000 non-null  float64
 8   room_type                       20000 non-null  object 
 9   price                           20000 non-null  int64  
 10  minimum_nights                  20000 non-null  int64  
 11  number_of_reviews               20000 non-null  int64  
 12  last_review                     

In [6]:
# Categorical columns, grouped by the encoder they are routed to in the
# ColumnTransformer: the first list goes through the ordinal encoder, the
# second through the impute + one-hot pipeline.
ordinal_categorical = ["room_type"]
non_ordinal_categorical = ["neighbourhood_group"]

In [21]:
# Encoder for the ordinal categorical columns.
ordinal_categorical_preproc = OrdinalEncoder()

# Non-ordinal categorical columns: fill missing values with the most frequent
# category, then one-hot encode.
non_ordinal_categorical_preproc = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

In [23]:
# Columns whose missing values are replaced by the constant 0.
zero_imputed = [
    "minimum_nights", "number_of_reviews", "reviews_per_month",
    "calculated_host_listings_count", "availability_365",
    "longitude", "latitude",
]
zero_imputer = SimpleImputer(fill_value=0, strategy="constant")

In [122]:
from sklearn.preprocessing import Normalizer

In [185]:
# One-step pipeline that fills missing values with the constant 0.
zero_imputer = make_pipeline(SimpleImputer(fill_value=0, strategy="constant"))

In [186]:
def delta_date_feature(dates):
    """
    Convert a 2d array of dates into "days before the newest date" features.

    Every column is parsed with pd.to_datetime; each entry is then replaced by
    the number of whole days separating it from the most recent date found in
    the same column. The result is returned as a numpy array of the same shape.
    """
    parsed = pd.DataFrame(dates).apply(pd.to_datetime)

    def _days_before_latest(column):
        # The reference point is the newest date of this column only.
        latest = column.max()
        return (latest - column).dt.days

    return parsed.apply(_days_before_latest, axis=0).to_numpy()


In [187]:
# Missing dates are filled with the constant '2010-01-01'; delta_date_feature
# then turns every date into a day offset from the newest date in its column.
date_imputer = make_pipeline(
    SimpleImputer(fill_value="2010-01-01", strategy="constant"),
    FunctionTransformer(func=delta_date_feature, validate=False, check_inverse=False),
)

In [188]:
# Flatten the imputer output to 1-D before it reaches TfidfVectorizer.
# np.ravel is used instead of np.reshape(..., newshape=-1): the `newshape`
# keyword is deprecated in recent NumPy releases, while np.ravel yields the
# same 1-D result on every NumPy version and remains picklable.
reshape_to_1d = FunctionTransformer(np.ravel)

# Text pipeline for the listing name: empty string for missing names, flatten,
# then TF-IDF restricted to the 10 highest-ranked non-stop-word terms.
name_tfidf = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=""),
    reshape_to_1d,
    TfidfVectorizer(
        binary=False,
        max_features=10,
        stop_words="english",
    ),
)

In [189]:
# Route each group of columns to its own transformer; any column that is not
# listed here is dropped from the output.
preprocessor = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("ordinal_cat", ordinal_categorical_preproc, ordinal_categorical),
        (
            "non_ordinal_cat",
            non_ordinal_categorical_preproc,
            non_ordinal_categorical,
        ),
        ("impute_zero", zero_imputer, zero_imputed),
        ("transform_date", date_imputer, ["last_review"]),
        ("transform_name", name_tfidf, ["name"]),
    ],
)

In [190]:
# Every column consumed by the preprocessor, in transformer order.
processed_features = ordinal_categorical + non_ordinal_categorical + zero_imputed + ["last_review", "name"]

# Random forest regressor. random_state is pinned (same seed as the data
# splits) so that repeated runs of the notebook produce the same model and
# therefore the same validation metrics.
random_Forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features=0.33,
    criterion="squared_error",
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# Inference pipeline with two explicitly named steps: "preprocessor" applies
# the ColumnTransformer stored in `preprocessor`, and "random_forest" applies
# the regressor stored in `random_Forest`. The explicit Pipeline constructor is
# used (rather than make_pipeline) so that the step names can be chosen.
sk_pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("random_forest", random_Forest),
    ]
)

In [191]:
# Keep only rows whose price lies between 10 and 350.
idx = df["price"].between(10, 350)
df = df.loc[idx].copy()

# Keep only rows whose coordinates fall inside the longitude/latitude box.
idx = df["latitude"].between(40.5, 41.2) & df["longitude"].between(-74.25, -73.50)
df = df.loc[idx].copy()

In [192]:
# Hold out 20% of the rows as a test set, stratified by neighbourhood_group.
df_1, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["neighbourhood_group"],
)

In [193]:
# Separate the target (price) from the features without touching df_1.
y = df_1["price"].copy()
X = df_1.drop(columns=["price"])

# Split the remaining rows 80/20 into train and validation sets, stratified
# by neighbourhood_group.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X["neighbourhood_group"],
)

In [205]:
# Display the target series (price) taken from df_1.
y

19170    200
6238      30
16528     50
3361      89
14279     82
        ... 
4635      74
13259    100
11273    135
9681      55
16489     69
Name: price, Length: 15200, dtype: int64

In [194]:
# Fit the whole pipeline (preprocessing + random forest) on the training split,
# using only the columns listed in processed_features.
sk_pipe.fit(X_train[processed_features], y_train)

In [195]:
# Predict on the validation split and derive the mean absolute error.
y_pred = sk_pipe.predict(X_val[processed_features])
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)

# Score of the fitted pipeline on the same validation split.
r_squared = sk_pipe.score(X_val[processed_features], y_val)

In [196]:
# Display the validation score and mean absolute error as a tuple.
r_squared,mae

(0.562894281576592, 33.79820386014395)

In [184]:
# NOTE(review): this cell carries an older execution count than the cell above
# it, so the output shown beneath it comes from an earlier run and does not
# match the current values of r_squared and mae.
r_squared,mae

(0.5077302212480546, 36.50267346046246)

In [197]:
# Run the pipeline's preprocessing step on its own to inspect the transformed
# training features (this re-fits the step on the training split).
aaa = sk_pipe.named_steps["preprocessor"].fit_transform(
    X_train[processed_features], y_train
)

In [198]:
# Print the maximum of every transformed feature column. The column count is
# taken from the matrix itself rather than hard-coded, so the loop stays
# correct if the preprocessing ever yields a different number of features.
for i in range(aaa.shape[1]):
    print(aaa[:, i].max())

2.0
1.0
1.0
1.0
1.0
1.0
999.0
594.0
27.95
327.0
365.0
-73.71795
40.91306
3475.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [211]:
# Display the validation targets as a plain array.
y_val.values

array([190, 190, 145, ...,  90, 200, 195])

In [218]:
# Print "actual,predicted" for every validation row, with the prediction
# rounded to a whole number and padded to width 3.
for actual, predicted in zip(y_val, y_pred):
    print(f'{actual},{predicted:3.0f}')

190,171
190,211
145,170
135, 91
123,142
125,151
145,224
50, 61
263,239
45, 66
130,140
100, 79
100, 71
28, 59
138,165
125,142
50, 60
130,175
96, 94
70, 71
100,138
96,201
46, 71
175,221
147, 98
45,137
60, 62
130,114
190,121
115, 92
55, 60
119,111
90, 94
60, 73
50,165
80, 58
45, 65
76, 88
53, 74
60, 57
48, 76
47, 93
62, 75
295,173
75, 97
94,105
122,156
190,151
120,116
100,135
250,101
165,169
65, 67
40, 66
175,157
110,129
250,215
30, 64
55, 80
137,223
65, 58
105,115
63,102
249,174
250,100
130,113
125, 80
60, 68
65, 89
50, 59
135,199
145,158
38, 62
99, 94
200,178
98,134
50, 67
65, 58
58, 71
200,209
220,186
175,160
40, 93
150,168
300,150
66, 64
150,150
80, 90
50, 76
125,141
55, 73
150,155
109,184
101,195
99, 88
45, 68
50, 86
50, 60
125,131
135,192
150,173
55, 63
122,121
57, 60
250,222
160,165
150,194
78, 68
100, 98
175,170
105, 62
150,108
40, 68
56, 71
250, 66
150, 81
30, 60
271,271
235,212
275,238
82,135
175,100
120,126
95,132
90, 62
49, 59
161,160
69, 80
120,115
80,154
119,118
60, 54
110,1