In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from random import sample
from datetime import datetime
from statistics import mean

pd.options.display.max_columns = None

In [2]:
df_listings = pd.read_parquet('data/trusted/training/listing.parquet')

In [3]:
subpref = [i for i in df_listings.columns if 'subprefeitura_' in i]
amenities = [i for i in df_listings.columns if 'amenities_' in i]
room_type = [i for i in df_listings.columns if 'room_type_' in i]
bathroom_description = [i for i in df_listings.columns if 'bathroom_description_' in i]
property_type = [i for i in df_listings.columns if 'property_type_' in i]
extras = ['number_of_bathrooms',
          'bedrooms',
          'beds',
          'has_availability',
          'availability_30',
          'availability_60',
          'availability_90',
          'availability_365',
          'number_of_reviews_l30d',
          'number_of_reviews_ltm',
          'number_of_reviews',
          'last_review',
          'review_scores_rating',
          'review_scores_accuracy',
          'review_scores_cleanliness',
          'review_scores_checkin',
          'review_scores_communication',
          'review_scores_location',
          'review_scores_value']

features = subpref+amenities+room_type+bathroom_description+property_type+extras

preprocessa

In [4]:
for col in ["number_of_bathrooms",
            "bedrooms",
            "beds",
            "review_scores_rating",
            "review_scores_accuracy",
            "review_scores_cleanliness",
            "review_scores_checkin",
            "review_scores_communication",
            "review_scores_location",
            "review_scores_value"]:
    has_col = f'has_{col}'
    df_listings[has_col] = ~df_listings[col].isna()
    features.append(has_col)
    df_listings[col] = df_listings[col].fillna(-1)

In [5]:
compiled_date = datetime(2021, 3, 21)
df_listings['last_review'] = (compiled_date - df_listings['last_review']).apply(lambda d: d.days)
df_listings['has_last_review'] = ~df_listings.last_review.isna()
df_listings['last_review'] = df_listings['last_review'].fillna(-1)
features.append('has_last_review')

In [6]:
def separa_datasets(df, id_column, id_list):
    return df[df[id_column].isin(id_list)].copy().reset_index(drop=True)

In [7]:
ids = list(df_listings.id)

validation_size = 1000
validation_index = sample(range(len(ids)), validation_size)

validation_ids = {listing_id for index, listing_id in enumerate(ids) if index in validation_index}
training_ids   = {listing_id for index, listing_id in enumerate(ids) if index not in validation_index}

validation_listing  = separa_datasets(df_listings, 'id', validation_ids)
training_listing    = separa_datasets(df_listings, 'id', training_ids)

In [8]:
def model_fit(df, features):
    df_premium = df[df.subprefeitura.isin(['Sul', 'Barra da Tijuca'])]
    df_comum = df[~df.subprefeitura.isin(['Sul', 'Barra da Tijuca'])]
    clf_comum = RandomForestClassifier(n_estimators=30).fit(df_comum[features], df_comum['price'])
    clf_premium = RandomForestClassifier(n_estimators=30).fit(df_premium[features], df_premium['price'])
    return {'comum': clf_comum, 'premium': clf_premium}

In [9]:
def model_eval(df, features, model):
    df_premium = df[df.subprefeitura.isin(['Sul', 'Barra da Tijuca'])].copy()
    df_comum = df[~df.subprefeitura.isin(['Sul', 'Barra da Tijuca'])].copy()
    df_premium['price_predict'] = model['premium'].predict(df_premium[features])
    df_comum['price_predict'] = model['comum'].predict(df_comum[features])
    return df_comum.append(df_premium, ignore_index=True)

In [10]:
def funcao_custo(df, col, col_pred):
    df['diff'] = df[col] - df[col_pred]
    quadr_err = ((df['diff'] ** 2).sum() / df.shape[0]) ** (0.5)
    magn_err = (df['diff'].abs() > 1000).sum()
    quadr_err_no_out = ((df[df['diff'].abs() < 1000]['diff'] ** 2).sum() / (df.shape[0] - magn_err)) ** (0.5)
    print('hop!')
    return (quadr_err, quadr_err_no_out, magn_err, df[['id', col, col_pred, 'diff']])

In [11]:
def model_fit_naive(df, features):
    return RandomForestClassifier(n_estimators=30).fit(df[features], df['price'])

In [12]:
def model_eval_naive(df, features, model):
    df['price_predict'] = model.predict(df[features])
    return df

In [13]:
modelo_separa = [funcao_custo(
    model_eval(validation_listing,
               features,
               model_fit(training_listing,
                         features)),
    'price',
    'price_predict') for i in range(30)]

hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!


In [14]:
modelo_naive = [funcao_custo(
    model_eval_naive(validation_listing,
               features,
               model_fit_naive(training_listing,
                         features)),
    'price',
    'price_predict') for i in range(30)]

hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!
hop!


In [16]:
from statistics import mean

In [24]:
mean([i[0] for i in modelo_separa])

2274.5070654980905

In [25]:
mean([i[0] for i in modelo_naive])

2063.8346180421604

In [30]:
mean([i[1] for i in modelo_separa])

264.18958437677395

In [31]:
mean([i[1] for i in modelo_naive])

262.73391298394927

In [22]:
mean([i[2] for i in modelo_separa])

87

In [23]:
mean([i[2] for i in modelo_naive])

86