In [1]:
import random
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from datetime import datetime as dt

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)
np.set_printoptions(suppress=True)

In [2]:
df_sf_2017 = pd.read_json('df_sf_2017')

## Scenario A - Guess that every listing is not popular

### Accuracy = 80%

## Senario B Baseline -  randomly guessing given that we know that 20% of the data is popular

In [244]:
def randomly_guessing(n_simulations = 10):
    
    count = 0 #
    #keep track of 
    accuracy_baseline = np.zeros(shape=(1,9*n_simulations)) #we have 9 test sets for 2017. Multiply by 9 for each simulation.  
    recall_baseline = np.zeros(shape=(1,9*n_simulations))
    precision_baseline = np.zeros(shape=(1,9*n_simulations))
    f1_baseline = np.zeros(shape=(1,9*n_simulations))
    
    for i in range(n_simulations):
        my_list = [True for x in range(2)] +  [False for x in range(8)]
        start_month = 1
        end_month = 4
        while end_month <13:
            y_test = df_sf_2017[df_sf_2017['month'] == end_month]['popular']
            y_pred = pd.Series(random.choice(my_list) for x in range(y_test.size))
            accuracy_baseline[0][count] = accuracy_score(y_test, y_pred)
            recall_baseline[0][count] = recall_score(y_test, y_pred)
            precision_baseline[0][count] = precision_score(y_test, y_pred)
            f1_baseline[0][count] = f1_score(y_test, y_pred)
            count+=1
            start_month += 1
            end_month += 1
            
    return accuracy_baseline, recall_baseline, precision_baseline, f1_baseline


In [235]:
accuracy_baseline, recall_baseline, precision_baseline, f1_baseline = randomly_guessing(n_simulations = 1000)
print(accuracy_baseline.mean())
print(recall_baseline.mean()) 
print(precision_baseline.mean()) 
print(f1_baseline.mean())

0.6738568003192421
0.19992382244534446
0.2098987145097955
0.20396154681655437


## Scenario C Baseline - everything that was popular before will be popular next month

In [69]:
baseline_score = np.zeros(shape=(4,8))
model_num = 0
end_month = 4
while end_month <12:
    print('round')
    month_1 = df_sf_2017[df_sf_2017['month']==end_month][['host_id', 'id', 'popular']]
    month_2 = df_sf_2017[df_sf_2017['month']==(end_month+1)][['host_id', 'id', 'popular']]
    new_df = pd.merge(month_1, month_2,  how='outer', left_on=['host_id','id'], right_on = ['host_id','id'])
    new_df.popular_x.fillna(value=False, inplace=True)
    new_df.popular_y.fillna(value=False, inplace=True)
    TP = sum((new_df['popular_x']==True) & (new_df['popular_y']==True) )
    FP = sum((new_df['popular_x']==True) & (new_df['popular_y']==False) )
    FN = sum((new_df['popular_x']==False) & (new_df['popular_y']==True) )
    TN = sum((new_df['popular_x']==False) & (new_df['popular_y']==False) )
    baseline_score[0][model_num] = (TP + TN) / (TP+TN+FP+FN) #accuracy
    baseline_score[1][model_num] = TP/(TP+FN) #recall
    baseline_score[2][model_num] = TP/(TP+FP) #precision
    baseline_score[3][model_num] = 2 * (precision * recall) / (precision + recall) #F1
    end_month+=1
    model_num+=1
print(baseline_score)

round
round
round
round
round
round
round
round
[[0.9608849  0.96217545 0.96382866 0.95824678 0.9580182  0.9485442
  0.96311855 0.96487046]
 [0.87884741 0.86542056 0.88295937 0.85087226 0.88636364 0.86368653
  0.89997237 0.93271632]
 [0.88115561 0.90962672 0.90716511 0.91691935 0.89983118 0.86751663
  0.93270332 0.92107936]
 [0.88       0.88       0.88       0.88       0.88       0.88
  0.88       0.88      ]]


In [70]:
print(baseline_score[0].mean()) #accuracy
print(baseline_score[1].mean()) #recal
print(baseline_score[2].mean()) #precision
print(baseline_score[3].mean()) #f1


0.9599608992257598
0.8826048074968106
0.904499658833364
0.88
