In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import glob
from datetime import datetime as dt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

## for all csv's in the SF folder, concatenate into one dataframe

In [7]:
files = glob.glob('SF/*.gz')
df_sf = pd.concat([pd.read_csv(fp, parse_dates = ['calendar_last_scraped','last_scraped', 'first_review', 'last_review']) for fp in files], ignore_index=True)

  if self.run_code(code, result):


## Drop the rows where the number of reviews is greater than 0 and the reviews per month is zero

In [8]:
df_sf = df_sf[(df_sf['reviews_per_month'].notnull()) | ((df_sf['number_of_reviews']== 0) & (df_sf['reviews_per_month'].isnull()))]

In [9]:
len(df_sf)

190942

In [10]:
len(df_sf[(df_sf['availability_365']!=0) & (df_sf['reviews_per_month']!=0)])

139729

## fill in the reviews per month where 0 with 0

In [11]:
df_sf.reviews_per_month.fillna(value=0, inplace=True)

## year and month column

In [12]:
df_sf['year'] = df_sf.calendar_last_scraped.apply(lambda x: x.year)
df_sf['month'] = df_sf.calendar_last_scraped.apply(lambda x: x.month)

## Fill nulls

In [13]:
df_sf.cancellation_policy.fillna(value='none', inplace=True)
df_sf.cleaning_fee.fillna(value='$0.00', inplace=True)

df_sf.host_is_superhost.fillna(value='f', inplace=True)

df_sf['host_is_superhost'].replace('f', False, inplace=True)
df_sf['host_is_superhost'].replace('t', True, inplace=True)


df_sf.loc[df_sf.host_about.notnull(), 'host_about_filled'] = True
df_sf.loc[df_sf.host_about.isnull(), 'host_about_filled'] = False
df_sf.loc[df_sf.host_picture_url.notnull(), 'host_picture_url_filled'] = True
df_sf.loc[df_sf.host_picture_url.isnull(), 'host_picture_url_filled'] = False

In [14]:
df_sf_2017 = df_sf[df_sf['year']==2017]

In [207]:
df_sf_2017['month'].value_counts()

8     9055
1     9035
10    8933
11    8928
7     8854
6     8799
2     8776
5     8732
3     8719
4     8700
9     8527
12    6898
Name: month, dtype: int64

## 2.54 is the cutoff reviews per month to be in the top 20% of listings for all of SF 

In [9]:
df_sf.loc[df_sf.reviews_per_month >= 2.54, 'popular'] = True
df_sf.loc[df_sf.reviews_per_month < 2.54, 'popular'] = False

In [15]:
df_sf_2017.loc[df_sf_2017.reviews_per_month >= 2.57, 'popular'] = True
df_sf_2017.loc[df_sf_2017.reviews_per_month < 2.57, 'popular'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [231]:
df_sf_2017.popular.value_counts(1)

False    0.799973
True     0.200027
Name: popular, dtype: float64

## Random Forest

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [17]:
# df[df.columns[1:]].replace('[\$,]', '', regex=True).astype(float)
df_sf_2017['price'].replace('[\$,]', '', regex=True, inplace=True)
df_sf_2017['extra_people'].replace('[\$,]', '', regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


## Function to predict 2017

In [18]:
scores = np.zeros(shape=(4,9))
model_num = 0
start_month = 1
end_month = 4
columns_to_keep = ['accommodates','bed_type', 'extra_people', 'price','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type', ]


In [22]:
while end_month <13:
    X_train, y_train, X_test, y_test = split_data(start_month, end_month)
    le = LabelEncoder()
    X_train = transform_data(X_train)
    X_test = transform_data(X_test)
    scores = predict_data(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    start_month += 1
    end_month += 1
    model_num += 1
    print(scores)

1 4 0
[[0.81988506 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.2336272  0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.51456311 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.32135123 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
[[0.81988506 0.89212093 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.2336272  0.47006931 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.51456311 0.88075561 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.32135123 0.61298274 0.         0.         0.         0.
  0.         0.         0.        ]]
3 6 2
[[0.81988506 0.89212093 0.88964655 0.         0.         0.
  0.         0.         0.        ]
 [0.2336272  0.47006931 0.48648649 0.         0.         0.
  0.         0.         0.        ]
 [0.51456311 0.88075

In [19]:
def split_data(start_month, end_month):
    df_sf_temp = df_sf_2017.copy()
    X_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)][columns_to_keep]
    y_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)]['popular']

    X_test = df_sf_temp[df_sf_temp['month'] == end_month][columns_to_keep]
    y_test = df_sf_temp[df_sf_temp['month'] == end_month]['popular']
    
    return X_train, y_train, X_test, y_test
    

In [20]:
def transform_data(X):
    X['extra_people'].replace('[\=$,]', '', regex=True, inplace=True)
    X['price'].replace('[\=$,]', '', regex=True, inplace=True)
    
    X['bed_type'] = le.fit_transform(X['bed_type'])
    X['cancellation_policy'] = le.fit_transform(X['cancellation_policy'])
    X['room_type'] = le.fit_transform(X['room_type'])
 
    return X

In [21]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123456)

def predict_data(X_train, y_train, X_test, y_test):
    rf = RandomForestClassifier(n_estimators = 1000, random_state=0, class_weight = {0:.95, 1:.05})
    rf.fit(X_train, y_train)
    predicted = rf.predict(X_test)
    scores[0][model_num] = accuracy_score(y_test, predicted)
    scores[1][model_num] = recall_score(y_test, rf.predict(X_test))
    scores[2][model_num] = precision_score(y_test, rf.predict(X_test))
    scores[3][model_num] = f1_score(y_test, rf.predict(X_test))
    return scores