In [90]:
import os 
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_regression, SelectFromModel, mutual_info_regression
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from category_encoders import TargetEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from datetime import datetime

In [2]:
os.chdir('..')

In [3]:
df = pd.read_csv('ENG_DATA/CLEANED/12-8_Cleaned_df.csv', index_col = [0])

# Fix Missing Data

In [4]:
#fix a bug 
for l in ['charter', 'private', 'public', 'gsRating', 'enrollment']: 
    df[l].fillna(df[l].mean(), inplace=True)

# Add Dates

In [5]:
df["on_market_date"] = df["on_market_date"].apply(lambda x: datetime.strptime(x, "%m/%d/%y"))
df["sale_date"] = df["sale_date"].apply(lambda x: datetime.strptime(x, "%m/%d/%y"))

In [6]:
df["on_market_month_year"] = df["on_market_date"].apply(lambda x: x.strftime("%m/%Y"))
df["sale_month_year"] = df["sale_date"].apply(lambda x: x.strftime("%m/%Y"))

# Functions to Test

In [99]:
rf = RandomForestRegressor(random_state=0)
lr = LinearRegression(normalize=True)
adb = AdaBoostRegressor(random_state=0, n_estimators = 50, loss='linear', learning_rate = 0.05)
svr = LinearSVR()
mm_scaler = MinMaxScaler()
std_scaler = StandardScaler()

In [11]:
def get_score(model, x, y):
    # Split dataset into train and validation subsets:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    return score

In [12]:
x = df.drop(columns = ['sale_price', 'orig_list_price', 'city', 'full_address'])
y = df.sale_price
x_num = x.select_dtypes(include = 'number')
x_cat = x.select_dtypes(exclude = 'number')

# Feature Highlighting & Baseline

In [13]:
# X_train.info()
x_cat.info()
# x_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23720 entries, 0 to 23719
Data columns (total 14 columns):
street_name             23720 non-null object
on_market_date          23720 non-null datetime64[ns]
sale_date               23720 non-null datetime64[ns]
zoning                  23720 non-null object
drive_side              23720 non-null object
parking                 23720 non-null object
park_leased             23720 non-null object
shopping                23720 non-null object
transportation          23720 non-null object
type                    23720 non-null object
views                   23720 non-null object
neighborhood            23720 non-null object
on_market_month_year    23720 non-null object
sale_month_year         23720 non-null object
dtypes: datetime64[ns](2), object(12)
memory usage: 2.7+ MB


In [14]:
x_cat.nunique()

street_name             1962
on_market_date          3396
sale_date               2655
zoning                    17
drive_side                16
parking                  152
park_leased                3
shopping                  14
transportation            16
type                     259
views                   2451
neighborhood             104
on_market_month_year     133
sale_month_year          120
dtype: int64

In [15]:
x_cat.head(5)

Unnamed: 0,street_name,on_market_date,sale_date,zoning,drive_side,parking,park_leased,shopping,transportation,type,views,neighborhood,on_market_month_year,sale_month_year
0,El Camino Del Mar,2013-03-14,2013-03-22,RH2,"PVDW,PVSW","ATCH,GARG",0,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",Lincoln Park / Ft. Miley,03/2013,03/2013
1,La Playa St,2017-05-18,2017-08-17,RM1,0,"ATCH,GARG,ATDR,INAC",ONST,2BLK,1BLK,0,0,Outer Sunset,05/2017,08/2017
2,48th Ave,2010-06-27,2010-08-20,RH1,"PVDW,PVSW",GARG,0,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",Sutro Heights,06/2010,08/2010
3,48th Ave,2012-06-07,2012-07-13,OTHR,PVDW,"ATCH,GARG,ATDR,INAC",0,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",Sutro Heights,06/2012,07/2012
4,48th Ave,2018-05-17,2018-06-21,OTHR,0,"ATCH,GARG,ATDR,INAC",ONST,3BLK,1BLK,2STR,"PNRM,CTYL,WATR,SFRN,OCEN,PARK,GRDN,TWNP",Sutro Heights,05/2018,06/2018


In [16]:
%%time
baseline_rf_score = get_score(rf, x.select_dtypes(include = 'number'), y)
print('Random Forest score without feature engineering:', baseline_rf_score)



Random Forest score without feature engineering: 0.8339756769498468
CPU times: user 4.11 s, sys: 58.4 ms, total: 4.16 s
Wall time: 4.23 s


In [17]:
%%time
baseline_lr_score = get_score(lr, x.select_dtypes(include = 'number'), y)
print('Linear Regression score with target encoding:', baseline_lr_score)

Logistic Regression score with target encoding: 0.657191595831697
CPU times: user 142 ms, sys: 27.7 ms, total: 170 ms
Wall time: 121 ms


# Categorical Transformation 

In [18]:
x_ct = x.copy()

## Drop & OneHot Encode variables with too many features

In [19]:
x_ct.drop(columns = ['on_market_date', 'sale_date'], inplace=True)

In [20]:
x_ct = x_ct.merge(x_ct['views'].str.get_dummies(sep=',').rename(lambda x: 'views_' + x, axis='columns'), left_index=True, right_index=True)

In [21]:
x_ct = x_ct.merge(x_ct['parking'].str.get_dummies(sep=',').rename(lambda x: 'parking_' + x, axis='columns'), left_index=True, right_index=True)

In [22]:
x_ct = x_ct.merge(x_ct['drive_side'].str.get_dummies(sep=',').rename(lambda x: 'drive_side_' + x, axis='columns'), left_index=True, right_index=True)

In [23]:
x_ct = x_ct.merge(x_ct['type'].str.get_dummies(sep=',').rename(lambda x: 'type_' + x, axis='columns'), left_index=True, right_index=True)

In [24]:
x_ct.drop(columns = ['views', 'parking', 'drive_side', 'type'], inplace=True)

In [25]:
x_ct.select_dtypes(exclude = 'number').nunique()

street_name             1962
zoning                    17
park_leased                3
shopping                  14
transportation            16
neighborhood             104
on_market_month_year     133
sale_month_year          120
dtype: int64

In [26]:
%%time
dummy1_rf_score = get_score(rf, x_ct.select_dtypes(include = 'number'), y)
print('Random Forest score with feature engineering/extraction 1:', dummy1_rf_score)

Random Forest score with feature engineering/extraction 1: 0.8192856563176838
CPU times: user 6.01 s, sys: 113 ms, total: 6.13 s
Wall time: 6.37 s


In [27]:
%%time
dummy1_lr_score = get_score(lr, x_ct.select_dtypes(include = 'number'), y)
print('Linear Regression score with target encoding:', dummy1_lr_score)

Linear Regression score with target encoding: 0.6834762377958432
CPU times: user 229 ms, sys: 38.8 ms, total: 268 ms
Wall time: 194 ms


# Target Encoding

In [31]:
ce_target = TargetEncoder(cols = ['street_name', 'neighborhood'])
x_ct2 = ce_target.fit_transform(x_ct, y)

In [32]:
%%time
dummy2_rf_score = get_score(rf, x_ct2.select_dtypes(include = 'number'), y)
print('Random Forest score with feature engineering/extraction 1:', dummy2_rf_score)

Random Forest score with feature engineering/extraction 1: 0.8324825851409321
CPU times: user 6.43 s, sys: 72.4 ms, total: 6.5 s
Wall time: 6.41 s


In [33]:
%%time
dummy2_lr_score = get_score(lr, x_ct2.select_dtypes(include = 'number'), y)
print('Linear Regression score with target encoding:', dummy2_lr_score)

Linear Regression score with target encoding: 0.7363665351112405
CPU times: user 263 ms, sys: 37.7 ms, total: 300 ms
Wall time: 215 ms


# Ordinal Encoding

In [35]:
oe_target = OrdinalEncoder(cols = ['on_market_month_year', 'sale_month_year'])
x_ct3 = oe_target.fit_transform(x_ct2, y)

In [36]:
%%time
dummy3_rf_score = get_score(rf, x_ct3.select_dtypes(include = 'number'), y)
print('Random Forest score with feature engineering/extraction 3:', dummy3_rf_score)

Random Forest score with feature engineering/extraction 1: 0.824236186093312
CPU times: user 6.82 s, sys: 125 ms, total: 6.95 s
Wall time: 7.12 s


In [37]:
%%time
dummy3_lr_score = get_score(lr, x_ct3.select_dtypes(include = 'number'), y)
print('Linear Regression score with ordinal encoding:', dummy3_lr_score)

Linear Regression score with target encoding: 0.7365291373517966
CPU times: user 263 ms, sys: 39.9 ms, total: 303 ms
Wall time: 210 ms


# Final One Hot Encoding

In [46]:
oh_target = OneHotEncoder(cols = ['zoning', 'park_leased', 'shopping', 'transportation'])
x_ct4 = oh_target.fit_transform(x_ct3, y)

In [47]:
%%time
dummy4_rf_score = get_score(rf, x_ct4.select_dtypes(include = 'number'), y)
print('Random Forest score with feature engineering/extraction 4:', dummy4_rf_score)

Random Forest score with feature engineering/extraction 4: 0.8326139023193551
CPU times: user 7.46 s, sys: 101 ms, total: 7.56 s
Wall time: 7.7 s


In [48]:
%%time
dummy4_lr_score = get_score(lr, x_ct4.select_dtypes(include = 'number'), y)
print('Linear Regression score with all encoding:', dummy4_lr_score)

Linear Regression score with all encoding: 0.7377151477542421
CPU times: user 383 ms, sys: 49.7 ms, total: 433 ms
Wall time: 292 ms


# Scaler - problem now with the r2 linear regression WHY NEGATIVE?

In [72]:
x_sc = x_ct4.copy()

In [78]:
num_columns = list(x.select_dtypes(include = 'number'))

In [79]:
x_sc[num_columns] = mm_scaler.fit_transform(x_sc[num_columns])
y[num_columns] = mm_scaler.fit_transform(x_sc[num_columns])

In [80]:
%%time
mmscaler_rf_score = get_score(rf, x_sc, y)
print('Random Forest score with MinMaxScaler:', mmscaler_rf_score)

Random Forest score with MinMaxScaler: 0.8320096500962244
CPU times: user 7.12 s, sys: 39.3 ms, total: 7.16 s
Wall time: 7.19 s


In [100]:
%%time
mmscaler_lr_score = get_score(lr, x_sc, y)
print('Linear Regression score with MinMaxScaler:', mmscaler_lr_score)

Linear Regression score with MinMaxScaler: -5.84193355948447e+24
CPU times: user 343 ms, sys: 70.9 ms, total: 414 ms
Wall time: 249 ms


# Compare AdaBoost and SVR Performance

In [58]:
%%time
svr_score = get_score(svr, x_ct4.select_dtypes(include = 'number'), y)
print('Support Vector Regressor score:', svr_score)

Support Vector Regressor score: 0.4637721435634795
CPU times: user 11 s, sys: 159 ms, total: 11.2 s
Wall time: 11.6 s




In [60]:
X_tr, X_t, y_tr, y_t = train_test_split(x_ct4, y, test_size=0.2, random_state=0)

param_dist = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
 }

pre_gs_inst = RandomizedSearchCV(AdaBoostRegressor(),
 param_distributions = param_dist,
 cv=3,
 n_iter = 10,
 n_jobs=-1)

pre_gs_inst.fit(X_tr, y_tr)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=AdaBoostRegressor(base_estimator=None,
                                               learning_rate=1.0, loss='linear',
                                               n_estimators=50,
                                               random_state=None),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.05, 0.1, 0.3,
                                                          1],
                                        'loss': ['linear', 'square',
                                                 'exponential'],
                                        'n_estimators': [50, 100]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [61]:
pre_gs_inst.best_params_

{'n_estimators': 50, 'loss': 'linear', 'learning_rate': 0.05}

In [63]:
%%time
adb_score = get_score(adb, x_ct4.select_dtypes(include = 'number'), y)
print('AdaBoost Regressor score:', adb_score)

AdaBoost Regressor score: 0.7257814554773282
CPU times: user 15.1 s, sys: 515 ms, total: 15.6 s
Wall time: 16.1 s


# Recursive Feature Selection

In [93]:
rfe = RFE(rf, step=3)
rfe.fit(x_ct4, y)

RFE(estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                    max_depth=None, max_features='auto',
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None, min_samples_leaf=1,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    n_estimators=10, n_jobs=None,
                                    oob_score=False, random_state=0, verbose=0,
                                    warm_start=False),
    n_features_to_select=None, step=3, verbose=0)

In [120]:
x_rfe = x_ct4[x_ct4.columns[rfe.support_]]

In [121]:
%%time
rfe_rf_score = get_score(rf, x_rfe, y)
print('Random Forest score with RFE:', rfe_rf_score)



Random Forest score with RFE: 0.8301375187319064
CPU times: user 4.97 s, sys: 51.8 ms, total: 5.02 s
Wall time: 5.05 s


# Export

In [124]:
def export_train_test(df_, name_):
    train, test = train_test_split(df_ , test_size = 0.2)
    month_day = datetime.now().strftime('%m-%d')
    train.to_csv('ENG_DATA/SELECTED/{}_{}_train.csv'.format(month_day, name_))
    test.to_csv('ENG_DATA/SELECTED/{}_{}_test.csv'.format(month_day, name_))

In [88]:
df_out = x_ct4.merge(y, left_index = True, right_index = True)

In [89]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23720 entries, 0 to 23719
Columns: 176 entries, longitude to sale_price
dtypes: float64(52), int64(124)
memory usage: 32.7 MB


In [125]:
export_train_test(df_out, "cat_transformed")

In [138]:
export_train_test(x_rfe.merge(y, left_index = True, right_index = True), "rfe-87")