In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('ticks')

import pickle

from sklearn import metrics
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [3]:
df = pd.read_csv('kc_house_data_test_features.csv', index_col=0)

In [4]:
df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [5]:
def find_extremas(df):
    extrema_list = []
    for column in list(df.columns):
        if df[column]['max'] > ( df[column]['mean']+5*df[column]['std']):
            extrema_list.append(column)
    return extrema_list

descriptor = df.describe()
extreme_cols = find_extremas(descriptor)
extreme_cols

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'waterfront',
 'view',
 'sqft_above',
 'sqft_basement',
 'yr_renovated',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [6]:
def extreme(df,columns):
    for column in columns:
        mean = df[column].mean()
        std = df[column].std()
        conditions = [
            df[column] > mean + 4*std
        ]
        df[column] = np.where(df[column] > mean + 5*std, mean + 5*std, df[column])

In [7]:
extreme(df, ['sqft_living',
 'sqft_lot', 'sqft_above',
 'sqft_basement', 'sqft_living15',
 'sqft_lot15'])

# Transformations

In [8]:
df['waterfront_view'] = df['waterfront'] * df['view']
df['waterfront_cond'] = df['waterfront'] * df['condition']
df['waterfront_grade'] = df['waterfront'] * df['grade']
df['view_grade'] = df['view'] + df['grade']
conditions = [df['bedrooms'] == 0, df['bedrooms'] > 7]
choices = [1, 7]

df['bedrooms'] = np.select(conditions, choices, df['bedrooms'])
conditions = [df['bathrooms'] <= 1, df['bathrooms'] <= 1.75,
              df['bathrooms'] <= 2, df['bathrooms'] <= 2.75,
              df['bathrooms'] <= 3, df['bathrooms'] <= 3.75,
              df['bathrooms'] <= 4, df['bathrooms'] <= 4.75]
choices = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]

df['bathrooms'] = np.select(conditions, choices, 5)

df['bed_bath_ratio'] = df['bedrooms'] / df['bathrooms']

df['living_bed_ratio'] = df['sqft_living'] / df['bedrooms']
df['living_bed_ratio'] = np.where(df['living_bed_ratio'] == np.inf,
                                  df['living_bed_ratio'].min(), df['living_bed_ratio'])

df['log_living_bed_ratio'] = df['living_bed_ratio'].apply(lambda x: np.log(x))

df['log_sqft_lot'] = df['sqft_lot'].apply(lambda x: np.log(x))

df['sqft_yard'] = df['sqft_lot'] - (df['sqft_living'] / df['floors'])
df['sqft_yard'] = np.where(df['sqft_yard'] < 0, df['sqft_yard'].median(), df['sqft_yard'])
df['sqft_yard'].min(), df['sqft_yard'].max()

df['log_sqft_yard'] = df['sqft_yard'].apply(lambda x: np.log(x))

df['log_sqft_living'] = df['sqft_living'].apply(lambda x: np.log(x))

df['yr_built_reno'] = np.where(df['yr_renovated'] == 0, df['yr_built'], df['yr_renovated'])

df['since_built_reno'] = df['yr_built_reno'].apply(lambda x: 2015 - x)

df['log_sqft_living15'] = df['sqft_living15'].apply(lambda x: np.log(x))

df['log_sqft_basement'] = df['sqft_basement'].apply(lambda x: np.log(x))

df['has_basement'] = np.where(df['sqft_basement'] == 0, 0, 1)

columns = ['bedrooms', 'bathrooms', 'floors', 'view', 'condition', 'grade', 'zipcode','view_grade']
df = pd.get_dummies(data=df, columns=columns, prefix=columns, drop_first=True)
df.columns

floor_dict = {
            'floors_1.5': 'floors_1_half',
            'floors_2.0': 'floors_2',
            'floors_2.5': 'floors_2_half',
            'floors_3.0': 'floors_3',
            'floors_3.5': 'floors_3_half'
}

df = df.rename(columns=floor_dict)

bath_dict = {
            'bathrooms_1.0': 'bathrooms_1',
            'bathrooms_1.5': 'bathrooms_1_half',
            'bathrooms_2.0': 'bathrooms_2',
            'bathrooms_2.5': 'bathrooms_2_half',
            'bathrooms_3.0': 'bathrooms_3',
            'bathrooms_3.5': 'bathrooms_3_half',
            'bathrooms_4.0': 'bathrooms_4',
            'bathrooms_4.5': 'bathrooms_4_half',
            'bathrooms_5.0': 'bathrooms_5'
}

df = df.rename(columns=bath_dict)

In [9]:
#dropping same columns
y_predict = df.drop(['id', 'date','sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built',
                    'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'living_bed_ratio',
                    'bed_bath_ratio', 'log_sqft_lot', 'sqft_yard', 'yr_built_reno', 'log_sqft_living15',
                    'log_sqft_basement',
                    'waterfront_view', 'waterfront_cond', 'waterfront_grade'
#                     'log_living_bed_ratio',
#                     'view_grade',
#                     'since_built_reno'                    
                   ],axis = 1)

In [10]:
# need a view_grade_3 and view_grade_3 and since it's missing set them all to 0
y_predict['view_grade_3'] = 0
y_predict['grade_3'] = 0

# Getting pickled scalers and models

In [11]:
with open('final_scaler.pickle', 'rb') as file:
    final_scaler = pickle.load(file)
with open('standard_regression_model.pickle', 'rb') as file:
    standard_regression = pickle.load(file)
with open('lasso_full.pickle', 'rb') as file:
    lasso_full = pickle.load(file)
with open('kbest_regression.pickle', 'rb') as file:
    kbest_regression = pickle.load(file)

In [26]:
scaler = StandardScaler()
scaled_holdout2 = scaler.fit
scaled_holdout2

<bound method StandardScaler.fit of StandardScaler(copy=True, with_mean=True, with_std=True)>

In [12]:
scaled_holdout = final_scaler.transform(y_predict)
scaled_holdout

array([[ 3.33901662e-17,  6.34124075e+00,  9.13021433e+00, ...,
         5.13694864e-18, -4.10955891e-18, -2.56847432e-18],
       [ 3.33901662e-17,  6.34124075e+00,  9.13021433e+00, ...,
         5.13694864e-18, -4.10955891e-18, -2.56847432e-18],
       [ 3.33901662e-17,  6.19440539e+00,  6.95081477e+00, ...,
         5.13694864e-18, -4.10955891e-18, -2.56847432e-18],
       ...,
       [ 3.33901662e-17,  6.23441073e+00,  6.73340189e+00, ...,
         5.13694864e-18, -4.10955891e-18, -2.56847432e-18],
       [ 3.33901662e-17,  6.27914662e+00,  7.37023064e+00, ...,
         5.13694864e-18, -4.10955891e-18, -2.56847432e-18],
       [ 3.33901662e-17,  6.23441073e+00,  6.33859408e+00, ...,
         5.13694864e-18, -4.10955891e-18, -2.56847432e-18]])

In [30]:
standard_predictions = np.exp(standard_regression.predict(scaled_holdout))
standard_predictions

array([570187.55671397, 570187.55671397, 289916.74025115, ...,
       330061.95252021, 320546.70376475, 322795.70903198])

In [13]:
with open('k_selector.pickle', 'rb') as file:
    k_selector = pickle.load(file)

In [14]:
kbest_columns = y_predict.columns[k_selector.get_support()]
kbest_columns

Index(['log_living_bed_ratio', 'log_sqft_living', 'bathrooms_3_half',
       'grade_7', 'grade_8', 'grade_10', 'grade_11', 'view_grade_8',
       'view_grade_9'],
      dtype='object')

In [15]:
#couldnt get kbest to work because of a reshaping error?
kbest_predictions = np.exp(kbest_regression.predict(scaled_holdout[kbest_columns]))
kbest_predictions

  


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
pd.DataFrame(kbest_predictions).to_csv('predictions_RM_JS.csv')

In [36]:
lasso_holdout = np.exp(lasso_full.predict(scaled_holdout))
lasso_holdout

array([519134.01628006, 519134.01628006, 392483.29725221, ...,
       316934.78897734, 412453.74535265, 317177.70786823])

In [None]:
#predicting using lasso and saving to csv
pd.DataFrame(lasso_holdout).to_csv('predictions_RM_JS_LASSO.csv')