In [None]:
import itertools
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols

%matplotlib inline
sns.set_style('ticks')

In [None]:
# Load the training data and the hold-out feature file; the first CSV column becomes the index.
df = pd.read_csv('kc_house_data_train.csv', index_col=0)
df_test_features = pd.read_csv('kc_house_data_test_features.csv', index_col=0)

In [None]:
# Separate, unmodified read of the training CSV; used later for the baseline model on raw columns.
df_untouched = pd.read_csv('kc_house_data_train.csv', index_col=0)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# (rows, columns) of the training frame as loaded.
df.shape

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Column names available in the raw training frame.
df.columns

In [None]:
# Distribution of the raw sale price.
sns.set_style('ticks')
df.hist(column='price', bins=20)

In [None]:
# Model the log of price rather than price itself; the histogram below shows the
# distribution of the transformed values.
# np.log is applied to the whole column at once instead of row by row through a lambda.
df['log_price'] = np.log(df['price'])
df.hist(column='log_price', bins=20)

In [None]:
# log_price distribution for each grade level.
df.boxplot(column='log_price', by='grade')

In [None]:
# log_price distribution for each condition level.
df.boxplot(column='log_price', by='condition')

In [None]:
# log_price distribution for each view rating.
df.boxplot(column='log_price', by='view')

In [None]:
# log_price distribution split by the waterfront flag.
df.boxplot(column='log_price', by='waterfront')

In [None]:
# df['view_grade'] = df['view'] + df['grade']
# df.boxplot('log_price', 'view_grade')

In [None]:
# log_price distribution per bedroom count, before any recoding.
df.boxplot(column='log_price', by='bedrooms')

In [None]:
# Number of rows for each bedroom count.
df['bedrooms'].value_counts()

In [None]:
# Recode the extreme bedroom counts: 0 becomes 1, anything above 7 becomes 7,
# and every other value is kept as it is.
no_bedrooms = df['bedrooms'] == 0
over_seven_bedrooms = df['bedrooms'] > 7

conditions = [no_bedrooms, over_seven_bedrooms]
choices = [1, 7]

df['bedrooms'] = np.select(conditions, choices, default=df['bedrooms'])

In [None]:
# Row counts per bedroom value after the recode above.
df['bedrooms'].value_counts()

In [None]:
# log_price distribution per bedroom count.
df.boxplot(column='log_price', by='bedrooms')

In [None]:
# log_price distribution per bathroom count, before binning.
df.boxplot(column='log_price', by='bathrooms')

In [None]:
# Distinct bathroom values present in the data.
df['bathrooms'].unique()

In [None]:
# Bin bathroom counts. np.select takes the FIRST condition that holds, so the bounds
# must stay in ascending order: <=1 -> 1, <=1.75 -> 1.5, <=2 -> 2, ... <=4.75 -> 4.5,
# and anything larger falls through to the default of 5.
bath_upper_bounds = [1, 1.75, 2, 2.75, 3, 3.75, 4, 4.75]
conditions = [df['bathrooms'] <= bound for bound in bath_upper_bounds]
choices = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]

df['bathrooms'] = np.select(conditions, choices, default=5)

In [None]:
# Row counts per bathroom bin after the binning above.
df['bathrooms'].value_counts()

In [None]:
# log_price distribution per bathroom value.
df.boxplot(column='log_price', by='bathrooms')

In [None]:
# Log-transform living area, mirroring the log_price transform, and plot its distribution.
# np.log is applied to the whole column at once instead of row by row through a lambda.
df['log_sqft_living'] = np.log(df['sqft_living'])
df.hist(column='log_sqft_living')

In [None]:
# Scatter plus fitted line of log_price against log_sqft_living.
# x and y are passed by keyword: current seaborn releases no longer accept them positionally.
sns.regplot(data=df, x='log_sqft_living', y='log_price')

In [None]:
# log_price distribution per number of floors.
df.boxplot(column='log_price', by='floors')

In [None]:
# Columns present before one-hot encoding.
df.columns

In [None]:
# One-hot encode the categorical-like columns. drop_first=True omits the first level of
# each column, so that level becomes the implicit baseline.
columns = [
    'bedrooms', 'bathrooms', 'floors',
    'view', 'condition', 'grade',
    'zipcode',
]
df = pd.get_dummies(df, columns=columns, prefix=columns, drop_first=True)
df.columns

In [None]:
# Replace the decimal point in the floors dummy names with a suffix
# (presumably so the names are usable inside the formula strings built later - confirm).
floor_dict = dict([
    ('floors_1.5', 'floors_1_half'),
    ('floors_2.0', 'floors_2'),
    ('floors_2.5', 'floors_2_half'),
    ('floors_3.0', 'floors_3'),
    ('floors_3.5', 'floors_3_half'),
])

df = df.rename(floor_dict, axis='columns')

In [None]:
# Same renaming for the bathrooms dummies. Keys that are not present as columns
# are simply ignored by rename.
bath_dict = dict([
    ('bathrooms_1.0', 'bathrooms_1'),
    ('bathrooms_1.5', 'bathrooms_1_half'),
    ('bathrooms_2.0', 'bathrooms_2'),
    ('bathrooms_2.5', 'bathrooms_2_half'),
    ('bathrooms_3.0', 'bathrooms_3'),
    ('bathrooms_3.5', 'bathrooms_3_half'),
    ('bathrooms_4.0', 'bathrooms_4'),
    ('bathrooms_4.5', 'bathrooms_4_half'),
    ('bathrooms_5.0', 'bathrooms_5'),
])

df = df.rename(bath_dict, axis='columns')

In [None]:
# Frame dimensions after encoding and renaming.
df.shape

In [None]:
# Columns of the unmodified copy, for picking the baseline predictors.
df_untouched.columns

In [None]:
# Right-hand side of the baseline formula: the raw predictors joined with '+'.
untouched_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15',
]
untouched_cats = '+'.join(untouched_cols)

In [None]:
# Baseline OLS: raw price regressed on the raw, untransformed predictors.
f_untouched = f'price~{untouched_cats}'
model_untouched = ols(f_untouched, data=df_untouched).fit()
model_untouched.summary()

In [None]:
# First 20 column names of the encoded frame.
df.columns[:20]

# FEATURE CREATION

In [None]:
# Modelling frame: everything in df except identifiers, the raw target, and the raw
# numeric columns listed below.
dropped_columns = [
    'id', 'date', 'price',
    'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]
features = df.drop(columns=dropped_columns)

In [None]:
# Columns kept for modelling (log_price is still included at this point).
features.columns

In [None]:
# Predictor names (all feature columns except the target), and the same names joined
# with '+' for use in a formula string.
features_list = features.columns.tolist()
features_list.remove('log_price')
f_list = '+'.join(features_list)

In [None]:
# OLS of log_price on every engineered predictor.
f = f'log_price~{f_list}'
model = ols(f, data=features).fit()
model.summary()

In [None]:
def ext_values(df, extreme_cols, n_std=4):
    """Return a copy of ``df`` with high outliers in ``extreme_cols`` capped.

    For each listed column the cap is ``mean + n_std * std`` of that column.
    Values above the cap are replaced by the cap; all other values (including
    NaN) are left untouched. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    extreme_cols : iterable of column labels
        Columns to cap.
    n_std : float, default 4
        Number of standard deviations above the mean at which to cap.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns capped.
    """
    new_df = df.copy()
    for col in extreme_cols:
        column = new_df[col]
        cap = column.mean() + n_std * column.std()
        # Compare against the cap itself. The previous test (x > 4*std) ignored the
        # mean, so columns whose values sit far from zero were overwritten entirely.
        new_df[col] = column.mask(column > cap, cap)
    return new_df

In [None]:
# NOTE(review): X_train is first assigned in the train-test split cell further down,
# so on a fresh kernel (Restart & Run All) this cell raises NameError. It only works
# after that later cell has been executed.
X_train.describe()

In [None]:
# Cap high outliers in every predictor column and inspect the result.
# The predictor columns are derived from `features` directly (all columns except the
# target), so this cell does not depend on `features_only`, which is only created in
# the train-test split cell further down.
features_test = ext_values(features, features.columns.drop('log_price'))
features_test.describe()

# TRAIN-TEST SPLIT

In [None]:
# Separate the target (log_price) from the predictors and hold out 20% of the rows.
# The fixed random_state makes the split reproducible.
target = features['log_price']
features_only = features.drop(columns=['log_price'])
X_train, X_test, y_train, y_test = train_test_split(
    features_only,
    target,
    test_size=0.2,
    random_state=22,
)

In [None]:
# Compare polynomial fits (degree 2-4) of log_price on log_sqft_living alone.
# The scatter and the fitted curves share a single x variable, so the polynomial
# expansion is restricted to that column. Fitting on every column cannot be drawn as
# a curve against one feature, and the previous plot call referenced an undefined `X`.
poly_feature = 'log_sqft_living'
colors = ['yellow', 'lightgreen', 'blue']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(features_only[poly_feature], target, color='green', s=50, marker='.', label='plot points')

# A line plot needs x in ascending order, so the curves are evaluated on the sorted
# values of the feature over the full data.
x_sorted = features_only[[poly_feature]].sort_values(by=poly_feature)

for color, degree in zip(colors, [2, 3, 4]):
    poly = PolynomialFeatures(degree)

    # Fit the expansion and the regression on the training split only
    X_poly_train = poly.fit_transform(X_train[[poly_feature]])
    reg_poly = LinearRegression().fit(X_poly_train, y_train)

    # Score on the held-out split using the already-fitted expansion
    X_poly_test = poly.transform(X_test[[poly_feature]])
    y_pred = reg_poly.predict(X_poly_test)
    print("degree %d" % degree, r2_score(y_test, y_pred))

    # Fitted curve over the full data
    y_poly = reg_poly.predict(poly.transform(x_sorted))
    ax.plot(x_sorted[poly_feature], y_poly, color=color, linewidth=2, label='degree %d' % degree)

ax.set(xlabel=poly_feature, ylabel='log_price', title='Polynomial fits of log_price on log_sqft_living')
ax.legend(loc='lower left');

In [None]:
# Degree-3 polynomial expansion of the split features.
# - The expansion is fitted on the training split and only applied to the test split,
#   instead of being re-fitted on X_test.
# - The results get their own names. Overwriting X_train / X_test with the expanded
#   arrays breaks the scaling cell that follows, which labels the columns with
#   features_list and therefore needs the original column count.
# NOTE(review): a cubic expansion of every dummy column is very wide - confirm it fits
# in memory before relying on X_train_poly / X_test_poly.
poly = PolynomialFeatures(3)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [None]:
# Standardise the predictors. The scaler is fitted on the training split only and then
# applied to both splits.
# The rebuilt frames reuse the row index of the matching target so that X_* and y_*
# stay index-aligned; without index= they would get a fresh 0..n-1 index.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(data=scaler.transform(X_train), columns=features_list, index=y_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=features_list, index=y_test.index)

## "Standard" model

In [None]:
# Ordinary least squares on all scaled predictors.
lm = LinearRegression()
lm = lm.fit(X_train, y_train)

# The model is trained on log(price). Predictions are exponentiated back to price
# units, so the targets must be exponentiated too before computing RMSE; otherwise
# the error mixes price-scale predictions with log-scale targets.
y_train_pred = np.exp(lm.predict(X_train))
train_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_train), y_train_pred))

print('Training Root Mean Squared Error:' , train_rmse)

y_pred = np.exp(lm.predict(X_test))
test_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_test), y_pred))

print('Testing Root Mean Squared Error:' , test_rmse)

print('Training: ', train_rmse, "vs. Testing: ", test_rmse)

## F-Test

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression

# Keep the 10 predictors with the highest univariate F-statistic against the target.
selector = SelectKBest(score_func=f_regression, k=10)
selector.fit(X_train, y_train)

# Boolean mask over X_train's columns: True for the retained predictors.
kbest_mask = selector.get_support()
selected_columns = X_train.columns[kbest_mask]
removed_columns = X_train.columns[~kbest_mask]

In [None]:
# Predictors retained by SelectKBest.
selected_columns

In [None]:
# Refit ordinary least squares on the k-best predictors only and compare with the full model.
lm_kbest = LinearRegression()
lm_kbest = lm_kbest.fit(X_train[selected_columns], y_train)

# Targets are log(price). Both predictions and targets are converted back to price
# units so the RMSE compares like with like.
y_train_kbest = np.exp(lm_kbest.predict(X_train[selected_columns]))
trainK_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_train), y_train_kbest))

print('Training Root Mean Squared Error:' , trainK_rmse)

y_kbest = np.exp(lm_kbest.predict(X_test[selected_columns]))
testK_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_test), y_kbest))

print('Testing Root Mean Squared Error:' , testK_rmse)

print('Original: ', test_rmse, "vs. KBest: ", testK_rmse)

In [None]:
# statsmodels OLS restricted to the k-best predictors, for the full coefficient summary.
f_list_selected = '+'.join(selected_columns)
f_selected = f'log_price~{f_list_selected}'
model_selected = ols(f_selected, data=features).fit()
model_selected.summary()

## Recursive Feature Elimination

In [None]:
from sklearn import linear_model
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 10-fold cross-validation: one feature is removed
# per step and candidate subsets are scored by negative mean squared error.
rfe = linear_model.LinearRegression()
selector = RFECV(
    estimator=rfe,
    step=1,
    cv=10,
    scoring='neg_mean_squared_error',
)
selector.fit(X_train, y_train)

# Boolean mask over X_train's columns: True for the retained predictors.
rfe_mask = selector.support_
selected_rfe = X_train.columns[rfe_mask]
removed_rfe = X_train.columns[~rfe_mask]

In [None]:
# Predictors retained by RFECV.
selected_rfe

In [None]:
# Refit ordinary least squares on the RFECV-selected predictors and compare with the
# earlier models.
lm_rfe = LinearRegression()
lm_rfe = lm_rfe.fit(X_train[selected_rfe], y_train)

# Targets are log(price). Both predictions and targets are converted back to price
# units so the RMSE compares like with like.
y_rfe = np.exp(lm_rfe.predict(X_train[selected_rfe]))
trainRFE_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_train), y_rfe))

print('Training Root Mean Squared Error:' , trainRFE_rmse)

y_pred_rfe = np.exp(lm_rfe.predict(X_test[selected_rfe]))
testRFE_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_test), y_pred_rfe))

print('Testing Root Mean Squared Error:' , testRFE_rmse)

print('Original: ', test_rmse, "vs. KBest: ", testK_rmse, "vs. RFE: ", testRFE_rmse)

## LASSO

In [None]:
# Lasso regression on the scaled predictors.
# `normalize` is no longer passed: the argument has been removed from scikit-learn,
# and the inputs are already standardised by the scaling cell.
lasso = Lasso(alpha=.01)
lasso.fit(X_train, y_train)

y_train_pred_lasso = np.exp(lasso.predict(X_train))
y_pred_lasso = np.exp(lasso.predict(X_test))

# Both errors are RMSE in price units (targets are exponentiated to match the
# predictions). They are stored under Lasso-specific names so the baseline
# train_rmse / test_rmse from the standard model are not overwritten.
trainLasso_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_train), y_train_pred_lasso))
testLasso_rmse = np.sqrt(metrics.mean_squared_error(np.exp(y_test), y_pred_lasso))
print('Training Error: '+ str(trainLasso_rmse) )
print('Testing Error: '+ str(testLasso_rmse) )

# One-row frame of coefficients (row label 0), columns ordered by coefficient value.
lasso_coef01 = pd.DataFrame(data=lasso.coef_).T
lasso_coef01.columns = X_train.columns
lasso_coef01 = lasso_coef01.T.sort_values(by=0).T
lasso_coef01.plot(kind='bar', title='Model Coefficients', legend=False, figsize=(16,8))

In [None]:
# Coefficients as a single column (label 0), keeping only the rows Lasso did not shrink to zero.
coefficient = lasso_coef01.transpose()
coefficient.loc[coefficient[0] != 0]

In [None]:
sns.set(style="white")

# Correlation matrix of the first 44 scaled predictor columns
corr = X_train.iloc[:, :44].corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# Uses the builtin bool: the np.bool alias has been removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Explicit figure/axes. Named `fig` rather than `f` so the formula string `f` defined
# earlier in the notebook is not overwritten.
fig, ax = plt.subplots(figsize=(22, 18))

# Diverging colormap centred on zero correlation
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Residual plot for the training split, in the model's own (log-price) space.
# y_train_pred was exponentiated to price units when it was computed, so it is
# converted back with np.log to match y_train. x and y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.residplot(x=y_train, y=np.log(y_train_pred), lowess=True, color="g")

In [None]:
# Residual plot for the test split, in the model's own (log-price) space.
# y_pred was exponentiated to price units when it was computed, so it is converted
# back with np.log to match y_test. x and y are passed by keyword because current
# seaborn releases no longer accept them positionally.
sns.residplot(x=y_test, y=np.log(y_pred), lowess=True, color="g")

In [None]:
# NOTE(review): 'log_living_bed_ratio' appears in this notebook only inside
# commented-out code, so this lookup raises KeyError unless the column is created
# somewhere not shown here - confirm or remove this cell.
features['log_sqft_living'].corr(features['log_living_bed_ratio'])

In [None]:
# NOTE(review): 'view_grade' is only created in a commented-out cell earlier in this
# notebook, so this lookup raises KeyError unless the column is created somewhere not
# shown here - confirm or remove this cell.
features['log_sqft_living'].corr(features['view_grade'])

# FITTING TO FULL DATA

In [None]:
# Fit a scaler on the selected columns of the full data set and display the scaled values.
# NOTE(review): the scaled array is only displayed, not stored; the final model in the
# next cell is fitted on the unscaled columns - confirm that is intended.
final_scaler = StandardScaler()
final_columns = features[selected_columns]
final_scaler.fit(final_columns)
final_scaler.transform(final_columns)

In [None]:
# Final model: ordinary least squares on the selected columns, using every row.
final_model = LinearRegression()
final_model.fit(X=features[selected_columns], y=target)

## POLYNOMIAL REGRESSION (EXPLORATORY)

In [None]:
def create_polynomial_regression_model(degree):
    """Fit a polynomial regression of the given degree and report its performance.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    The polynomial expansion is fitted on the training split and then applied
    unchanged to the test split.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    dict
        ``rmse_train``, ``r2_train``, ``rmse_test`` and ``r2_test``. The same
        figures are also printed.
    """
    poly_features = PolynomialFeatures(degree=degree)

    # Expand the training features and fit the linear model on the expansion
    X_train_poly = poly_features.fit_transform(X_train)
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    y_train_predicted = poly_model.predict(X_train_poly)

    # Apply the already-fitted expansion to the test split; it is not re-fitted
    y_test_predict = poly_model.predict(poly_features.transform(X_test))

    # The split targets are named y_train / y_test; the capitalised Y_train / Y_test
    # used before do not exist in this notebook.
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    r2_train = r2_score(y_train, y_train_predicted)

    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2_test = r2_score(y_test, y_test_predict)

    print("The model performance for the training set")
    print("-------------------------------------------")
    print("RMSE of training set is {}".format(rmse_train))
    print("R2 score of training set is {}".format(r2_train))

    print("\n")

    print("The model performance for the test set")
    print("-------------------------------------------")
    print("RMSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))

    return {
        'rmse_train': rmse_train,
        'r2_train': r2_train,
        'rmse_test': rmse_test,
        'r2_test': r2_test,
    }

In [None]:
# Evaluate the degree-3 polynomial model; the function is written to print train/test
# RMSE and R2. x_poly is reassigned in the next cell.
x_poly = create_polynomial_regression_model(3)

In [None]:
# Cubic fit of the target on a single predictor, with a plot of the fitted curve.
# NOTE(review): the earlier version selected X_train[['']] (an empty column name) and
# plotted undefined x / y. 'log_sqft_living' is assumed to be the intended feature
# because it is the one plotted against the target elsewhere in this notebook - confirm.
poly_feature_name = 'log_sqft_living'
x_single = X_train[[poly_feature_name]]

polynomial_features = PolynomialFeatures(degree=3)
x_poly = polynomial_features.fit_transform(x_single)

# Separate name so the statsmodels result stored in `model` is left untouched
poly_single_model = LinearRegression()
poly_single_model.fit(x_poly, y_train)
y_poly_pred = poly_single_model.predict(x_poly)

rmse = np.sqrt(mean_squared_error(y_train, y_poly_pred))
r2 = r2_score(y_train, y_poly_pred)
print(rmse)
print(r2)

# A line plot needs x in ascending order. Sorting by index positions reorders the
# predictions alongside x without overwriting X_train.
x_values = x_single[poly_feature_name].to_numpy()
order = np.argsort(x_values)

fig, ax = plt.subplots()
ax.scatter(x_values, y_train, s=10)
ax.plot(x_values[order], y_poly_pred[order], color='m')
ax.set(xlabel=poly_feature_name, ylabel='log_price', title='Degree-3 polynomial fit (training split)')
plt.show()

In [None]:
# Feature column names at positions 0 through 19.
features.columns[0:20]

In [None]:
# Feature column names at positions 20 through 45.
features.columns[20:46]