In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
kaggle_input_root = '/kaggle/input'
for folder, _subfolders, names in os.walk(kaggle_input_root):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
%matplotlib inline

import os
# Show the entries sitting directly under the relative input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [6]:
# Read the house-sales CSV into a DataFrame and display its first row.
data_csv_path = '../input/housedata/data.csv'
data = pd.read_csv(data_csv_path)
data.head(1)

In [8]:
# Summary statistics for the numeric columns, rounded to whole numbers.
data.describe().round(decimals=0)

In [9]:
# Summary statistics of the target column alone, rounded to whole numbers.
data.loc[:, 'price'].describe().round(decimals=0)

In [10]:
# Keep only rows whose price lies in [300000, 700000] (both ends inclusive)
# and whose yr_built is greater than 1930.
in_price_band = data['price'].between(300000, 700000)
built_after_1930 = data['yr_built'] > 1930

# .copy() makes the filtered result an independent frame. Without it, adding a
# column to `data` afterwards (as the next cell does with `month`) operates on
# a slice of the original frame and emits pandas' SettingWithCopyWarning.
data = data[in_price_band & built_after_1930].copy()
data.head(1)

In [13]:
# Parse the `date` column and store the month number of each row in a new
# `month` column (appended as the last column of the frame).
sale_dates = pd.DatetimeIndex(data['date'])
data['month'] = sale_dates.month
data.head(1)

In [15]:
# Remove the country, street and statezip columns from the frame.
cols = ['country', 'street', 'statezip']
data = data.drop(labels=cols, axis='columns')

In [17]:
# Remove the raw `date` column from the frame.
cols = ['date']
data = data.drop(labels=cols, axis='columns')

In [19]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [20]:
# Replace each city value with an integer code assigned by LabelEncoder.
labelencoder_X = LabelEncoder()
data['city'] = labelencoder_X.fit_transform(data['city'])

In [21]:
# Build a transformer that one-hot encodes the column at position -1 and
# passes every other column through unchanged.
# NOTE(review): `ct` is only constructed here; nothing in the visible cells
# fits or applies it. Position -1 is whichever column is last at fit time,
# which may not be the label-encoded `city` column - confirm the intended column.
ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(categories='auto'), [-1]),
    ],
    remainder='passthrough',
)

In [22]:
# Standardise every column of `data` (this includes `price`) and wrap the
# resulting array back into a DataFrame with the same column names.
# Passing index=data.index keeps the row labels of the filtered frame;
# otherwise df_scaled would get a fresh 0..n-1 index and its rows would no
# longer share labels with `data`, from which the target is read later.
df_scaled = pd.DataFrame(
    preprocessing.scale(data),
    columns=data.columns,
    index=data.index,
)
df_scaled.head()

In [23]:
# Scatter each feature against `price` in a single row of panels.
# Two fixes relative to the earlier version of this cell:
#   * no plt.figure(...) call: pairplot creates its own figure, so a figure
#     opened beforehand is only rendered as an extra empty canvas;
#   * the x variables are "every column except price". Slicing with [:-1]
#     removed the last column (`month`, which was appended after loading)
#     and left `price` in, plotting price against itself.
feature_cols = df_scaled.columns.drop('price')
sns.pairplot(df_scaled, y_vars='price', x_vars=feature_cols)

In [26]:
# Pairwise correlations of the columns of `data`, rounded to two decimals,
# drawn as an annotated heatmap on a 14x8 figure.
corr_matrix = data.corr().round(2)
plt.figure(figsize=(14, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

In [27]:
# Target: the unscaled price taken from `data`.
# Features: every standardised column except price.
y = data['price']
X = df_scaled.drop(columns='price')

# Hold out 30% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [29]:
# Fit ordinary least squares on the training split (fit returns the estimator
# itself) and predict prices for the held-out rows.
linmodel = LinearRegression().fit(X_train, y_train)
linpred = linmodel.predict(X_test)

In [30]:
# Actual test prices on the x axis against the linear model's predictions.
plt.scatter(x=y_test, y=linpred)
plt.xlabel(xlabel='Y Test')
plt.ylabel(ylabel='Predicted Y')

In [31]:
from sklearn import metrics

# Error metrics of the linear model on the test split; the MSE is computed
# once and reused for the RMSE.
lin_mae = metrics.mean_absolute_error(y_test, linpred)
lin_mse = metrics.mean_squared_error(y_test, linpred)
print('MAE:', lin_mae)
print('MSE:', lin_mse)
print('RMSE:', np.sqrt(lin_mse))

In [32]:
from sklearn.linear_model import Lasso
# Candidate L1 penalty strengths for the Lasso sweep.
# The negative values (-3, -2, -1) were removed: alpha is a penalty weight and
# must be non-negative, and recent scikit-learn releases reject negative values.
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 1.5, 2, 3, 4, 5, 10, 20, 30, 40]

# Test-set error per alpha, keyed by the alpha value.
temp_mae = {}
temp_mse = {}
temp_rmse = {}
for alpha in alpha_ridge:
    # `normalize=True` is no longer passed: the argument was deprecated in
    # scikit-learn 1.0 and removed in 1.2, where it raises a TypeError.
    # The features come from df_scaled, which is already standardised.
    lasso_reg = Lasso(alpha=alpha)
    lasso_reg.fit(X_train, y_train)
    lasso_pred = lasso_reg.predict(X_test)

    mae = metrics.mean_absolute_error(y_test, lasso_pred)
    mse = metrics.mean_squared_error(y_test, lasso_pred)
    rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it

    temp_mae[alpha] = mae
    temp_mse[alpha] = mse
    temp_rmse[alpha] = rmse

In [33]:
# Print the per-alpha MAE, MSE and RMSE dictionaries, one per line.
for scores in (temp_mae, temp_mse, temp_rmse):
    print(scores)

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees. random_state pins the bootstrap sampling and
# feature selection so the predictions and metrics are reproducible across
# runs; 101 matches the seed used for the train/test split.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=101)
forest_fit = forest_reg.fit(X_train, y_train)  # fit returns the same estimator
forest_pred = forest_fit.predict(X_test)

In [35]:
# Actual test prices on the x axis against the random forest's predictions.
plt.scatter(x=y_test, y=forest_pred)
plt.xlabel(xlabel='Y Test')
plt.ylabel(ylabel='Predicted Y by Random Forest Regression')

In [36]:
# Actual test prices on the x axis against the linear model's predictions,
# labelled for comparison with the random-forest plot.
plt.scatter(x=y_test, y=linpred)
plt.xlabel(xlabel='Y Test')
plt.ylabel(ylabel='Predicted Y by Linear Regression')

In [37]:
def _report(title, predictions):
    """Print `title` followed by MAE, MSE and RMSE of `predictions` against `y_test`."""
    mse = metrics.mean_squared_error(y_test, predictions)
    print(title)
    print('MAE:', metrics.mean_absolute_error(y_test, predictions))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))


# Side-by-side test-set errors for the two models, separated by a blank line.
_report('Linear Regression metrics', linpred)
print('')
_report('Random Forest Regression metrics', forest_pred)