In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# importing the necessary libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Raise pandas' display limits so long and wide frames are shown without truncation
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [121]:
# Reading the training dataset (train.csv of the house-prices competition input) into a DataFrame.
# og_data is kept as the untouched original; later cells work on copies of it.
og_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
og_data.head(20)

In [122]:
#Checking the shape
og_data.shape

In [123]:
#Checking the number of null values
og_data.isnull().sum()

In [124]:
# Checking the overall share of null cells: total nulls / (rows * columns).
# Note: the result is a fraction between 0 and 1, not a percentage (multiply by 100 for percent).
(og_data.isnull().sum().sum())/(og_data.shape[0]*og_data.shape[1])

In [138]:
# Keep only the columns that contain no null values at all.
# dropna returns a new frame, so og_data itself is left unchanged.
data = og_data.dropna(axis=1, how='any')

In [139]:
#Checking the number of null values
data.isnull().sum()

In [140]:
#Shape after dropping
data.shape

In [149]:
data.info()

In [157]:
# First attempt: discard every object-typed column so only numeric features remain
object_columns = [name for name in data.columns if data[name].dtype == object]
data = data.drop(columns=object_columns)

In [158]:
data.shape

In [159]:
data.head(20)

In [160]:
# Target is SalePrice; features are every remaining column except the Id and the target itself
y = data.loc[:, "SalePrice"]
X = data.drop(columns=["Id", "SalePrice"])

In [161]:
X.head()

In [162]:
y.head()

In [163]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Splitting the dataset into train and test subsets.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=121)

In [164]:
# Fitting a LinearRegression model that predicts the target (y_train) from the training features.
# fit() is the last expression of the cell, so its return value is what the cell displays.
linear = LinearRegression()
linear.fit(X_train, y_train)

In [165]:
# Predicting the target for the held-out X_test rows.
# y_test is aliased as `actual` so the metric cell can compare actual vs. pred.
pred = linear.predict(X_test)
actual = y_test

In [166]:
# Calculating the r2 score of the linear regression on the held-out split
linear.score(X_test, y_test)

In [167]:
# Reporting MAE, MSE and RMSE of the linear regression on the held-out split,
# followed by the fitted coefficients
print("Metrics to evaluate Regression model are :")
mae = mean_absolute_error(actual, pred)
mse = mean_squared_error(actual, pred)
for metric_label, metric_value in (
    ("Linear Regression MAE:", mae),
    ("Linear Regression MSE:", mse),
    ("Linear Regression RMSE:", np.sqrt(mse)),
    ("LR Coef:", linear.coef_),
):
    print(metric_label, metric_value)

In [168]:
og_data.shape

In [169]:
og_data.isnull().sum()

In [170]:
# Now we want to preprocess a little and then pass this for prediction for copying the original dataset
preprocess_data = og_data.copy()

In [171]:
#Calculating what is the 40% of a column
preprocess_data.shape[0]*0.4

In [172]:
# Remove every column in which more than 40% of the rows are null, printing each removed name
null_cutoff = preprocess_data.shape[0]*0.4
sparse_columns = [
    name for name in preprocess_data.columns
    if preprocess_data[name].isnull().sum() > null_cutoff
]
for name in sparse_columns:
    print(name)
preprocess_data = preprocess_data.drop(columns=sparse_columns)

In [173]:
# Number of distinct values in each object-typed column
object_dtype_columns = [
    name for name in preprocess_data.columns
    if preprocess_data[name].dtype == "object"
]
for name in object_dtype_columns:
    print(name, preprocess_data[name].nunique())

In [174]:
# Number of distinct values in each non-object column
non_object_columns = [
    name for name in preprocess_data.columns
    if preprocess_data[name].dtype != "object"
]
for name in non_object_columns:
    print(name, preprocess_data[name].nunique())

In [175]:
preprocess_data.head()

In [176]:
# Partition the columns by dtype: object columns are treated as discrete,
# every other column as continuous
discrete_columns = [
    name for name in preprocess_data.columns
    if preprocess_data[name].dtype == "object"
]
continuous_columns = [
    name for name in preprocess_data.columns
    if preprocess_data[name].dtype != "object"
]
print("Discrete columns : ", discrete_columns)
print("Continuous columns : ", continuous_columns)

In [177]:
# Checking for outliers in the non-object columns by plotting one boxplot per column.
# The series is passed by keyword (x=...): the first positional parameter of sns.boxplot
# is not the same across seaborn releases, so a bare positional call is ambiguous and can
# warn or change the plot orientation depending on the installed version.
for x in continuous_columns:
    plt.figure(figsize=(16,8))
    sns.boxplot(x=preprocess_data[x])
    plt.title(x)  # name the column so each figure can be read on its own
    plt.show()

In [178]:
# Inspecting the distribution of each non-object column by plotting a 50-bin histogram
for x in continuous_columns:
    plt.figure(figsize=(16,8))
    plt.hist(preprocess_data[x], bins=50, label=x)  # label is only visible if a legend is drawn; none is drawn here
    plt.title(x)
    plt.show()

In [31]:
# One figure per categorical column: SalePrice distribution for each label of that column
for cat_col in discrete_columns:
    plt.figure(figsize=(16,8))
    sns.boxplot(data=preprocess_data, x=cat_col, y='SalePrice')
    plt.title(cat_col)
    plt.show()

In [179]:
# Helper functions for removing the outliers in the dataset
def outlier_thresholds(dataframe, col_name, q1 = 0.25, q3 = 0.75):
    """Return the ``(low_limit, up_limit)`` outlier fences for one column.

    The fences are built from two quantiles of ``dataframe[col_name]``:
    ``low_limit = Q_low - 1.5 * range`` and ``up_limit = Q_high + 1.5 * range``,
    where ``range = Q_high - Q_low``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : label
        Name of the numeric column to analyse.
    q1, q3 : float
        Lower and upper quantile levels (defaults: the quartiles 0.25 / 0.75).

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def replace_with_thresholds(dataframe, variable, q1 = 0.25, q3 = 0.75):
    """Cap the values of ``dataframe[variable]`` at its outlier fences.

    Values below the lower fence are raised to it and values above the upper
    fence are lowered to it (fences come from ``outlier_thresholds``). Missing
    values are left as they are.

    The column is replaced on ``dataframe`` itself and the same object is
    returned, so both ``replace_with_thresholds(df, c)`` and
    ``df = replace_with_thresholds(df, c)`` work.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place.
    variable : label
        Name of the numeric column to cap.
    q1, q3 : float
        Quantile levels forwarded to ``outlier_thresholds``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1 = q1, q3 = q3)
    # clip() builds a new Series with a suitable dtype. Writing a fractional fence
    # into an integer column through .loc is an incompatible-dtype assignment that
    # pandas deprecates (FutureWarning) and later versions refuse.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
    return dataframe
    
# Cap every column listed in continuous_columns at fences built from its 10th/90th percentiles.
# NOTE(review): continuous_columns contains every non-object column of preprocess_data, so an
# identifier column and the SalePrice target are capped too if still present - confirm intended.
for col in continuous_columns:
    preprocess_data = replace_with_thresholds(preprocess_data, col, q1 = 0.1, q3 = 0.9)

In [180]:
# Checking for null values
preprocess_data.isnull().sum()

In [181]:
# Imputing the null values: mode for the discrete (object) columns, median for the others.
# The filled column is assigned back to the frame. The previous form,
# `preprocess_data[col].fillna(..., inplace=True)`, fills a temporary Series obtained by
# indexing; pandas warns about that chained assignment and, with copy-on-write enabled,
# the frame is left unchanged (the nulls would survive and break model fitting).
for col in preprocess_data.columns:
    null_count = preprocess_data[col].isnull().sum()
    if null_count == 0:
        continue
    print(null_count)
    if col in discrete_columns:
        val = preprocess_data[col].mode()[0]  # first (most frequent) label
        print(type(val))
    else:
        val = preprocess_data[col].median()
        print(col, val)
    preprocess_data[col] = preprocess_data[col].fillna(val)

In [182]:
# Checking for null values again
preprocess_data.isnull().sum()

In [183]:
#Transforming the discrete columns into numeric by Label Encoding
from sklearn import preprocessing
# A single LabelEncoder is re-fitted for each discrete column in turn; after the loop
# `le` only remembers the mapping of the last column it processed.
le = preprocessing.LabelEncoder()
columns_to_encode = [name for name in preprocess_data.columns if name in discrete_columns]
for column in columns_to_encode:
    encoded_values = le.fit_transform(preprocess_data[column])
    preprocess_data[column] = encoded_values

In [184]:
preprocess_data.head()

In [185]:
preprocess_data.shape

In [186]:
# Understanding the correlation between the features
train_corr = preprocess_data.corr()
plt.figure(figsize=(16,12))
sns.heatmap(train_corr, vmax=0.8)
# Rank the columns by the strength (absolute value) of their correlation with SalePrice,
# strongest first. The printed heading now matches that ordering.
corr_values = train_corr['SalePrice'].abs().sort_values(ascending=False)
print("Absolute correlation of each feature with SalePrice, in descending order")
print(corr_values)

In [187]:
# Separating the dependent and independent columns
# NOTE(review): only SalePrice is removed here, whereas the earlier baseline split also
# dropped "Id"; any identifier column left in preprocess_data stays in X as a feature.
# Confirm this is intended.
y = preprocess_data['SalePrice']
X = preprocess_data.drop(['SalePrice'], axis=1)

In [188]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Splitting the preprocessed dataset into train and test subsets.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [189]:
# Fit a linear regression on the preprocessed training split
linear = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows, plus the true values under the name the metric cell uses
actual = y_test
pred = linear.predict(X_test)

# r2 score of the fitted model on the held-out split (displayed as the cell output)
linear.score(X_test, y_test)

In [190]:
# Reporting MAE, MSE and RMSE of the linear regression fitted on the preprocessed data,
# followed by the fitted coefficients
print("Metrics to evaluate Regression model are :")
mae = mean_absolute_error(actual, pred)
mse = mean_squared_error(actual, pred)
for metric_label, metric_value in (
    ("Linear Regression MAE:", mae),
    ("Linear Regression MSE:", mse),
    ("Linear Regression RMSE:", np.sqrt(mse)),
    ("LR Coef:", linear.coef_),
):
    print(metric_label, metric_value)

In [102]:
# Reading the testing dataset (test.csv of the same competition input) and previewing 20 rows.
# This is the unlabeled set the final predictions are made for, not the X_test hold-out split.
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test_data.head(20)

In [103]:
# Checking for null values 
test_data.isnull().sum()

In [104]:
# Dropping from the test set exactly the columns that are absent from the training frame.
# Deciding this from the test set's own null counts (the previous approach) can remove a
# different set of columns than training did, which leaves the fitted model with missing
# or unexpected features at prediction time.
columns_not_in_training = [col for col in test_data.columns if col not in preprocess_data.columns]
for col in columns_not_in_training:
    print(col)
test_data = test_data.drop(columns=columns_not_in_training)

In [105]:
# Partition the test-set columns by dtype: object columns are treated as discrete,
# every other column as continuous
test_discrete_columns = [
    name for name in test_data.columns
    if test_data[name].dtype == "object"
]
test_continuous_columns = [
    name for name in test_data.columns
    if test_data[name].dtype != "object"
]
print("Discrete columns : ", test_discrete_columns)
print("Continuous columns : ", test_continuous_columns)

In [106]:
# Imputing the null values of the test set: mode for the discrete (object) columns,
# median for the others.
# The filled column is assigned back to the frame. The previous form,
# `test_data[col].fillna(..., inplace=True)`, fills a temporary Series obtained by indexing;
# pandas warns about that chained assignment and, with copy-on-write enabled, the frame is
# left unchanged (the nulls would survive and break prediction).
for col in test_data.columns:
    null_count = test_data[col].isnull().sum()
    if null_count == 0:
        continue
    print(null_count)
    if col in test_discrete_columns:
        val = test_data[col].mode()[0]  # first (most frequent) label
        print(type(val))
    else:
        val = test_data[col].median()
        print(col, val)
    test_data[col] = test_data[col].fillna(val)

In [107]:
# Checking for null values again
test_data.isnull().sum()

In [108]:
# Capping the outliers of the test set with fences computed on the TRAINING data (og_data),
# i.e. with the same 10th/90th-percentile fences the training features were capped with.
# Re-deriving the fences from the test set would preprocess the two sets differently.
# Assumes every non-object test column also exists in og_data - TODO confirm.
for col in test_continuous_columns:
    if col == "Id":
        # Row identifier: it goes into the submission file and must not be altered.
        continue
    low_limit, up_limit = outlier_thresholds(og_data, col, q1 = 0.1, q3 = 0.9)
    # clip() leaves missing values untouched and picks a dtype that can hold the fences.
    test_data[col] = test_data[col].clip(lower=low_limit, upper=up_limit)

In [112]:
test_data.shape

In [113]:
# Transforming the discrete columns into numeric codes using the label set of the TRAINING data.
# Calling fit_transform on the test column re-fits the encoder on the test labels, so a label
# can get a different code than it had when the model was trained (e.g. when a category is
# missing from the test set). Fitting on the non-null training labels reproduces the training
# mapping; transform() then raises ValueError if the test set contains a label never seen in
# training, instead of silently mis-coding it.
for column in test_data.columns:
    if column in discrete_columns:
        le.fit(og_data[column].dropna())
        test_data[column] = le.transform(test_data[column])

In [114]:
# Predicting the target for the competition test set (test_data, not the X_test hold-out).
# Selecting X.columns hands the model exactly the features it was fitted on, in the same
# order; a missing feature fails loudly with a KeyError naming the column.
pred = linear.predict(test_data[X.columns])

In [116]:
# Predicted values for test data
pred

In [117]:
test_data.head()

In [118]:
# Put the test-set Id next to its predicted value; the unnamed prediction Series
# becomes a column labelled 0
id_column = test_data["Id"]
predicted_values = pd.Series(pred)
final_op = pd.concat([id_column, predicted_values], axis=1)

In [119]:
final_op.head()

In [93]:
# Give the prediction column (currently labelled 0) its submission name and preview the result
final_op = final_op.rename(columns={0: "SalePrice"})
final_op.head()

In [94]:
# Write the submission frame to Submissions.csv in the current working directory,
# without the DataFrame index as an extra column
final_op.to_csv("Submissions.csv", index=False)