## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load CSV Dataset

### Test Data

#### Import Train CSV

In [2]:
train_data = pd.read_csv('datasets/house_price_dataset/train.csv')
train_data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


#### Drop Categorical Columns

In [3]:
train_data.drop(['size_units', 'lot_size_units'], axis=1, inplace=True)

In [4]:
train_data.tail()

Unnamed: 0,beds,baths,size,lot_size,zip_code,price
2011,3,2.0,1370.0,0.5,98112,910000.0
2012,1,1.0,889.0,,98121,550000.0
2013,4,2.0,2140.0,6250.0,98199,1150000.0
2014,2,2.0,795.0,,98103,590000.0
2015,3,2.0,1710.0,4267.0,98133,659000.0


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   beds      2016 non-null   int64  
 1   baths     2016 non-null   float64
 2   size      2016 non-null   float64
 3   lot_size  1669 non-null   float64
 4   zip_code  2016 non-null   int64  
 5   price     2016 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 94.6 KB


In [6]:
train_data.describe()

Unnamed: 0,beds,baths,size,lot_size,zip_code,price
count,2016.0,2016.0,2016.0,1669.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,3871.059694,98123.638889,963625.2
std,1.255092,1.002023,920.132591,2719.402066,22.650819,944095.4
min,1.0,0.5,250.0,0.23,98101.0,159000.0
25%,2.0,1.5,1068.75,1252.0,98108.0,601750.0
50%,3.0,2.0,1560.0,4000.0,98117.0,800000.0
75%,4.0,2.5,2222.5,6000.0,98126.0,1105250.0
max,15.0,9.0,11010.0,9998.0,98199.0,25000000.0


In [7]:
train_data.isnull().sum()

beds          0
baths         0
size          0
lot_size    347
zip_code      0
price         0
dtype: int64

#### Fill NaN values

In [8]:
# train_data['lot_size'].fillna(np.random.uniform(1000, 9999, 1)[0], inplace=True)
train_data['lot_size'].fillna(0, inplace=True)

In [9]:
train_data.isnull().sum()

NameError: name 'test_data' is not defined

In [None]:
train_data.columns

: 

#### EDA(Exploratory Data Analysis)

In [None]:
sns.pairplot(train_data)

: 

In [None]:
sns.displot(train_data['price'])

: 

In [None]:
sns.heatmap(train_data.corr())

: 

In [None]:
train_data

: 

#### Split testing data

In [None]:
y_train = train_data['price']
y_train.head()

: 

In [None]:
X_train = train_data.iloc[:, :-1]
X_train.head()

: 

### Test Data

#### Import CSV

In [None]:
test_data = pd.read_csv('datasets/house_price_dataset/test.csv')
test_data.head()

: 

#### Drop Categorical Columns

In [None]:
test_data.drop(['size_units', 'lot_size_units'], axis=1, inplace=True)

: 

In [None]:
test_data.head()

: 

In [None]:
test_data.tail()

: 

In [None]:
test_data.info()

: 

In [None]:
test_data.describe()

: 

In [None]:
test_data.isnull().sum()

: 

#### Fill NaN values

In [None]:
test_data['lot_size'].fillna(0, inplace=True)

: 

#### EDA(Exploratory Data Analysis)

In [None]:
sns.pairplot(test_data)

: 

In [None]:
sns.displot(test_data['price'])

: 

In [None]:
sns.heatmap(test_data.corr())

: 

#### Split testing data

In [None]:
y_test = test_data['price']
y_test.head()

: 

In [None]:
X_test = test_data.iloc[:, :-1]
X_test.head()

: 

### Split Data in traning & testing

In [None]:
# from sklearn.model_selection import train_test_split

: 

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

: 

In [None]:
X_train.shape

: 

In [None]:
y_train.shape

: 

In [None]:
X_test.shape

: 

In [None]:
y_test.shape

: 

In [None]:
from sklearn.linear_model import LinearRegression

: 

In [None]:
lm = LinearRegression()

: 

In [None]:
lm.fit(X_train, y_train)

: 

## Predictions

In [None]:
predictions = lm.predict(X_test)

: 

In [None]:
predictions.shape

: 

In [None]:
plt.scatter(y_test, predictions)

: 

In [None]:
sns.displot((y_test-predictions), bins=50)

: 

## Model R2 Score

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

: 

In [None]:
score=r2_score(y_test, predictions)
print('r2 socre is: ', score)
print('mean_sqrd_error is: ', mean_squared_error(y_test, predictions))
print('root_mean_squared error of is:', np.sqrt(mean_squared_error(y_test, predictions)))

: 

## Visualize Predictions

In [None]:
df = pd.DataFrame({'Actual': y_test, 'Predictions': predictions})
df

: 

: 