In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd

# Preprocess the Data that we gathered

In [5]:
#dataset --> https://www.kaggle.com/datasets/yasserh/housing-prices-dataset
df = pd.read_csv("../DataSet/Housing.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [7]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [8]:
data = pd.get_dummies(df, drop_first=True) # nominal data
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   price                            545 non-null    int64
 1   area                             545 non-null    int64
 2   bedrooms                         545 non-null    int64
 3   bathrooms                        545 non-null    int64
 4   stories                          545 non-null    int64
 5   parking                          545 non-null    int64
 6   mainroad_yes                     545 non-null    bool 
 7   guestroom_yes                    545 non-null    bool 
 8   basement_yes                     545 non-null    bool 
 9   hotwaterheating_yes              545 non-null    bool 
 10  airconditioning_yes              545 non-null    bool 
 11  prefarea_yes                     545 non-null    bool 
 12  furnishingstatus_semi-furnished  545 non-null    b

In [10]:
x = data.iloc[:, 1:] # independed variable 

In [11]:
x.shape # in sklearn mulilinear model accept it as 2 dim only

(545, 13)

In [30]:
y = df[["price"]] # depended variable

In [32]:
y.head()

Unnamed: 0,price
0,13300000
1,12250000
2,12250000
3,12215000
4,11410000


In [31]:
y.shape # in sklearn linear model acceps target vairalb as 1 dim

(545, 1)

# Normalization

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [15]:
xdata = sc.fit_transform(x)
ydata = sc.fit_transform(y.values.reshape(-1, 1)).ravel()

# Split the data into test and train at the ratio of 8:2

In [17]:
x_train, x_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.2, random_state=0) # herea random state help to give same result of split when ever run the code

In [18]:
x_train.size, y_train.size, x_test.size, y_test.size

(5668, 436, 1417, 109)

# Train a Model

In [19]:
model = LinearRegression()

In [20]:
model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [21]:
model.coef_ #weight

array([ 0.29364549,  0.04817909,  0.2678242 ,  0.20104853,  0.09986458,
        0.08220378,  0.0824566 ,  0.08291308,  0.13555482,  0.21671241,
        0.15723722, -0.02813754, -0.12341684])

In [22]:
model.intercept_ #bias

np.float64(0.008894171293725868)

# Evalute Model Accuracy

In [23]:
from sklearn.metrics import r2_score

In [24]:
y_pred = model.predict(x_test)

In [25]:
r2_score(y_test, y_pred) 

0.6611214250980104

# Export Model

In [26]:
import joblib

In [27]:
joblib.dump(model, "multiLinearModel.joblib")

['multiLinearModel.joblib']

# Load model and predict

In [28]:
loadedmodel = joblib.load("multiLinearModel.joblib")

In [29]:
loadedmodel.predict([[7420, 4, 2, 3, 2, True, False, False, False, True, True, False, False]])

array([2180.84583843])