In [3]:
import math

import numpy as np
import pandas as pd
from sklearn import linear_model
from word2number import w2n

In [4]:
# With multiple variables, the linear regression equation becomes:
#     price = m_1 * area + m_2 * bedrooms + m_3 * age + b

# Read the house-price table and discard the row labelled 5 in one chained
# expression, so re-running this cell always rebuilds `df` from the file.
df = pd.read_csv("datasets/homeprices3.csv").drop(index=5)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [55]:
# Handling the NaN value: rather than deleting the row, replace the missing
# 'bedrooms' entry with the column median.
# The median is floored so the imputed bedroom count is a whole number.
median_bedrooms = math.floor(df['bedrooms'].median())

# Assign the filled column back by name; Series.median() skips NaN by default,
# so the missing entry does not influence the value it is replaced with.
df['bedrooms'] = df['bedrooms'].fillna(median_bedrooms)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,3.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [4]:
# Fit a linear model that explains price from area, bedrooms and age.
house_features = df[['area', 'bedrooms', 'age']]
house_prices = df['price']

reg = linear_model.LinearRegression()
# fit() is the cell's last expression, so the fitted estimator is displayed.
reg.fit(house_features, house_prices)

LinearRegression()

In [5]:
# Report the fitted parameters, rounded to one decimal place.
# reg.coef_ holds one slope per feature in the order the model was fitted
# (area, bedrooms, age); reg.intercept_ is the constant term b.
# The bare `reg.coef_` / `reg.intercept_` statements were dropped: only the last
# expression of a cell is displayed, so they produced no output.
# NOTE(review): the message text contains typos ("mdoel", "give by"); it is left
# unchanged here so the recorded cell output still matches.
print(f"""The coefficients of the model are {round(reg.coef_[0], 1)}. {round(reg.coef_[1], 1)} and {round(reg.coef_[2], 1)}. 
The intercept of the mdoel is give by {round(reg.intercept_, 1)}.""")

The coefficients of the model are 137.3. -26025.0 and -6825.0. 
The intercept of the mdoel is give by 383725.0.


In [6]:
# Given these home prices, find out the price of a home that has:
#   1) 3000 sq ft area, 3 bedrooms, 40 years old.
#   2) 2500 sq ft, 4 bedrooms, 5 years old.

# The model was fitted on a DataFrame with named columns, so predict on a
# DataFrame with the same column names in the same order. Passing a bare list
# relies on positional order only and makes scikit-learn warn that
# "X does not have valid feature names".
homes_to_price = pd.DataFrame(
    [[3000, 3, 40], [2500, 4, 5]],
    columns=['area', 'bedrooms', 'age'],
)

preds = reg.predict(homes_to_price)
print(f"The prediction for 1) is {round(preds[0], 1)} and 2) is {round(preds[1], 1)}.")

The prediction for 1) is 444400.0 and 2) is 588625.0.


## Exercise: Hiring data
Make predictions on the salary based on multiple variables such as experience, test_score (out of 10) and interview_score (out of 10).

In [83]:
# Load the hiring exercise data and display it as read, before any cleaning,
# so the missing values and spelled-out experience entries are visible.
hiring_df = pd.read_csv(filepath_or_buffer='datasets/hiring.csv')
hiring_df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [84]:
# Cleaning data and handling missing values.

# Missing 'experience' entries become the word 'zero' (the column is still text
# at this point and is converted to numbers in a later cell).
# Assign the result back instead of calling fillna(..., inplace=True) on
# hiring_df[col]: that chained in-place call acts on a temporary Series, which
# raises a FutureWarning in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
hiring_df['experience'] = hiring_df['experience'].fillna('zero')

# Missing test scores are filled with the column MEAN rounded down to a whole
# score (7 for this data).
# NOTE(review): the original comment said "median"; the code has always used
# the mean, and the recorded outputs below depend on it, so the code is kept.
test_score_fill = math.floor(hiring_df['test_score(out of 10)'].mean())
hiring_df['test_score(out of 10)'] = hiring_df['test_score(out of 10)'].fillna(test_score_fill)

hiring_df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.0,7,72000
7,eleven,7.0,8,80000


In [85]:
# Turn the spelled-out 'experience' values ('five', 'eleven', ...) into numbers:
#   - Series.apply() calls the given function once per element of the column.
#   - w2n.word_to_num() converts a number written in words to the actual number.

experience_words = hiring_df['experience']
hiring_df['experience'] = experience_words.apply(w2n.word_to_num)

In [88]:
# Display the frame again to check that 'experience' now holds numbers instead of words.
hiring_df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


In [96]:
# Fit a linear model that explains salary from experience and the two scores.
# Note: this rebinds `reg`, replacing the house-price model fitted earlier.
hiring_features = hiring_df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']]
hiring_salary = hiring_df['salary($)']

reg = linear_model.LinearRegression()
# fit() is the cell's last expression, so the fitted estimator is displayed.
reg.fit(hiring_features, hiring_salary)

LinearRegression()

In [107]:
# Predict the salary for the following candidates:
#   1) 2 years experience, 9 test score, 6 interview score.
#   2) 12 years experience, 10 test score, 10 interview score.

# Build the inputs as a DataFrame whose columns match the ones used in fit(),
# in the same order. A bare list depends on positional order only and makes
# scikit-learn warn that "X does not have valid feature names".
candidates = pd.DataFrame(
    [[2, 9, 6], [12, 10, 10]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
)

preds = reg.predict(candidates)
print(f"The prediction for 1) is {round(preds[0], 1)} and 2) is {round(preds[1], 1)}.")

The prediction for 1) is 53713.9 and 2) is 93747.8.
