In [11]:
# Import libraries
import pandas as pd
import numpy as np
import datetime

from sklearn import linear_model

In [2]:
# Load the dataset from data.csv into a DataFrame and preview its first rows
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [3]:
# Keep only the target (price) and the three predictor columns, cast to integers
selected_columns = ['price', 'bedrooms', 'sqft_living', 'yr_built']
df = df[selected_columns].astype(int)
df.head()

Unnamed: 0,price,bedrooms,sqft_living,yr_built
0,313000,3,1340,1955
1,2384000,5,3650,1921
2,342000,3,1930,1966
3,420000,3,2000,1963
4,550000,4,1940,1976


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,price,bedrooms,sqft_living,yr_built
count,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2139.346957,1970.786304
std,563834.7,0.908848,963.206916,29.731848
min,0.0,0.0,370.0,1900.0
25%,322875.0,3.0,1460.0,1951.0
50%,460943.0,3.0,1980.0,1976.0
75%,654962.5,4.0,2620.0,1997.0
max,26590000.0,9.0,13540.0,2014.0


In [6]:
# Count the null entries in each column
df.isna().sum()

price          0
bedrooms       0
sqft_living    0
yr_built       0
dtype: int64

In [7]:
# Count the entries equal to zero in each column
df.eq(0).sum()

price          49
bedrooms        2
sqft_living     0
yr_built        0
dtype: int64

In [8]:
# Remove every row whose recorded price is 0
zero_price_rows = df.loc[df['price'] == 0].index
df.drop(index = zero_price_rows, inplace = True)

In [9]:
# For bedrooms, a value of 0 is treated as missing: replace it with the median
# computed over the valid (non-zero) rows only, so the placeholder zeros do not
# take part in the statistic that is used to fill them in
valid_bedrooms = df.loc[df['bedrooms'] != 0, 'bedrooms']
df['bedrooms'] = df['bedrooms'].replace(0, valid_bedrooms.median())

In [13]:
# Derive the age of each house from its construction year (yr_built).
# yr_renovated is not among the columns kept earlier, so renovations are not
# taken into account here.
# NOTE: the age is measured against the current calendar year, so the values
# (and the fitted intercept) shift when the notebook is re-run in a later year.
now = datetime.datetime.now()

def age(df):
    """Return the age in years relative to the current year.

    `df` may be a single row or a whole DataFrame; it only needs a
    'yr_built' entry/column. For a DataFrame the result is a Series
    aligned on the frame's index.
    """
    return now.year - df['yr_built']

# Vectorized: one column-wise subtraction instead of a Python call per row
df['age'] = age(df)

# Drop yr_built, which is now represented by age
df.drop(columns = ['yr_built'], inplace = True)

df.head()

Unnamed: 0,price,bedrooms,sqft_living,age
0,313000,3,1340,66
1,2384000,5,3650,100
2,342000,3,1930,55
3,420000,3,2000,58
4,550000,4,1940,45


In [14]:
# Separate the target (price) from the predictor columns
y = df['price']
X = df.drop(columns = 'price')

In [15]:
# Create a linear regression model and fit it on the predictors X and target y
reg = linear_model.LinearRegression()
reg.fit(X,y)

LinearRegression()

In [18]:
# Predict the price of a home with 3 bedrooms, 3000 sq ft of living area, 40 years old.
# The sample is built with named columns and re-ordered to match the training
# frame, so the prediction does not silently depend on positional order and
# matches the feature names the model was fitted with.
new_home = pd.DataFrame({'bedrooms': [3], 'sqft_living': [3000], 'age': [40]})
prices = reg.predict(new_home[list(X.columns)])
print(prices[0])

831009.3054579223


In [19]:
# Reproduce the prediction by hand from the fitted parameters:
# price = theta0 + theta1 * bedrooms + theta2 * sqft_living + theta3 * age
theta0 = reg.intercept_
theta1, theta2, theta3 = reg.coef_

price = theta0 + theta1 * 3 + theta2 * 3000 + theta3 * 40

print(price)

831009.3054579222


## Exercise
The 'hiring.csv' file contains hiring statistics for a firm, such as each candidate's experience, written test score, and personal interview score. Based on these 3 factors, HR decides the salary. Given this data, build a machine learning model for the HR department that can help them decide salaries for future candidates. Use this model to predict salaries for the following candidates:
 - 2 yr experience, 9 test score, 6 interview score
 - 12 yr experience, 10 test score, 10 interview score