# Predicting house prices with multiple variables

In [14]:
import polars as pl 
import pandas as pd 
from sklearn import linear_model
import matplotlib.pyplot as plt
import math  

In [17]:
# Load the house-price data (columns: area, bedrooms, age, price) and display it.
df = pl.read_csv('homeprices_mult.csv')
df 

area,bedrooms,age,price
i64,i64,i64,i64
2600,3.0,20,550000
3000,4.0,15,565000
3200,,18,610000
3600,3.0,30,595000
4000,5.0,8,760000


In [18]:
# Floor the column median so the value used for imputation is a whole number of bedrooms.
median_bedrooms = math.floor(df.get_column('bedrooms').median())
median_bedrooms

3

In [19]:
# Impute the missing bedroom counts with the floored median held in `median_bedrooms`.
df = df.with_columns(pl.col('bedrooms').fill_null(median_bedrooms))

df

area,bedrooms,age,price
i64,i64,i64,i64
2600,3,20,550000
3000,4,15,565000
3200,3,18,610000
3600,3,30,595000
4000,5,8,760000


In [20]:
# Linear regression estimator from scikit-learn, created with default settings.
reg = linear_model.LinearRegression()

In [21]:
# Feature matrix: the three predictor columns, in the order used for fitting and prediction.
X = df.select(['area', 'bedrooms', 'age'])
X

area,bedrooms,age
i64,i64,i64
2600,3,20
3000,4,15
3200,3,18
3600,3,30
4000,5,8


In [22]:
# Target: the price column, kept as a one-column DataFrame.
y = df.select('price')
y

price
i64
550000
565000
610000
595000
760000


In [23]:
# Fit the regression of price on area, bedrooms and age.
reg.fit(X, y)

In [24]:
# Fitted coefficients, one per feature in X's column order (area, bedrooms, age).
reg.coef_

array([[   137.25, -26025.  ,  -6825.  ]])

In [25]:
# Fitted intercept term of the model.
reg.intercept_

array([383725.])

In [28]:
# Predict the price for 3000 sqft, 3 bedrooms, 40 years old (order: area, bedrooms, age).
reg.predict([[3000, 3, 40]])



array([[444400.]])

In [29]:
# Predict the price for area 2500, 4 bedrooms, age 5 (same feature order as X).
reg.predict([[2500, 4, 5]])



array([[588625.]])

# Predict Salaries

In [33]:
import polars as pl
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn import linear_model
import math 

In [31]:
# Load the hiring data (experience as spelled-out words, two scores out of 10, salary).
# Note: this rebinds `df`, replacing the house-price frame used earlier in the notebook.
df = pl.read_csv('hiring.csv')
df 

experience,test_score(out of 10),interview_score(out of 10),salary($)
str,i64,i64,i64
,8.0,9,50000
,8.0,6,45000
"""five""",6.0,7,60000
"""two""",10.0,10,65000
"""seven""",9.0,6,70000
"""three""",7.0,10,62000
"""ten""",,7,72000
"""eleven""",7.0,8,80000


In [32]:
# Missing experience entries become the word 'zero', in line with the other spelled-out values.
df = df.with_columns(pl.col('experience').fill_null('zero'))

df

experience,test_score(out of 10),interview_score(out of 10),salary($)
str,i64,i64,i64
"""zero""",8.0,9,50000
"""zero""",8.0,6,45000
"""five""",6.0,7,60000
"""two""",10.0,10,65000
"""seven""",9.0,6,70000
"""three""",7.0,10,62000
"""ten""",,7,72000
"""eleven""",7.0,8,80000


In [34]:
# Impute missing test scores with the floored column median so the scores stay whole numbers.
df = df.with_columns(
    pl.col('test_score(out of 10)').fill_null(
        math.floor(df.get_column('test_score(out of 10)').median())
    )
)
df

experience,test_score(out of 10),interview_score(out of 10),salary($)
str,i64,i64,i64
"""zero""",8,9,50000
"""zero""",8,6,45000
"""five""",6,7,60000
"""two""",10,10,65000
"""seven""",9,6,70000
"""three""",7,10,62000
"""ten""",8,7,72000
"""eleven""",7,8,80000


In [36]:
# Convert the spelled-out experience values ('zero' ... 'eleven') to integers.

experience_mapping = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11
}

# `Expr.replace(..., default=...)` is deprecated in polars and emits a
# DeprecationWarning; `replace_strict` is the supported way to remap every value.
# `default=None` keeps the earlier behaviour of turning any word missing from the
# mapping into null, and `return_dtype` yields Int64 directly without a separate cast.
df = df.with_columns(
    pl.col('experience').replace_strict(
        experience_mapping, default=None, return_dtype=pl.Int64
    )
)

df

  pl.col('experience').replace(experience_mapping, default=None).cast(pl.Int64)


experience,test_score(out of 10),interview_score(out of 10),salary($)
i64,i64,i64,i64
0,8,9,50000
0,8,6,45000
5,6,7,60000
2,10,10,65000
7,9,6,70000
3,7,10,62000
10,8,7,72000
11,7,8,80000


In [38]:
# Feature matrix: experience plus the two scores, in the order used for fitting and prediction.
X = df.select([
    'experience',
    'test_score(out of 10)',
    'interview_score(out of 10)',
])

X

experience,test_score(out of 10),interview_score(out of 10)
i64,i64,i64
0,8,9
0,8,6
5,6,7
2,10,10
7,9,6
3,7,10
10,8,7
11,7,8


In [39]:
# Target: the salary column, kept as a one-column DataFrame.
y = df.select('salary($)')

y

salary($)
i64
50000
45000
60000
65000
70000
62000
72000
80000


In [40]:
# Separate linear regression estimator (default settings) for the salary data.
model = linear_model.LinearRegression()

In [41]:
# Fit the regression of salary on experience, test score and interview score.
model.fit(X, y)

In [42]:
# Predict salary for experience 2, test score 9, interview score 6 (X's column order).
model.predict([[2,9,6]])



array([[53205.96797671]])

In [43]:
# Predict salary for experience 12, test score 10, interview score 10 (X's column order).
model.predict([[12,10,10]])



array([[92002.18340611]])