In [60]:
import pandas as pd
# Load the home-price table; the displayed columns are town, area and price.
df = pd.read_csv("homeprices.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robbinsville,2600,575000


Using `get_dummies` from pandas to implement one-hot encoding

In [61]:
# get_dummies creates one new True/False (or 0/1) column for each distinct
# value of the categorical `town` column.
dummies = pd.get_dummies(df["town"])
dummies

Unnamed: 0,monroe township,robbinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [62]:
# Append the dummy columns to the right of the original table.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robbinsville,west windsor
0,monroe township,2600,550000,True,False,False
1,monroe township,3000,565000,True,False,False
2,monroe township,3200,610000,True,False,False
3,monroe township,3600,680000,True,False,False
4,monroe township,4000,725000,True,False,False
5,west windsor,2600,585000,False,False,True
6,west windsor,2800,615000,False,False,True
7,west windsor,3300,650000,False,False,True
8,west windsor,3600,710000,False,False,True
9,robbinsville,2600,575000,False,True,False


In [63]:
# Drop the original `town` column and one of the newly added dummy columns
# ('west windsor'): that town is implied whenever the two remaining dummies
# are both False, so keeping it would be redundant.
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robbinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [64]:
from sklearn.linear_model import LinearRegression

In [65]:
# Create the (not yet fitted) linear regression estimator.
model = LinearRegression()

In [66]:
# Feature matrix: every column of `final` except the target `price`.
X = final.drop(columns='price')
X

Unnamed: 0,area,monroe township,robbinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [67]:
# Target vector: the `price` column.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [68]:
# Fit the regression on area plus the two remaining town dummy columns.
model.fit(X,y)

In [69]:
# Predict the price for area = 2800 in robbinsville
# (monroe township = 0, robbinsville = 1).
# The model was fitted on a DataFrame, so the query row is passed as a
# DataFrame with the same column names: this keeps the value-to-feature
# mapping explicit and avoids scikit-learn's
# "X does not have valid feature names" warning for bare lists.
model.predict(
    pd.DataFrame(
        [[2800, 0, 1]],
        columns=['area', 'monroe township', 'robbinsville'],
    )
)



array([590775.63964739])

In [70]:
# Predict the price for area = 3400 in west windsor: both remaining dummies
# are 0, which denotes the town whose dummy column was dropped.
# The model was fitted on a DataFrame, so the query row is passed as a
# DataFrame with the same column names: this keeps the value-to-feature
# mapping explicit and avoids scikit-learn's
# "X does not have valid feature names" warning for bare lists.
model.predict(
    pd.DataFrame(
        [[3400, 0, 0]],
        columns=['area', 'monroe township', 'robbinsville'],
    )
)



array([681241.66845839])

In [71]:
# Score the fit on the same X, y the model was trained on (not a held-out set).
model.score(X,y)

0.9573929037221873

Using `LabelEncoder` and `OneHotEncoder` from sklearn to create the dummy variables

In [72]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the town names into integer codes.
le = LabelEncoder()

In [73]:
# Work on a copy: a plain `dfle = df` only aliases the original frame, so the
# assignment below would also overwrite `df.town` with integers and change
# what earlier cells (e.g. get_dummies on df.town) produce when re-run.
dfle = df.copy()
# Replace the town names with integer codes; in the displayed result
# 0 = monroe township, 1 = robbinsville, 2 = west windsor.
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [107]:
# Extract area and encoded town as 2-D NumPy arrays (the double brackets keep
# the column dimension). NOTE: this rebinds `X`, which held a DataFrame in the
# get_dummies section above.
X = dfle[['area']].to_numpy()
X1 = dfle[['town']].to_numpy()

In [108]:
# Target vector: the `price` column of the label-encoded frame.
y = dfle['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [109]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder with default settings; fitted on the town codes below.
ohe = OneHotEncoder()

In [110]:
# Fit the encoder on the integer town codes and expand them into one
# indicator column per category, then convert the result to a dense NumPy
# array so it can be displayed and wrapped in a DataFrame.
town_onehot = ohe.fit_transform(X1)
u = town_onehot.toarray()
u

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [111]:
# Categories the encoder learned for the input column: the three town codes.
ohe.categories_

[array([0, 1, 2])]

In [112]:
# Wrap the one-hot array in a DataFrame; the columns get the default integer
# labels 0, 1, 2.
u_pd = pd.DataFrame(data=u)
u_pd

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0
6,0.0,0.0,1.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,1.0,0.0


In [113]:
# Wrap the area array `X` in a single-column DataFrame (default label 0).
# NOTE: despite its name, X1_pd is built from `X` (area), not `X1` (town).
X1_pd = pd.DataFrame(data=X)
X1_pd

Unnamed: 0,0
0,2600
1,3000
2,3200
3,3600
4,4000
5,2600
6,2800
7,3300
8,3600
9,2600


In [117]:
# Concatenate with the rest of the dataset (excluding the original categorical column)
final_X = pd.concat([u_pd, X1_pd], axis=1)
final_X

Unnamed: 0,0,1,2,0.1
0,1.0,0.0,0.0,2600
1,1.0,0.0,0.0,3000
2,1.0,0.0,0.0,3200
3,1.0,0.0,0.0,3600
4,1.0,0.0,0.0,4000
5,0.0,0.0,1.0,2600
6,0.0,0.0,1.0,2800
7,0.0,0.0,1.0,3300
8,0.0,0.0,1.0,3600
9,0.0,1.0,0.0,2600


In [124]:
 # Drop one one-hot column (label 1, i.e. town code 1) so that the remaining
 # indicator columns are not redundant with each other.
 # errors='ignore' keeps this self-referential cell idempotent: re-running it
 # after column 1 is already gone is a no-op instead of raising KeyError.
 final_X = final_X.drop(columns=[1], errors='ignore')
 final_X

Unnamed: 0,0,2,0.1
0,1.0,0.0,2600
1,1.0,0.0,3000
2,1.0,0.0,3200
3,1.0,0.0,3600
4,1.0,0.0,4000
5,0.0,1.0,2600
6,0.0,1.0,2800
7,0.0,1.0,3300
8,0.0,1.0,3600
9,0.0,0.0,2600


In [126]:
# Refit the same estimator object on the OneHotEncoder-based feature table.
model.fit(final_X,y)

In [128]:
# Row layout follows final_X: [one-hot column 0, one-hot column 2, area].
# With the label codes shown above (0 = monroe township, 2 = west windsor),
# two zeros select the dropped code 1 (robbinsville) at area = 2800.
model.predict([[0,0,2800]])

array([590775.63964739])