In [None]:
'''
Categorical Variables are of two types:
    1. Nominal (Where the categories don't have any numerical ordering between them. It means, no category is better than other)
        Ex. {Male, Female}, {red, blue, green}, {monroe township, robbinsville, west windsor}
    
    2. Ordinal (Where the categories have some kind of ordering between them)
        Ex. {Satisfied, neutral, dissatisfied}, {graduate, masters, phd}, {high, medium, low}
        
When we are dealing with Nominal variables, simple integer encoding will not work, i.e., assigning red=1,blue=2,green=3 won't work, because the model would treat the numbers as an ordering (green > blue > red) that does not exist.
Hence, we use a technique called "One Hot Encoding". In this technique, we create one column for each category of the nominal variable and assign 1 to the column of the category that is present and 0 to the others.
Ex. In case of {red, blue, green}, we create three columns red, blue, green and to denote red, we use 1, 0, 0.
'''

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [2]:
# Load the home-price data; the displayed frame has the columns town, area and price.
df = pd.read_csv("Datasets/homeprices_for_different_townships.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [4]:
# Let us use pandas dummies to predict the home price.
# get_dummies creates one 0/1 indicator column per category of the nominal
# variable `town` (monroe township, robinsville, west windsor).
# dtype=int keeps the indicators as integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False columns).

dummies = pd.get_dummies(df["town"], dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [6]:
# Attach the dummy columns to the original dataframe side by side.
# axis="columns" is the spelled-out equivalent of axis=1.

merged = pd.concat(objs=[df, dummies], axis="columns")
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [7]:
# The 'town' column is no longer needed once it has been one-hot encoded, so we drop it.
# Rule of thumb (dummy variable trap): the dummy columns created from one variable are
# perfectly multicollinear - any one of them can be predicted from the others - which
# makes the regression coefficients hard to interpret.
# Hence, we also drop one of the dummy columns ('west windsor'); a row with 0 in both
# remaining dummy columns then stands for west windsor.

final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [8]:
# Ordinary least-squares linear regression estimator (not fitted yet).
model = linear_model.LinearRegression()

In [9]:
# Features (X): every column except the target, i.e. area, monroe township, robinsville.
# Target (y): price.

X = final.drop(columns='price')
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [10]:
# Target vector: the sale price of every home.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [11]:
# Fit the regression on the features X and the target y.
model.fit(X,y)

LinearRegression()

In [12]:
# Predict the price of a 2800 sqft home in robinsville.
# The query row is built as a DataFrame with the same columns as X, so the values are
# matched to the training features by name (area, monroe township, robinsville) and
# scikit-learn does not warn about missing feature names.
model.predict(pd.DataFrame([[2800, 0, 1]], columns=X.columns))



array([590775.63964739])

In [13]:
# Predict the price of a 3400 sqft home in west windsor.
# west windsor is the dropped (baseline) dummy, so both town indicators are 0.
# The query row is a DataFrame with the same columns as X, so the values are matched to
# the training features by name and scikit-learn does not warn about feature names.
model.predict(pd.DataFrame([[3400, 0, 0]], columns=X.columns))




array([681241.66845839])

In [14]:
# How well does the model fit? score() returns the R^2 coefficient of determination.
# NOTE: it is computed on the same X, y the model was trained on, so it measures
# goodness of fit on the training data rather than accuracy on unseen homes.
model.score(X,y)

0.9573929037221873

In [15]:
# Now let us use the sklearn OneHotEncoder instead of pandas dummies.
# Display the original dataframe again as the starting point.

df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [17]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct label to an integer code.
le = LabelEncoder()

In [18]:
# Assign an integer code to each town (the displayed result shows
# monroe township -> 0, robinsville -> 1, west windsor -> 2).
# Work on a copy: `dfle = df` would only create a second name for the same object, so the
# encoding would also overwrite df.town and break a re-run of the earlier cells that still
# need the town names.
dfle = df.copy()
dfle["town"] = le.fit_transform(dfle["town"])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [19]:
# Feature matrix as a plain NumPy array: column 0 = encoded town, column 1 = area.
X = dfle.loc[:, ['town', 'area']].to_numpy()
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [20]:
# Target vector: the sale price of every home.
y = dfle['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [27]:
# Now we have to create dummy variables using the sklearn OneHotEncoder

from sklearn.preprocessing import OneHotEncoder
# NOTE(review): `ohe` is not referenced by any other cell visible here; the
# ColumnTransformer cell constructs its own OneHotEncoder.
ohe = OneHotEncoder()

In [29]:
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (the integer town code) and pass the remaining column (area)
# through unchanged; the encoded town columns come first in the result.
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
)
# Transform the [town, area] values taken straight from dfle instead of `X = f(X)`:
# re-running this cell then gives the same matrix rather than re-encoding its own output.
X = ct.fit_transform(dfle[['town', 'area']].values)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [30]:
# monroe township, 2600 sqft is represented as [1.0, 0.0, 0.0, 2.6e+03]
# Now let us drop the first column (monroe township) to avoid the dummy variable trap;
# the remaining columns are [robinsville, west windsor, area].
# NOTE: X = X[:, 1:] is not idempotent - running this cell twice drops a second column.

X = X[:,1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [31]:
# Refit the same model object on the one-hot encoded array (this replaces the earlier fit).
model.fit(X,y)

LinearRegression()

In [33]:
# Predict the price of a 2800 sqft home in robinsville.
# Feature order after dropping the first dummy: [robinsville, west windsor, area]

model.predict([[1,0,2800]])

array([590775.63964739])

In [37]:
# Predict the price of a 3400 sqft home in west windsor.
# Feature order after dropping the first dummy: [robinsville, west windsor, area]

model.predict([[0,1,3400]])


array([681241.6684584])