In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #visualization
from sklearn.model_selection import train_test_split #for spliting the dataset
from sklearn.linear_model  import LinearRegression 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the customer dataset.
# Raw string: the original literal relied on unrecognized escapes such as "\L" and "\C",
# which Python reports as invalid escape sequences. The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path - consider a path relative to the notebook.
df = pd.read_csv(r'F:\Machine Learning\Linear Regression\Class_5_Assignment_Dataset\Customers.csv')

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25.0,high,male,single,no
1,25.0,high,male,married,no
2,35.0,high,male,single,yes
3,35.0,medium,male,single,yes
4,30.0,low,female,single,yes


In [4]:
# Summary statistics, transposed so each summarized column becomes a row.
# Only numeric columns are included, which at this point is just `age`.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,20.0,28.65,5.050013,22.0,25.0,27.5,35.0,35.0


In [5]:
# Count missing values per column.
df.isnull().sum()

age         2
income      0
gender      0
m_status    1
buys        0
dtype: int64

**Missing value handling of age**

In [6]:
# Average of the known ages (pandas skips NaN when averaging); used below as the fill value.
mean = df['age'].mean()
mean

28.65

In [7]:
# Replace missing ages with the column mean computed above, then display the column.
df['age'] = df['age'].fillna(mean)
df['age']

0     25.00
1     25.00
2     35.00
3     35.00
4     30.00
5     32.00
6     22.00
7     28.65
8     25.00
9     35.00
10    25.00
11    35.00
12    22.00
13    35.00
14    28.65
15    25.00
16    35.00
17    22.00
18    25.00
19    30.00
20    30.00
21    25.00
Name: age, dtype: float64

**Missing value handling of m_status**

In [8]:
# Most frequent marital status. `mode()` returns a Series (there can be ties), not a scalar.
mode = df['m_status'].mode()
mode

0    single
dtype: object

In [9]:
# Not used: `mode` is a Series, and fillna with a Series matches on index labels,
# so only a missing value at index 0 could be filled. The next cell passes a scalar instead.
#df.m_status = df.m_status.fillna(mode)

In [10]:
# `mode` is a Series, so `.iloc[0]` (position-based indexing) extracts its first entry
# as a scalar; that scalar then fills every missing m_status value.
df['m_status'] = df['m_status'].fillna(mode.iloc[0])

In [11]:
# Display the column to confirm the missing value was replaced.
df.m_status

0      single
1     married
2      single
3      single
4      single
5      single
6      single
7     married
8      single
9     married
10     single
11    married
12     single
13    married
14     single
15    married
16    married
17     single
18    married
19     single
20    married
21     single
Name: m_status, dtype: object

In [12]:
# Re-check missing values: every column should now report zero.
df.isnull().sum()

age         0
income      0
gender      0
m_status    0
buys        0
dtype: int64

**Encoding**

In [13]:
# List the distinct income categories before encoding them.
df.income.unique()

array(['high', 'medium', 'low'], dtype=object)

**One-Hot Encoding - income**

In [14]:
# One-hot encode income. drop_first=True drops the first category ('high') as the
# baseline, leaving the `low` and `medium` indicator columns.
# dtype=int keeps 0/1 integer columns on pandas >= 2.0, where get_dummies otherwise
# returns bool columns (which the default describe() would leave out of the summary).
dummy_var_in = pd.get_dummies(df['income'], drop_first=True, dtype=int)

In [15]:
# Preview the indicator columns produced for income.
dummy_var_in.head()

Unnamed: 0,low,medium
0,0,0
1,0,0
2,0,0
3,0,1
4,1,0


In [16]:
# Copy of the frame without the original string `income` column.
new_df_in = df.drop(columns='income')

In [17]:
# Confirm `income` is gone from the reduced frame.
new_df_in.head()

Unnamed: 0,age,gender,m_status,buys
0,25.0,male,single,no
1,25.0,male,married,no
2,35.0,male,single,yes
3,35.0,male,single,yes
4,30.0,female,single,yes


In [18]:
# Append the income indicator columns to the remaining columns (side by side)
# and rebind `df` to the combined frame.
df = pd.concat([new_df_in, dummy_var_in], axis='columns')
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,male,single,no,0,0
1,25.0,male,married,no,0,0
2,35.0,male,single,yes,0,0
3,35.0,male,single,yes,0,1
4,30.0,female,single,yes,1,0


**Ordinal Encoding - gender**

In [19]:
# List the distinct gender values before encoding them.
df.gender.unique()

array(['male', 'female'], dtype=object)

In [20]:
# Explicit category order for the encoder: codes follow list position,
# so 'male' is encoded as 0 and 'female' as 1.
gender_list = ['male', 'female']

In [21]:
# `categories` takes one list per input column; a single column (gender) is encoded here.
ordinal = OrdinalEncoder(categories=[gender_list])

In [22]:
# Double brackets select a one-column DataFrame (2-D: samples x features) rather than
# a Series; that 2-D frame is what is passed to fit_transform here.
encoded_values = ordinal.fit_transform(df[['gender']])

In [23]:
# Wrap the encoded array in a one-column frame. Reusing df's index keeps the rows lined
# up with `df` when this frame is later assigned back (pandas aligns on index labels),
# even if `df` does not have the default 0..n-1 index.
new_gender = pd.DataFrame(encoded_values, columns=['gender'], index=df.index)

In [24]:
# `df` has not been modified yet: gender still holds the original strings.
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,male,single,no,0,0
1,25.0,male,married,no,0,0
2,35.0,male,single,yes,0,0
3,35.0,male,single,yes,0,1
4,30.0,female,single,yes,1,0


In [25]:
# Preview the numeric gender codes.
new_gender.head()

Unnamed: 0,gender
0,0.0
1,0.0
2,0.0
3,0.0
4,1.0


In [26]:
# Overwrite the string gender column with its numeric codes. Assign the `gender`
# Series rather than the whole one-column DataFrame so the intent (one column in,
# one column out) is explicit; values are matched to `df` rows by index label.
df['gender'] = new_gender['gender']

In [27]:
# Confirm gender is now numeric.
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,0.0,single,no,0,0
1,25.0,0.0,married,no,0,0
2,35.0,0.0,single,yes,0,0
3,35.0,0.0,single,yes,0,1
4,30.0,1.0,single,yes,1,0


**Label Encoding - m_status**

In [28]:
# Single encoder instance, re-fitted below for each column it is applied to.
label = LabelEncoder()

In [29]:
# Replace the marital-status strings with integer codes
# (in this data: 'married' -> 0, 'single' -> 1).
df['m_status'] = label.fit_transform(df['m_status'])

In [30]:
# Confirm m_status is now numeric.
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,0.0,1,no,0,0
1,25.0,0.0,0,no,0,0
2,35.0,0.0,1,yes,0,0
3,35.0,0.0,1,yes,0,1
4,30.0,1.0,1,yes,1,0


**Label Encoding - buys**

In [31]:
# Encode the target ('no' -> 0, 'yes' -> 1). This re-fits the same `label` object,
# so from here on it describes `buys`, no longer `m_status`.
df['buys'] = label.fit_transform(df['buys'])

In [32]:
# All columns should now be numeric.
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,0.0,1,0,0,0
1,25.0,0.0,0,0,0,0
2,35.0,0.0,1,1,0,0
3,35.0,0.0,1,1,0,1
4,30.0,1.0,1,1,1,0


In [33]:
# Verify that encoding introduced no missing values.
df.isnull().sum()

age         0
gender      0
m_status    0
buys        0
low         0
medium      0
dtype: int64

**Summary statistics after encoding, and feature/target split**

In [34]:
# Summary statistics for the fully numeric frame, one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,22.0,28.65,4.803521,22.0,25.0,28.65,34.25,35.0
gender,22.0,0.5,0.511766,0.0,0.0,0.5,1.0,1.0
m_status,22.0,0.590909,0.503236,0.0,0.0,1.0,1.0,1.0
buys,22.0,0.681818,0.476731,0.0,0.0,1.0,1.0,1.0
low,22.0,0.227273,0.428932,0.0,0.0,0.0,0.0,1.0
medium,22.0,0.409091,0.503236,0.0,0.0,0.0,1.0,1.0


In [35]:
# Feature matrix: every column except the target `buys`.
x = df.drop(columns='buys')

In [36]:
# Preview the feature matrix.
x.head()

Unnamed: 0,age,gender,m_status,low,medium
0,25.0,0.0,1,0,0
1,25.0,0.0,0,0,0
2,35.0,0.0,1,0,0
3,35.0,0.0,1,0,1
4,30.0,1.0,1,1,0


In [37]:
# Target as a one-column DataFrame. Selecting `buys` by name replaces the original
# df.drop(x, axis=1), which passed the feature DataFrame itself as the labels to drop
# and only worked because iterating a DataFrame yields its column names.
# Double brackets keep the result 2-D, as before.
y = df[['buys']]

In [38]:
# Preview the target column.
y.head()

Unnamed: 0,buys
0,0
1,0
2,1
3,1
4,1


In [39]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=12,
)

In [40]:
# Preview the training features (row labels are the shuffled original indices).
xtrain.head()

Unnamed: 0,age,gender,m_status,low,medium
16,35.0,0.0,0,0,1
18,25.0,1.0,0,0,1
9,35.0,1.0,0,0,1
5,32.0,1.0,1,1,0
4,30.0,1.0,1,1,0


In [41]:
# Preview the training targets; row labels match those of xtrain.
ytrain.head()

Unnamed: 0,buys
16,1
18,1
9,1
5,0
4,1


In [42]:
# (rows, columns) of the training features.
xtrain.shape

(15, 5)

In [43]:
# (rows, columns) of the training target; the row count must match xtrain.
ytrain.shape

(15, 1)

**Train the model - Linear Regression**

In [44]:
# Create an ordinary least-squares linear regression model with default settings.
# NOTE(review): the target `buys` is a 0/1 label, so this fits a regression line to a
# binary outcome - a classifier may be a better fit if that is not intentional.
reg = LinearRegression()

In [45]:
# Fit the model on the training split.
reg.fit(xtrain, ytrain)

LinearRegression()

In [46]:
# Predict on the held-out features. The outputs are continuous values, not 0/1 labels,
# and are not constrained to the [0, 1] range.
pred=reg.predict(xtest)
pred

array([[0.66943063],
       [1.26835163],
       [0.53066087],
       [0.3225606 ],
       [0.72156792],
       [0.65040398],
       [0.70385333]])

In [47]:
# Held-out feature rows, shown for comparison with the predictions above.
xtest

Unnamed: 0,age,gender,m_status,low,medium
7,28.65,0.0,0,0,1
10,25.0,1.0,1,0,1
21,25.0,0.0,1,1,0
15,25.0,1.0,0,0,0
14,28.65,0.0,1,0,0
8,25.0,1.0,1,1,0
13,35.0,0.0,0,0,1


In [48]:
# True labels for the held-out rows.
ytest

Unnamed: 0,buys
7,0
10,1
21,1
15,1
14,0
8,1
13,0


In [49]:
# Mean squared error between the true 0/1 labels and the continuous predictions.
mean_squared_error(ytest, pred)

0.33394863999830837

In [50]:
# For LinearRegression, score() is the R^2 coefficient of determination. A negative
# value means the model does worse on this test set than always predicting the mean of ytest.
reg.score(xtest, ytest)

-0.3636236133264259