In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
dataset=pd.DataFrame({'A.':['Cat', 'Dog', 'Moose','Dog','Moose'],#animal
                     'Age':[4.0,17.0,6.0,8.0,4.0],             #age
                     'Iye':[72000.0,48000.0,54000.0,61000.0,np.nan],#amount
                     'Ore':['No','Yes','No','No','Yes']})#friendly
dataset

Unnamed: 0,A.,Age,Iye,Ore
0,Cat,4.0,72000.0,No
1,Dog,17.0,48000.0,Yes
2,Moose,6.0,54000.0,No
3,Dog,8.0,61000.0,No
4,Moose,4.0,,Yes


In [3]:
x=dataset.iloc[:,:-1].values #matrix form #get all rows and all but last column
x

array([['Cat', 4.0, 72000.0],
       ['Dog', 17.0, 48000.0],
       ['Moose', 6.0, 54000.0],
       ['Dog', 8.0, 61000.0],
       ['Moose', 4.0, nan]], dtype=object)

In [4]:
y=dataset.iloc[:,3].values #get all rows and last column
y

array(['No', 'Yes', 'No', 'No', 'Yes'], dtype=object)

PREPROCESSING NUMERICAL DATA

In [5]:
#dataset.fillna(dataset.mean())#fillna value

In [6]:
from sklearn.preprocessing import Imputer #to replace missing data(nan)
imputer = Imputer(missing_values=np.nan, strategy='mean',axis=0)



In [7]:
imputer = imputer.fit(x[:, 1:3])#to *fit imputer* to *columns that have missing data*  
                                #first colon means include all *rows*, while 1:3 means *column* 1 and 2
imputer

Imputer(axis=0, copy=True, missing_values=nan, strategy='mean', verbose=0)

In [8]:
x[:, 1:3]=imputer.transform(x[:, 1:3])#method to replace the missing data
x[:, 1:3]

array([[4.0, 72000.0],
       [17.0, 48000.0],
       [6.0, 54000.0],
       [8.0, 61000.0],
       [4.0, 58750.0]], dtype=object)

In [9]:
dataset #it can be seen that the above method doesnt affect the real dataset

Unnamed: 0,A.,Age,Iye,Ore
0,Cat,4.0,72000.0,No
1,Dog,17.0,48000.0,Yes
2,Moose,6.0,54000.0,No
3,Dog,8.0,61000.0,No
4,Moose,4.0,,Yes


PREPROCESSING CATEGORICAL COLUMNS

In [10]:
from sklearn.preprocessing import LabelEncoder #encode categorical values as numbers
                                            #it is used if *hierarchical values* are involved, but it is not the case here bcos cat is not higher than mouse
labelencoder_x= LabelEncoder()      
x[:,0]=labelencoder_x.fit_transform(x[:,0])#all rows and first column
x[:,0]

array([0, 1, 2, 1, 2], dtype=object)

In [11]:
from sklearn.preprocessing import OneHotEncoder #create dummy variable which shows 1 for true, 0 for false
onehotencoder=OneHotEncoder(categorical_features=[0])#specify index[0]

In [12]:
x = onehotencoder.fit_transform(x).toarray()
x#single column replaced by three other columns for each categorical variables

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.000e+00, 0.000e+00, 0.000e+00, 4.000e+00, 7.200e+04],
       [0.000e+00, 1.000e+00, 0.000e+00, 1.700e+01, 4.800e+04],
       [0.000e+00, 0.000e+00, 1.000e+00, 6.000e+00, 5.400e+04],
       [0.000e+00, 1.000e+00, 0.000e+00, 8.000e+00, 6.100e+04],
       [0.000e+00, 0.000e+00, 1.000e+00, 4.000e+00, 5.875e+04]])

In [13]:
labelencoder_y=LabelEncoder() #encode y column wch has 'yes' and 'no' variables #used labelencoder bcos its has two values although it is not hierarchical 
y=labelencoder_y.fit_transform(y)
y=y.reshape(-1,1)#was converted to 2D instead of the 1D if reshape isnt used
y

array([[0],
       [1],
       [0],
       [0],
       [1]])

TO TRAIN AND TEST

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)#create train and test set

After splitting train and testing set
1. call algorithmn model such LinearRegression, LogisticRegression
2. fit X and y training set
3. predict y_test and X_test
4. predict y_train and X_train
5. find metics such as MAE,MSE,RMSE

FEATURE SCALING: Putting all our features into the same scale so no one is predominated by the other

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
sc_x=StandardScaler() #pass method to sc_x
sc_x

StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
x_train=sc_x.fit_transform(x_train)#fit and transform train set
x_train

array([[ 1.73205081, -1.        , -0.57735027, -0.80049874,  1.41533491],
       [-0.57735027,  1.        , -0.57735027,  1.64808563, -1.40066822],
       [-0.57735027,  1.        , -0.57735027, -0.04708816,  0.12466681],
       [-0.57735027, -1.        ,  1.73205081, -0.80049874, -0.13933349]])

In [19]:
x_test=sc_x.transform(x_test)#only transform test set
x_test

array([[-0.57735027, -1.        ,  1.73205081, -0.42379345, -0.69666744]])

In [20]:
sc_y=StandardScaler() #pass method to scale y
y_train=sc_y.fit_transform(y_train)
y_train
                           



array([[-1.],
       [ 1.],
       [-1.],
       [ 1.]])

CREATE AND TRAIN MODEL

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
lin_reg=LinearRegression() #creating a linear regression model object

In [23]:
lin_reg.fit(x_train,y_train)#train or fit my model on my training data #don't put in a variable

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

EVALUATE MODEL

In [24]:
print(lin_reg.intercept_)

[1.51096675e-17]


In [25]:
lin_reg.coef_

array([[ 0.10551434, -0.53279123,  0.50969998,  0.56250916, -0.68604549]])