<a href="https://colab.research.google.com/github/futurexskill/ai/blob/master/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import the libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Read the dataset

In [0]:
dataset = pd.read_csv('https://raw.githubusercontent.com/futurexskill/ai/master/retailstore.csv')

## Get Info

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
Age          9 non-null float64
Salary       8 non-null float64
Gender       10 non-null object
Country      10 non-null object
Purchased    10 non-null object
dtypes: float64(2), object(3)
memory usage: 480.0+ bytes


## Print records

In [4]:
dataset

Unnamed: 0,Age,Salary,Gender,Country,Purchased
0,18.0,20000.0,Male,Germany,N
1,19.0,22000.0,Female,France,N
2,20.0,24000.0,Female,England,N
3,21.0,,Male,England,N
4,22.0,50000.0,Male,France,Y
5,23.0,35000.0,Female,England,N
6,24.0,,Male,Germany,N
7,25.0,32000.0,Female,France,Y
8,,35000.0,Male,Germany,N
9,27.0,37000.0,Female,France,N


## Separate the Independent and Dependent variables


In [5]:
X = dataset.iloc[:, :-1].values
X

array([[18.0, 20000.0, 'Male', 'Germany'],
       [19.0, 22000.0, 'Female', 'France'],
       [20.0, 24000.0, 'Female', 'England'],
       [21.0, nan, 'Male', 'England'],
       [22.0, 50000.0, 'Male', 'France'],
       [23.0, 35000.0, 'Female', 'England'],
       [24.0, nan, 'Male', 'Germany'],
       [25.0, 32000.0, 'Female', 'France'],
       [nan, 35000.0, 'Male', 'Germany'],
       [27.0, 37000.0, 'Female', 'France']], dtype=object)

In [6]:
y = dataset.iloc[0:,-1].values
y


array(['N', 'N', 'N', 'N', 'Y', 'N', 'N', 'Y', 'N', 'N'], dtype=object)

## Replacing the missing values

### Imputer class

In [0]:
from sklearn.preprocessing import Imputer

In [8]:
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)



In [0]:
imputer = imputer.fit(X[:, 0:2])

In [0]:
X[:, 0:2] = imputer.transform(X[:, 0:2])

In [11]:
X

array([[18.0, 20000.0, 'Male', 'Germany'],
       [19.0, 22000.0, 'Female', 'France'],
       [20.0, 24000.0, 'Female', 'England'],
       [21.0, 31875.0, 'Male', 'England'],
       [22.0, 50000.0, 'Male', 'France'],
       [23.0, 35000.0, 'Female', 'England'],
       [24.0, 31875.0, 'Male', 'Germany'],
       [25.0, 32000.0, 'Female', 'France'],
       [22.11111111111111, 35000.0, 'Male', 'Germany'],
       [27.0, 37000.0, 'Female', 'France']], dtype=object)

In [12]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,8.0
mean,22.111111,31875.0
std,2.934469,9818.895777
min,18.0,20000.0
25%,20.0,23500.0
50%,22.0,33500.0
75%,24.0,35500.0
max,27.0,50000.0


## Encoding the Independent variables


### Label Encoder



In [0]:
from sklearn.preprocessing import LabelEncoder 
labelencoder_X = LabelEncoder()

Convert Age to Numerical value

In [0]:
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])

In [15]:
X

array([[18.0, 20000.0, 1, 'Germany'],
       [19.0, 22000.0, 0, 'France'],
       [20.0, 24000.0, 0, 'England'],
       [21.0, 31875.0, 1, 'England'],
       [22.0, 50000.0, 1, 'France'],
       [23.0, 35000.0, 0, 'England'],
       [24.0, 31875.0, 1, 'Germany'],
       [25.0, 32000.0, 0, 'France'],
       [22.11111111111111, 35000.0, 1, 'Germany'],
       [27.0, 37000.0, 0, 'France']], dtype=object)

Convert Country to Numeric Value

In [0]:
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

In [17]:
X

array([[18.0, 20000.0, 1, 2],
       [19.0, 22000.0, 0, 1],
       [20.0, 24000.0, 0, 0],
       [21.0, 31875.0, 1, 0],
       [22.0, 50000.0, 1, 1],
       [23.0, 35000.0, 0, 0],
       [24.0, 31875.0, 1, 2],
       [25.0, 32000.0, 0, 1],
       [22.11111111111111, 35000.0, 1, 2],
       [27.0, 37000.0, 0, 1]], dtype=object)

### One Hot Encoder

In [0]:
from sklearn.preprocessing import OneHotEncoder

In [0]:
onehotencoder = OneHotEncoder(categorical_features = [3])


In [20]:
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [21]:
X

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.80000000e+01,
        2.00000000e+04, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.90000000e+01,
        2.20000000e+04, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.00000000e+01,
        2.40000000e+04, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.10000000e+01,
        3.18750000e+04, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.20000000e+01,
        5.00000000e+04, 1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.30000000e+01,
        3.50000000e+04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.40000000e+01,
        3.18750000e+04, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.50000000e+01,
        3.20000000e+04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.21111111e+01,
        3.50000000e+04, 

Remove the First Column

In [0]:
X = X[:,1:]

In [23]:
X

array([[0.00000000e+00, 1.00000000e+00, 1.80000000e+01, 2.00000000e+04,
        1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 1.90000000e+01, 2.20000000e+04,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.00000000e+01, 2.40000000e+04,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.10000000e+01, 3.18750000e+04,
        1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 2.20000000e+01, 5.00000000e+04,
        1.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.30000000e+01, 3.50000000e+04,
        0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 2.40000000e+01, 3.18750000e+04,
        1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 2.50000000e+01, 3.20000000e+04,
        0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 2.21111111e+01, 3.50000000e+04,
        1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 2.70000000e+01, 3.70000000e+04,
        0.00000000e+00]])

## Encoding the dependent value

In [0]:
labelencoder_y = LabelEncoder()

In [0]:
y = labelencoder_X.fit_transform(X[:, 0])

In [26]:
y

array([0, 1, 0, 0, 1, 0, 0, 1, 0, 1])

## Split the Training and Test Data


In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =.20,random_state=100)

## Feature Scaling

In [0]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Build a Classification Model

In [29]:
from sklearn.neighbors import KNeighborsClassifier
# minkowski is for ecledian distance
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Evaluate the Model

In [0]:
y_pred = classifier.predict(X_test)

In [0]:
from sklearn.metrics import confusion_matrix, accuracy_score


In [32]:
cm = confusion_matrix(y_test, y_pred)
cm


array([[1, 0],
       [0, 1]])

In [33]:
accuracy_score(y_test,y_pred)

1.0