<a href="https://colab.research.google.com/github/jimmydev88/MachineLearning/blob/main/pipeline_ohe_ct_data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
# Load the passenger data from the short link (it resolves to a CSV file).
# HTTPS is used so the download is not sent in clear text.
dataset = pd.read_csv('https://bit.ly/kaggletrain')
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Keep the target plus three candidate features, and drop the rows whose
# Embarked value is missing.
dataset = dataset.loc[
    dataset['Embarked'].notna(),
    ['Survived', 'Pclass', 'Sex', 'Embarked'],
]

In [6]:
# First pass: one numeric feature only. The double brackets keep X a
# two-dimensional DataFrame; y is the Survived column as a Series.
X = dataset[['Pclass']]
y = dataset['Survived']

## Pipeline and cross-validation

In [7]:
from sklearn.linear_model import LogisticRegression

# NOTE: despite the variable name, LogisticRegression is a classifier; the
# name is kept because later cells refer to it.
regressor = LogisticRegression(solver='lbfgs')

In [8]:
from sklearn.model_selection import cross_val_score

# Baseline: accuracy of the model on the current X, scored on 5
# cross-validation folds and averaged into a single number.
cross_val_score(
    regressor,
    X,
    y,
    cv=5,
    scoring='accuracy',
).mean()

0.6783406335301212

In [9]:
# Class balance of the target, as fractions. The larger fraction is the
# accuracy obtained by always predicting the majority class, which is the
# reference point for the cross-validated score above.
y.value_counts(normalize=True)


0    0.617548
1    0.382452
Name: Survived, dtype: float64

In [10]:
# Preview the first rows of the filtered frame; Sex and Embarked hold strings
# and need encoding before they can be used as model inputs.
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


## Encoding categorical data

### Encoding the Independent Variable

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array so the encoded values can be read directly.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old spelling was removed in 1.4, so try the new name first and fall
# back to the old one on releases that do not know it.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    ohe = OneHotEncoder(sparse=False)

In [12]:
 ohe.fit_transform(dataset[['Sex']])


array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

### Inspecting the encoder and encoding all features

In [13]:
# Categories learned during fit, listed in the same order as the columns of
# the encoded array shown above.
ohe.categories_

[array(['female', 'male'], dtype=object)]

In [17]:
# Rebuild X from every column except the target; this replaces the earlier
# Pclass-only X.
X = dataset.drop(columns='Survived')
X.head()

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [14]:
from sklearn.compose import make_column_transformer

In [18]:
# One-hot encode the two string columns. remainder='passthrough' keeps the
# columns that are not listed (here Pclass) instead of dropping them.
column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough',
)

In [19]:
# Apply the transformer to X: the encoded Sex and Embarked indicator columns
# come first, followed by the passed-through Pclass column.
column_trans.fit_transform(X)

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
# Chain the column transformer and the model into one estimator, so a single
# fit/predict call runs the preprocessing and then the classifier.
pipe = make_pipeline(column_trans, regressor)

In [22]:
# Mean accuracy of the whole pipeline over 5 cross-validation folds; X is
# passed untransformed because the pipeline does the encoding itself.
cross_val_score(
    pipe,
    X,
    y,
    cv=5,
    scoring='accuracy',
).mean()

0.7727924839713071

## Splitting the dataset into the Training set and Test set

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# A bare last expression gives the notebook's rich (truncated) table display
# instead of the plain-text dump that print() produces.
X_train

     Pclass     Sex Embarked
121       3    male        S
687       3    male        S
790       3    male        Q
837       3    male        S
659       1    male        C
..      ...     ...      ...
716       1  female        C
768       3    male        Q
73        3    male        C
236       2    male        S
37        3    male        S

[711 rows x 3 columns]


In [25]:
# Fit the pipeline on the training split, then predict the held-out rows.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe.fit(X_train, y_train).predict(X_test)

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0])