# Preprocessing Techniques

## Encoder

In [1]:
#Import libraries
from sklearn import preprocessing
import pandas as pd

In [2]:
# Build a small example dataframe: one (name, gender) record per person
people = [
    ('Jayesh', 'Male'),
    ('Aman', 'Male'),
    ('Mahima', 'Female'),
]
df = pd.DataFrame(people, columns=['Name', 'Gender'])
df

Unnamed: 0,Name,Gender
0,Jayesh,Male
1,Aman,Male
2,Mahima,Female


#### 1. Label Encoder

In [3]:
# Work on a copy so the main dataframe keeps its original string labels,
# then preview the Gender column that is about to be encoded.
new_df = df.copy()
new_df.loc[:, 'Gender']

0      Male
1      Male
2    Female
Name: Gender, dtype: object

In [4]:
# 1. Label Encoder
gender = new_df['Gender']

# Fit the encoder on the column; fit() hands back the fitted encoder itself.
label_encoder = preprocessing.LabelEncoder().fit(gender)

# All categories present in the column.
label_classes = label_encoder.classes_
# String data encoded as numerical data.
label_transform = label_encoder.transform(gender)
# Numerical data decoded back into the original strings.
label_inverse_transform = label_encoder.inverse_transform(label_transform)

print(label_classes)
print(label_transform)
print(label_inverse_transform)

['Female' 'Male']
[1 1 0]
['Male' 'Male' 'Female']


In [5]:
# Store the encoded labels in the working copy.
# The labels are encoded from the untouched `df` (which `new_df` was copied
# from) rather than from `new_df` itself: once `new_df['Gender']` holds
# integers, re-running this cell would pass the fitted encoder labels it has
# never seen and `transform` would raise an error.
new_df['Gender'] = label_encoder.transform(df['Gender'])
new_df

Unnamed: 0,Name,Gender
0,Jayesh,1
1,Aman,1
2,Mahima,0


#### 2. One Hot Encoder

In [9]:
# Start again from a fresh copy of the main dataframe for the one-hot example
new_df = df.copy()
new_df

Unnamed: 0,Name,Gender
0,Jayesh,Male
1,Aman,Male
2,Mahima,Female


In [7]:
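# 2. One hot Encoder (pending)
# Note: `categories` takes the expected category values for each column (or 'auto'),
# not column names, and `fit` must be given the 2-D data to learn from.
# one_hot_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
# one_hot_encoder.fit(new_df[['Gender']])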

In [8]:
# Alternatively, the same one-hot encoding can be done with pandas' get_dummies()
# one_hot_encoded_data = pd.get_dummies(new_df, columns = ['Gender'])
# print(one_hot_encoded_data)

# Train and Test Split

In [2]:
# Import iris dataset from sklearn
from sklearn import datasets

In [3]:
# Load the iris dataset that ships with scikit-learn; the cells below read
# its 'data', 'feature_names' and 'target' entries.
iris = datasets.load_iris()

In [8]:
# Put the iris measurements into a dataframe (one column per feature name)
# and append the class labels as a 'target' column.
df = (
    pd.DataFrame(iris['data'], columns=iris['feature_names'])
    .assign(target=iris['target'])
)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [34]:
# Split the data into train and test sets using sklearn's train_test_split
from sklearn.model_selection import train_test_split

# The first four columns are the features; the fifth column holds the labels.
x, y = df.iloc[:, :4], df.iloc[:, 4]

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
splits = train_test_split(x, y, test_size=0.30, random_state=42)
X_train, X_test, Y_train, Y_test = splits

In [35]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
81,5.5,2.4,3.7,1.0
133,6.3,2.8,5.1,1.5
137,6.4,3.1,5.5,1.8
75,6.6,3.0,4.4,1.4
109,7.2,3.6,6.1,2.5


In [36]:
# Preview the first rows of the test features
X_test.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [37]:
# Preview the first training labels (same row index as X_train)
Y_train.head()

81     1
133    2
137    2
75     1
109    2
Name: target, dtype: int32

In [38]:
# Preview the first test labels (same row index as X_test)
Y_test.head()

73     1
18     0
118    2
78     1
76     1
Name: target, dtype: int32

# Rescaling the Data

In [39]:
# Example dataframe whose three columns sit on very different scales
df = pd.DataFrame({
    'a': [25000, 18000, 9000, 40000],
    'b': [200, 150, 100, 300],
    'c': [30.5, 12.5, 12.7, 21.2],
})
df

Unnamed: 0,a,b,c
0,25000,200,30.5
1,18000,150,12.5
2,9000,100,12.7
3,40000,300,21.2


### Technique 1 - Maximum Absolute Scaling (-1 to 1) 

In [49]:
# Work on a copy of the main dataset so `df` keeps its unscaled values
# for the other techniques
df_copy = df.copy()
df_copy

Unnamed: 0,a,b,c
0,25000,200,30.5
1,18000,150,12.5
2,9000,100,12.7
3,40000,300,21.2


In [42]:
# Divide every column by its largest absolute value, so the column's
# largest-magnitude entry becomes 1 (or -1) and all others fall in between.
max_abs = df_copy.abs().max()
df_copy = df_copy / max_abs
df_copy

Unnamed: 0,a,b,c
0,0.625,0.666667,1.0
1,0.45,0.5,0.409836
2,0.225,0.333333,0.416393
3,1.0,1.0,0.695082


### Technique 2 - Min-Max Scaling (Normalization) (0 to 1)

In [50]:
# Start again from a fresh, unscaled copy of the main dataset
df_copy = df.copy()
df_copy

Unnamed: 0,a,b,c
0,25000,200,30.5
1,18000,150,12.5
2,9000,100,12.7
3,40000,300,21.2


In [51]:
# Shift each column so its minimum is 0, then divide by the column's range
# (max - min) so its maximum becomes 1.
col_min = df_copy.min()
col_range = df_copy.max() - col_min
df_copy = (df_copy - col_min) / col_range
df_copy

Unnamed: 0,a,b,c
0,0.516129,0.5,1.0
1,0.290323,0.25,0.0
2,0.0,0.0,0.011111
3,1.0,1.0,0.483333


### Technique 3 - Z-score Method (Standardization) (typically -3 to 3)

In [53]:
# Start again from a fresh, unscaled copy of the main dataset
df_copy = df.copy()
df_copy

Unnamed: 0,a,b,c
0,25000,200,30.5
1,18000,150,12.5
2,9000,100,12.7
3,40000,300,21.2


In [56]:
# Standardise each column: subtract its mean, then divide by its standard
# deviation. pandas' std() here is the sample standard deviation (ddof=1).
df_copy = df_copy.apply(lambda column: (column - column.mean()) / column.std())
df_copy

Unnamed: 0,a,b,c
0,0.152795,0.146385,1.320159
1,-0.381987,-0.439155,-0.787412
2,-1.069565,-1.024695,-0.763994
3,1.298757,1.317465,0.231247
