# Data Preprocessing tools

## Importing dependencies

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the raw CSV file into a DataFrame.
dataset = pd.read_csv("./Data.csv")
# Feature matrix: all rows, every column except the last one (as a NumPy array).
feature_x = dataset.iloc[:, :-1].values
# Dependent variable: all rows of the last column (as a NumPy array).
dependent_y = dataset.iloc[:, -1].values

# Show the feature matrix first, then the dependent variable on the following line.
print(feature_x, dependent_y, sep="\n")

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking Care of missing data

In [3]:
from sklearn.impute import SimpleImputer
# Fill missing (NaN) cells with the mean of their column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only the columns at index 1 and 2 (second and third columns) are imputed.
impute_cols = slice(1, 3)
# NOTE(review): the means are learned from every row, i.e. before the train/test
# split performed later in this notebook - confirm that is acceptable here.
feature_x[:, impute_cols] = imputer.fit_transform(feature_x[:, impute_cols])
print(feature_x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder 

## Encoding independent variable

In [5]:
# One-hot encode column 0 and let every remaining column pass through unchanged.
onehot_step = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[onehot_step], remainder='passthrough')
encoded_features = ct.fit_transform(feature_x)
# Store the transformer output back as a NumPy array under the same name.
feature_x = np.array(encoded_features)
print(feature_x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding dependent variable

In [6]:
from sklearn.preprocessing import LabelEncoder
# Replace the class labels of the dependent variable with integer codes.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(dependent_y)
dependent_y = encoded_labels
print(dependent_y)

[0 1 0 0 1 1 0 1 0 1]


In [7]:
# Coding Exercise 3
# Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
df = pd.read_csv('./Titanic.csv')

# Column holding the dependent variable; it must stay out of the feature matrix
target_column = 'Survived'

# Identify the categorical data
categorical_features = ['Sex', 'Embarked', 'Pclass']

# Implement an instance of the ColumnTransformer class:
# one-hot encode the categorical columns, pass every other column through unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_features)], remainder="passthrough")

# Apply the fit_transform method on the feature columns only.
# With remainder="passthrough", transforming the full frame would copy the
# 'Survived' column into X and leak the target into the matrix of features.
X = ct.fit_transform(df.drop(columns=[target_column]))

# Convert the output into a NumPy array
X = np.array(X)

# Use LabelEncoder to encode binary categorical data
le = LabelEncoder()
y = le.fit_transform(df[target_column])

# Print the updated matrix of features and the dependent variable vector
print("updated matrix of features: \n", X)
print("updated dependent variable: \n", y)

updated matrix of features: 
 [[1.0 0.0 1.0 ... 'PC 17599' 71.2833 'C85']
 [1.0 0.0 0.0 ... '113803' 53.1 'C123']
 [0.0 1.0 0.0 ... '17463' 51.8625 'E46']
 ...
 [1.0 0.0 1.0 ... '11767' 83.1583 'C50']
 [1.0 0.0 0.0 ... '112053' 30.0 'B42']
 [0.0 1.0 1.0 ... '111369' 30.0 'C148']]
updated dependent variable: 
 [1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1
 1 1 1 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 1
 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 1
 0 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1]


## Splitting the data set into training set and test set

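We have to apply feature scaling after splitting the data set into training and test sets, so that the scaler is fitted on the training set only and no information from the test set leaks into it.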

In [27]:
from sklearn.model_selection import train_test_split
# Alias the preprocessed arrays: X holds the features, y the dependent variable.
X, y = feature_x, dependent_y
# Hold out 20% of the rows as the test set; random_state is fixed so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Show the training-set features produced by the split.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [16]:
# Show the test-set features produced by the split.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [18]:
# Show the training-set values of the dependent variable.
print(y_train)

[0 1 0 0 1 1 0 1]


In [19]:
# Show the test-set values of the dependent variable.
print(y_test)

[0 1]


## Feature Scaling

In [28]:
from sklearn.preprocessing import StandardScaler
# Standardize only the columns at index 3 and 4; the columns before them hold the
# one-hot encoded values and are left as they are.
sc = StandardScaler()
scaled_cols = slice(3, 5)
# Fit on the training rows only, then reuse the fitted scaler for the test rows.
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

In [29]:
# Show the training features after columns 3 and 4 have been scaled.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [30]:
# Show the test features after the fitted scaler has been applied to columns 3 and 4.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]


In [13]:
# Coding exercise 4
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the Iris dataset
df = pd.read_csv('./iris.csv')
# Features are every column except the last; the target is the last column
features, target = df.iloc[:, :-1].values, df.iloc[:, -1].values
# 80-20 training-test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
# Fit the scaler on the training set only, then transform both sets with it
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
# Printing of the scaled sets is currently disabled; uncomment to inspect them
# print("X_train_scaled: ", X_train)
# print("X_test_scaled: ", X_test)
# print(pd.DataFrame(X_train).head())
# print(pd.DataFrame(X_test).head())


In [31]:
# Exercise 5
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Wine Quality Red dataset (the file is semicolon-separated)
df = pd.read_csv('./winequality-red.csv', sep=';')
# Features: every column except the last; target: the last column
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
# 80-20 training-test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the StandardScaler on the training features only
sc = StandardScaler().fit(X_train)
# Apply the fitted scaler to the training set and to the test set
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# Print each scaled dataset under its label (test set first, then training set)
print("scaled X_test", X_test, sep="\n")
print("scaled X_train", X_train, sep="\n")

scaled X_test
[[-3.61859850e-01  1.64286407e-01 -9.85152962e-01 ... -4.65392578e-01
  -1.34389336e-04 -7.77452782e-01]
 [-3.03840702e-01 -1.70525408e-01 -5.24491803e-01 ...  5.08915214e-01
  -1.03143815e+00 -8.72484283e-01]
 [ 1.37871461e+00  7.78108067e-01 -2.68568937e-01 ... -2.05577167e-01
   1.83329452e+00 -4.92358280e-01]
 ...
 [-1.37449586e-02  3.87494284e-01 -1.15015218e-01 ... -1.04997725e+00
  -7.44964886e-01 -5.87389780e-01]
 [ 2.76350785e-01 -1.45397070e+00  6.01568807e-01 ... -1.04997725e+00
   1.71749571e-01  7.43051230e-01]
 [ 4.50408230e-01  1.30822677e+00 -1.18989125e+00 ... -1.40623314e-01
  -6.87670232e-01 -6.82421281e-01]]
scaled X_train
[[ 0.21833164  0.88971201  0.19209222 ...  1.09349989  0.45822284
   1.12317723]
 [-1.29016623 -1.78878251  0.65275338 ... -0.40043872 -0.40119696
   1.40827174]
 [ 1.49475291 -0.78434707  1.01104539 ... -0.07566946  0.51551749
  -0.58738978]
 ...
 [-0.65195559  0.49909822 -1.08752211 ...  1.28836145 -0.68767023
  -0.87248428]
 [-0.2