# Column **Transform**

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Read the toy COVID dataset into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='/content/covid_toy.csv')
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [4]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Separate the features from the 'has_covid' target and hold out 20% of the
# rows for testing. random_state pins the shuffle so the split (and every
# output derived from it) is reproducible after a kernel restart.
x_train , x_test , y_train , y_test = train_test_split(
    df.drop(columns = ['has_covid']) , df['has_covid'],
    test_size = 0.2 , random_state = 42)

In [7]:
# Inspect the training features (the 'has_covid' target was dropped when splitting).
x_train

Unnamed: 0,age,gender,fever,cough,city
78,11,Male,100.0,Mild,Bangalore
39,50,Female,103.0,Mild,Kolkata
84,69,Female,98.0,Strong,Mumbai
83,17,Female,104.0,Mild,Kolkata
19,42,Female,,Strong,Bangalore
...,...,...,...,...,...
70,68,Female,101.0,Strong,Delhi
88,5,Female,100.0,Mild,Kolkata
3,31,Female,98.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi


# Manual Approach: Transforming Each Column by Hand

In [9]:
# Fill the missing 'fever' values with the mean learned from the training data.
si = SimpleImputer(strategy="mean")
x_train_fever = si.fit_transform(x_train[['fever']])

# Apply the already-fitted imputer to the test data. Refitting here would
# compute the mean from the test split (data leakage) and would preprocess
# train and test rows differently.
x_test_fever = si.transform(x_test[['fever']])
x_train_fever.shape

(80, 1)

In [10]:
# Ordinal-encode 'cough' with an explicit order: Mild -> 0, Strong -> 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
x_train_cough = oe.fit_transform(x_train[['cough']])

# Encode the test data with the encoder fitted on the training data;
# the test split must only ever be transformed, never fitted on.
x_test_cough = oe.transform(x_test[['cough']])
x_train_cough.shape

(80, 1)

In [11]:
# One-hot encode the nominal columns 'gender' and 'city'. drop='first' removes
# one dummy column per feature; sparse_output=False returns a dense array.
ohe = OneHotEncoder(drop = 'first' , sparse_output = False)
x_train_gender_city = ohe.fit_transform(x_train[['gender','city']])

# Reuse the categories learned from the training data. Refitting on the test
# split could discover a different set of categories, producing columns that
# differ in number or meaning from the training matrix.
x_test_gender_city = ohe.transform(x_test[['gender','city']])
x_train_gender_city.shape

(80, 4)

In [12]:
# Extract age: drop the columns that were transformed separately and keep
# whatever remains as a plain NumPy array.
handled_cols = ['gender', 'fever', 'cough', 'city']
x_train_age = x_train.drop(columns=handled_cols).to_numpy()

# Same extraction for the test data
x_test_age = x_test.drop(columns=handled_cols).to_numpy()


In [13]:
# Shape of the array left after dropping the separately transformed columns.
x_train_age.shape

(80, 1)

In [14]:
# Stack the individually transformed pieces side by side (column-wise):
# age, imputed fever, one-hot gender/city, ordinal cough.
train_parts = [x_train_age, x_train_fever, x_train_gender_city, x_train_cough]
x_train_transformed = np.hstack(train_parts)

In [15]:
# Shape of the manually assembled training matrix.
x_train_transformed.shape

(80, 7)

# With the Help of ColumnTransformer

In [16]:
from sklearn.compose import ColumnTransformer  # this is how to import ColumnTransformer


In [19]:
# Each step is a (name, transformer, columns) triple.
# tnf1: fill the missing 'fever' values (SimpleImputer's default strategy is the mean)
fever_step = ('tnf1', SimpleImputer(), ['fever'])
# tnf2: ordinal-encode 'cough' with the explicit order Mild < Strong
cough_step = ('tnf2', OrdinalEncoder(categories=[['Mild', "Strong"]]), ['cough'])
# tnf3: one-hot encode the nominal columns, dropping the first dummy of each
nominal_step = ('tnf3', OneHotEncoder(sparse_output=False, drop='first'),
                ['gender', 'city'])

# remainder='passthrough' keeps every column not listed in a step unchanged
transformer = ColumnTransformer(
    transformers=[fever_step, cough_step, nominal_step],
    remainder='passthrough',
)

In [20]:
# Fit all steps on the training features, transform them, and check the shape.
x_train_ct = transformer.fit_transform(x_train)
x_train_ct.shape

(80, 7)

In [21]:
# Transform the test features with the already-fitted steps and check the shape.
x_test_ct = transformer.transform(x_test)
x_test_ct.shape

(20, 7)

# Function Transformer


In [None]:
# FunctionTransformer is a tool in scikit-learn, a popular Python library for
# machine learning, that lets you apply a specified function to the input
# data. It is useful for performing custom transformations of the input data
# inside a machine learning pipeline.

In [None]:
# FunctionTransformer takes as input a single function, which it applies to the
# data passed to transform(). This can be any Python callable that takes the data
# as its argument, such as a lambda function or a user-defined function. The
# function should return the transformed data.
In [24]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# small 2x2 sample matrix
X = np.asarray([[1, 2], [3, 4]])

# wrap NumPy's log1p, i.e. log(1 + x), as a transformer
log_transform = FunctionTransformer(func=np.log1p)

# run the wrapped function on the sample matrix
X_transformed = log_transform.transform(X)

# show the result
print(X_transformed)

[[0.69314718 1.09861229]
 [1.38629436 1.60943791]]


In [27]:
# 1> Custom Feature Engineering

from sklearn.preprocessing import FunctionTransformer
import numpy as np

# sample input: a 2x2 integer matrix
x = np.array([[1, 2],
              [3, 4]])
# custom feature engineering: add squared features
def my_engineering(x):
  """Return x with its element-wise squares stacked horizontally after it."""
  squared = x ** 2
  return np.hstack([x, squared])
# wrap the custom function in a FunctionTransformer
custom_transformer = FunctionTransformer(func=my_engineering)

# run the wrapped function on the input data
x_transformed = custom_transformer.transform(x)

# show the transformed data
print(x_transformed)

[[ 1  2  1  4]
 [ 3  4  9 16]]


In [29]:
# 2> Scaling And Normalization

from sklearn.preprocessing import FunctionTransformer
import numpy as np

# sample input: a 2x2 integer matrix
x = np.array([[1, 2],
              [3, 4]])
# custom feature scaling: divide by the overall maximum
def my_scalling(x):
  """Return x divided by its largest element."""
  peak = np.max(x)
  return x / peak
# wrap the custom function in a FunctionTransformer
custom_transformer = FunctionTransformer(func=my_scalling)

# run the wrapped function on the input data
x_transformed = custom_transformer.transform(x)

# show the transformed data
print(x_transformed)

[[0.25 0.5 ]
 [0.75 1.  ]]


In [30]:
# 3> Data Cleaning

from sklearn.preprocessing import FunctionTransformer
import numpy as np

# sample input: a 2x2 matrix with one missing (NaN) entry
x = np.array([[1, 2],
              [3, float('nan')]])
# define a custom data cleaning function
def my_cleaning(x):
  """Return a copy of x in which every NaN entry is replaced by 0.

  The replacement is done on a copy so the caller's data is left untouched;
  assigning into x directly would silently overwrite the original input.
  """
  cleaned = x.copy()
  cleaned[np.isnan(cleaned)] = 0
  return cleaned
# wrap the custom function in a FunctionTransformer
custom_transformer = FunctionTransformer(func=my_cleaning)

# run the wrapped function on the input data
x_transformed = custom_transformer.transform(x)

# show the transformed data
print(x_transformed)

[[1. 2.]
 [3. 0.]]


In [37]:
import numpy as np
import pandas as pd

In [38]:
# Read the placement dataset and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='/content/placement.csv')
df.head(n=3)

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1


In [39]:
# Features are every column except the target; the target is 'placed'.
x, y = df.drop(columns='placed'), df['placed']

In [40]:
from sklearn.preprocessing import FunctionTransformer

In [41]:
# Wrap NumPy's log1p, i.e. log(1 + x), as a transformer
log_transform = FunctionTransformer(func=np.log1p)

# and run it on the feature frame.
X_transformed = log_transform.transform(x)


In [42]:
# Display the log1p-transformed features.
X_transformed

Unnamed: 0,cgpa,resume_score
0,2.212660,2.017566
1,1.969906,1.819699
2,2.226783,2.288486
3,2.064328,2.112635
4,2.142416,2.116256
...,...,...
95,1.991976,1.998774
96,2.222459,2.170196
97,2.034706,2.172476
98,2.212660,1.891605
