<a href="https://colab.research.google.com/github/jagadish9084/python-core/blob/main/Data_Preprocessing_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [208]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [209]:
class DataPreprocessor:
  def __init__(self, data):
    self.data=data

  # Find columns with missing values
  def find_columns_with_missing_values(self):
    return [col for col, val in self.data.isna().sum().items() if val > 0]

  # Handle missing values for both numerical and categorical columns
  def hanlde_missing_data(self):
    null_columns_to_be_handled = self.find_columns_with_missing_values()
    print(f'Columns with missing values before cleaning:{null_columns_to_be_handled}')
    for col in null_columns_to_be_handled:
      if self.data[col].dtype == 'int' or self.data[col].dtype=='float':
        self.data[col].fillna(self.data[col].mean(), inplace =True)
      else:
        self.data[col].fillna(self.data[col].mode()[0], inplace =True)
    null_columns_to_be_handled = self.find_columns_with_missing_values()
    print(f'Columns with missing values after cleaning:{null_columns_to_be_handled}')

  # Peroform OneHutEncoding for categorical columns
  def encode_categories(self, cat_col):
    print(f'Categorical columns to be encoded: {cat_col}')
    encoder = OneHotEncoder(drop='first', sparse_output=False, dtype ='int')
    out = encoder.fit_transform(self.data[cat_col])
    encoded_dataframe = pd.DataFrame(out, columns=encoder.get_feature_names_out())
    encoded_dataframe.columns = encoded_dataframe.columns.str.replace('\w+_','', regex=True)
    return encoded_dataframe

  def preprocess(self, dependent_var, should_handle_missing_data = True):

    if should_handle_missing_data:
      self.hanlde_missing_data()

    cat_col =  [col for col in self.data.columns if self.data[col].dtype !='float' and self.data[col].dtype !='int']
    encoded_data = self.encode_categories(cat_col)

    # Drop encoded columns from original data set
    self.data.drop(cat_col, axis=1, inplace=True)

    #Concat original data set with encoded columns
    self.data = pd.concat([self.data, encoded_data], axis=1)

    #Seperate dependent and independent variable
    y, x = self.data[dependent_var], self.data.drop([dependent_var], axis=1)

    # Sweep the original data
    del self.data
    # Split traing and testing data
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size = 0.30, random_state=42)

    # Standardize the data
    scalar  = StandardScaler()
    self.x_train = pd.DataFrame(scalar.fit_transform(self.x_train), columns= scalar.get_feature_names_out())
    self.x_test = pd.DataFrame(scalar.transform(self.x_test), columns= scalar.get_feature_names_out())


In [210]:
data = sns.load_dataset('penguins')
dependent_var = 'body_mass_g'
processor =  DataPreprocessor(data)
processor.preprocess(dependent_var)
processor.x_train

Columns with missing values before cleaning:['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']
Columns with missing values after cleaning:[]
Categorical columns to be encoded: ['species', 'island', 'sex']


Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,Chinstrap,Gentoo,Dream,Torgersen,Male
0,-1.224858,0.514465,-1.634564,-0.5,-0.774597,1.314257,-0.392232,0.967204
1,0.385744,-1.004570,0.968116,-0.5,1.290994,-0.760886,-0.392232,0.967204
2,0.277164,-1.055205,1.319830,-0.5,1.290994,-0.760886,-0.392232,0.967204
3,1.453446,0.514465,-0.016682,2.0,-0.774597,1.314257,-0.392232,0.967204
4,0.602903,-0.903301,1.460515,-0.5,1.290994,-0.760886,-0.392232,0.967204
...,...,...,...,...,...,...,...,...
235,0.657193,0.615734,-0.438738,2.0,-0.774597,1.314257,-0.392232,-1.033908
236,-0.772442,0.666369,-0.790451,-0.5,-0.774597,-0.760886,2.549510,0.967204
237,-0.971505,0.058755,-0.157367,-0.5,-0.774597,-0.760886,-0.392232,-1.033908
238,0.476227,-1.460281,0.616403,-0.5,1.290994,-0.760886,-0.392232,-1.033908
