<a href="https://colab.research.google.com/github/jagadish9084/ml-practice/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [15]:
class DataPreprocessor:
    """Impute, one-hot encode, split and standardize a tabular data set.

    The DataFrame handed to the constructor is never modified: imputation
    rebinds ``self.data`` to a cleaned copy and ``preprocess`` builds the
    feature matrix in new frames. After ``preprocess`` the results are
    available as ``x_train``, ``x_test``, ``y_train`` and ``y_test``.

    Args:
        data: pandas DataFrame holding the features and the target column.
        target_variable: label of the column to predict.
        should_handle_missing_value: when True, ``preprocess`` imputes
            missing values before encoding.
    """

    def __init__(self, data, target_variable, should_handle_missing_value=True):
        self.data = data
        self.target_variable = target_variable
        self.should_handle_missing_value = should_handle_missing_value

    @staticmethod
    def _is_numeric(series):
        """Return True for integer/float columns of any width.

        Booleans are excluded so they keep being treated as categories.
        """
        dtype = series.dtype
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    def find_columns_with_missing_values(self):
        """Return the labels of all columns containing at least one NaN."""
        return [col for col, has_missing in self.data.isna().any().items()
                if has_missing]

    def handle_missing_values(self):
        """Fill numeric columns with their mean and other columns with their mode.

        ``self.data`` is rebound to a cleaned copy; the frame supplied by the
        caller is left untouched. A non-numeric column that is entirely
        missing has no mode and is left as-is.
        """
        columns_to_be_handled = self.find_columns_with_missing_values()
        print(f'Columns with missing values before cleaning:{columns_to_be_handled}')
        if columns_to_be_handled:
            cleaned = self.data.copy()
            for col in columns_to_be_handled:
                column = cleaned[col]
                if self._is_numeric(column):
                    # Nullable integer columns cannot store a fractional
                    # mean, so widen them to float first.
                    if pd.api.types.is_integer_dtype(column.dtype):
                        column = column.astype('float64')
                    # Assign the result back: an in-place fillna on a column
                    # selection is chained assignment and may not update
                    # the frame.
                    cleaned[col] = column.fillna(column.mean())
                else:
                    modes = column.mode()
                    if not modes.empty:
                        cleaned[col] = column.fillna(modes.iloc[0])
            self.data = cleaned
        columns_to_be_handled = self.find_columns_with_missing_values()
        print(f'Columns with missing values after cleaning:{columns_to_be_handled}')

    # Original (misspelled) name kept so existing callers continue to work.
    hanlde_missing_values = handle_missing_values

    def encode_categories(self, cat_col):
        """One-hot encode ``cat_col``, dropping the first category of each column.

        Args:
            cat_col: list of column labels in ``self.data`` to encode.

        Returns:
            DataFrame of 0/1 integer columns sharing ``self.data``'s index.
            Columns are named after the category value alone; if that would
            produce duplicate names, ``<column>_<category>`` is used instead.
            An empty ``cat_col`` yields a frame with no columns.
        """
        print(f'Categorical columns to be encoded: {cat_col}')
        if len(cat_col) == 0:
            # OneHotEncoder refuses to fit on zero features.
            return pd.DataFrame(index=self.data.index)

        encoder = OneHotEncoder(drop='first', sparse_output=False, dtype='int')
        out = encoder.fit_transform(self.data[cat_col])

        # Derive the names from the fitted categories rather than by regex
        # on "<column>_<category>", which breaks when either part contains
        # an underscore.
        short_names, full_names = [], []
        for col, categories, dropped in zip(cat_col, encoder.categories_,
                                            encoder.drop_idx_):
            for position, category in enumerate(categories):
                if position == dropped:
                    continue
                short_names.append(str(category))
                full_names.append(f'{col}_{category}')
        names_are_unique = len(set(short_names)) == len(short_names)

        # Reuse the source index so a later column-wise concat aligns rows.
        return pd.DataFrame(out,
                            columns=short_names if names_are_unique else full_names,
                            index=self.data.index)

    def preprocess(self, test_size=0.30, random_state=42):
        """Run the full pipeline and store the train/test splits on the instance.

        Steps: optional imputation, one-hot encoding of every non-numeric
        feature column, train/test split, then standardization fitted on the
        training features only.

        Args:
            test_size: share of rows held out for testing.
            random_state: seed passed to ``train_test_split``.

        Raises:
            KeyError: if ``target_variable`` is not a column of the data.
        """
        if self.target_variable not in self.data.columns:
            raise KeyError(f'Target column {self.target_variable!r} not found in data')

        if self.should_handle_missing_value:
            self.handle_missing_values()

        # The target is excluded so a non-numeric target is kept as y
        # instead of being encoded away.
        cat_col = [col for col in self.data.columns
                   if col != self.target_variable
                   and not self._is_numeric(self.data[col])]
        encoded_data = self.encode_categories(cat_col)

        # Separate dependent and independent variables without touching
        # self.data, so the instance stays usable afterwards.
        numeric_features = self.data.drop(columns=[self.target_variable, *cat_col])
        x = pd.concat([numeric_features, encoded_data], axis=1)
        y = self.data[self.target_variable]

        # Split training and testing data.
        x_train, x_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state)

        # Standardize using statistics of the training split only; keep the
        # row index so features stay aligned with y_train / y_test.
        scaler = StandardScaler()
        self.x_train = pd.DataFrame(scaler.fit_transform(x_train),
                                    columns=x_train.columns, index=x_train.index)
        self.x_test = pd.DataFrame(scaler.transform(x_test),
                                   columns=x_test.columns, index=x_test.index)


In [None]:
# Load seaborn's "flights" example data; 'passengers' is the column to predict.
data_set = sns.load_dataset(name='flights')
target_variable = 'passengers'

# Build the preprocessor and run the whole pipeline.
preprocessor = DataPreprocessor(data=data_set, target_variable=target_variable)
preprocessor.preprocess()

# Last expression of the cell: show the target values of the test split.
preprocessor.y_test