In [114]:
# pandas is imported here because this cell builds a DataFrame; without it the
# cell raises NameError on a fresh kernel (the notebook's other pandas import
# lives in a later cell).
import pandas as pd
from sklearn import datasets

# Load the bundled iris dataset and expose its feature matrix as a DataFrame
# whose columns carry the dataset's feature names.
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])


In [131]:
import pandas as pd

# Notebook-wide display settings: cap printed frames at 50 rows and keep wide
# frames on a single block instead of wrapping their columns.
pd.set_option("display.max_rows", 50)
pd.set_option("expand_frame_repr", False)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


class data_preprocesser:
    '''
    Builds a ColumnTransformer that imputes numerical columns and imputes + one-hot
    encodes categorical columns of a DataFrame.
    Params:
        ► df (Pandas DataFrame) | data whose dtypes are used to pick the column groups
        ► numerical_cols (list) | Optional column names to treat as numerical
                                  (Default: every numeric-dtype column of df)
        ► categorical_cols (list) | Optional column names to treat as categorical
                                    (Default: every object/category-dtype column of df)
    Attributes set on the instance:
        ► numerical_cols | list of column names routed to the numerical imputer
        ► categorical_cols | list of column names routed to the categorical pipeline
        ► preprocessor | the (unfitted) ColumnTransformer combining both steps
    '''

    def __init__(self, df, numerical_cols=None, categorical_cols=None):

        # Derive the column groups from df unless the caller supplies them explicitly
        # (previously these names were never defined, so construction raised NameError)
        if numerical_cols is None:
            numerical_cols = df.select_dtypes(include='number').columns.tolist()
        if categorical_cols is None:
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

        self.numerical_cols = list(numerical_cols)
        self.categorical_cols = list(categorical_cols)

        # Preprocessing for numerical data (constant strategy with the imputer's default fill value)
        numerical_transformer = SimpleImputer(strategy='constant')

        # Preprocessing for categorical data: fill gaps with the most frequent value, then one-hot encode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])

        # Bundle preprocessing for numerical and categorical data and keep it on the
        # instance so callers can fit/transform with it
        self.preprocessor = ColumnTransformer(transformers=[
            ('num', numerical_transformer, self.numerical_cols),
            ('cat', categorical_transformer, self.categorical_cols),
        ])

In [8]:

def calc_missing_pct(train, test, features, show_all=False):
    '''
    Report, per feature, the percentage of missing values in the train and test datasets.
    The resulting table is printed and also returned.
    Params:
        ► train (Pandas DataFrame) | training dataset
        ► test (Pandas DataFrame) | testing dataset
        ► features (list) | column names (str) to include in the report
        ► show_all (bool) | True keeps every feature; False (Default) keeps only
                            features with missing values in train or test
    Return:
        ► Pandas DataFrame indexed by feature name with columns 'Train%' and 'Test%'

    '''

    # Share of null entries per feature, expressed as a percentage, for each dataset
    pct_by_dataset = [
        train[features].isnull().mean() * 100,
        test[features].isnull().mean() * 100,
    ]

    # Side-by-side table: one row per feature, one column per dataset
    missing_pct_df = pd.concat(pct_by_dataset, axis=1, keys=['Train%', 'Test%'])

    if show_all:
        missing_features = missing_pct_df
    else:
        # Keep a feature when either dataset has a non-zero missing percentage
        has_missing = missing_pct_df.gt(0).any(axis=1)
        missing_features = missing_pct_df[has_missing]

    print(missing_features)

    return missing_features
    

In [4]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(model, X_train, X_valid, y_train, y_valid):
    '''
    Fit the given model on the training split and score it on the validation split.
    Params:
        ► model | estimator exposing fit(X, y) and predict(X)
        ► X_train, y_train | training features and targets (passed to model.fit)
        ► X_valid, y_valid | validation features and targets
    Return:
        ► mean absolute error between y_valid and the model's predictions for X_valid
    '''
    # Note: the model passed in is fitted in place
    model.fit(X_train, y_train)
    return mean_absolute_error(y_valid, model.predict(X_valid))

In [3]:
from sklearn.ensemble import RandomForestRegressor

def drop_cols(df):
    '''
    Return a copy of the DataFrame without the columns that contain missing values.
    Params:
        ► df (Pandas DataFrame) | input data (left unmodified)
    Return:
        ► Pandas DataFrame holding only the columns of df that have no null entries
    '''

    def has_missing(name):
        # True when at least one entry of the column is null
        return df[name].isnull().any()

    incomplete_cols = list(filter(has_missing, df.columns))

    return df.drop(columns=incomplete_cols)


In [None]:
from sklearn.impute import SimpleImputer

def impute_cols(df):
    '''
    Fill the missing values of a DataFrame with a default-configured SimpleImputer.
    The imputer is fitted on the frame that is passed in.
    Params:
        ► df (Pandas DataFrame) | data to impute (left unmodified)
    Return:
        ► Pandas DataFrame of imputed values carrying the same column names and
          row index as df
    '''

    # Imputation
    my_imputer = SimpleImputer()
    imputed_values = my_imputer.fit_transform(df)

    # Imputation returns a bare array: put back the column names AND the row index,
    # so the result stays aligned with df when it is later joined/concatenated
    imputed_df = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

    return imputed_df

In [None]:
def impute_extend(df):
    '''
    Impute missing values and add indicator columns recording what was imputed.
    For every column of df that contains missing values, a '<col>_was_missing'
    column is appended before imputation.
    Params:
        ► df (Pandas DataFrame) | data to impute (left unmodified)
    Return:
        ► Pandas DataFrame with the original columns plus the '_was_missing'
          indicator columns, imputed, with df's row index preserved
    '''
    # Make copy to avoid changing original data (when imputing)
    df_plus = df.copy()

    # Get names of columns with missing values
    cols_with_missing = [col for col in df.columns if df[col].isnull().any()]

    # Make new columns indicating what will be imputed
    for col in cols_with_missing:
        df_plus[col + '_was_missing'] = df_plus[col].isnull()

    # Imputation
    my_imputer = SimpleImputer()
    imputed_values = my_imputer.fit_transform(df_plus)

    # Imputation returns a bare array: put back the column names and the row index
    imputed_df_plus = pd.DataFrame(imputed_values, columns=df_plus.columns, index=df_plus.index)

    # Previously the function ended here without returning, so callers always got None
    return imputed_df_plus



In [39]:
class get_cat_columns_short:
    '''
    Classifies the object-dtype columns of a train/test pair for categorical encoding.
    Params:
        ► train (Pandas DataFrame) | training dataset
        ► test (Pandas DataFrame) | testing dataset
        ► cardinality_limit (int) | columns with fewer unique training values than
                                    this count as low cardinality (Default 10)
    Attributes set on the instance (every list follows train's column order):
        ► cardinality_limit | the limit this instance was built with
        ► object_cols | columns of train whose dtype is "object"
        ► good_label_cols | object columns whose test values all occur in train
        ► bad_label_cols | object columns that are not in good_label_cols
        ► low_cardinality_cols | object columns with nunique() < cardinality_limit in train
        ► high_cardinality_cols | object columns that are not in low_cardinality_cols
    '''

    # Class-level default; instances overwrite it with the limit actually used
    cardinality_limit = 10

    def __init__(self, train, test, cardinality_limit=10):

        # Keep the limit on the instance so the attribute reflects the argument
        self.cardinality_limit = cardinality_limit

        # Categorical columns in the training data
        self.object_cols = [col for col in train.columns if train[col].dtype == "object"]

        # Columns that can be safely ordinal encoded (every test value was seen in train)
        self.good_label_cols = [col for col in self.object_cols if
                                set(test[col]).issubset(set(train[col]))]

        # Problematic columns that will be dropped from the dataset.
        # Built with a comprehension (not list(set - set)) so the order is the
        # same on every run instead of depending on string hashing.
        good_lookup = set(self.good_label_cols)
        self.bad_label_cols = [col for col in self.object_cols if col not in good_lookup]

        # Columns that will be one-hot encoded
        self.low_cardinality_cols = [col for col in self.object_cols
                                     if train[col].nunique() < cardinality_limit]

        # Columns that will be dropped from the dataset (same deterministic ordering)
        low_lookup = set(self.low_cardinality_cols)
        self.high_cardinality_cols = [col for col in self.object_cols if col not in low_lookup]

In [None]:
from sklearn.preprocessing import OneHotEncoder

def apply_one_hot_encoder(train, test):
    '''
    One-hot encode the low-cardinality object columns of a train/test pair.
    The encoder is fitted on train only and then applied to test. Every
    object-dtype column is removed from both frames and the encoded columns
    are appended to the remaining columns.
    Params:
        ► train (Pandas DataFrame) | training dataset (encoder is fitted on it)
        ► test (Pandas DataFrame) | dataset encoded with the fitted encoder
    NOTE(review): no return statement is visible at the end of this block, so as
    written OH_X_train / OH_X_valid are built but not handed back — confirm
    whether the function continues or a return is missing.
    '''

    # Column classification, using get_cat_columns_short's default cardinality limit
    cl = get_cat_columns_short(train, test)

    # Categorical columns in the training data
    object_cols = cl.object_cols

    # Columns that can be safely ordinal encoded (all test values occur in train)
    # NOTE(review): not used in the visible part of this function
    good_label_cols = cl.good_label_cols
            
    # Problematic columns that will be dropped from the dataset
    # NOTE(review): not used in the visible part of this function
    bad_label_cols = cl.bad_label_cols

    # Columns that will be one-hot encoded
    low_cardinality_cols = cl.low_cardinality_cols

    # Columns that will be dropped from the dataset
    # NOTE(review): not used directly; these columns are removed below because
    # every object column is dropped
    high_cardinality_cols = cl.high_cardinality_cols

    # Apply one-hot encoder to each column with categorical data
    # (fit on train only, then reuse the fitted encoder for test)
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train[low_cardinality_cols]))
    OH_cols_valid = pd.DataFrame(OH_encoder.transform(test[low_cardinality_cols]))

    # One-hot encoding removed index; put it back
    OH_cols_train.index = train.index
    OH_cols_valid.index = test.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_X_train = train.drop(object_cols, axis=1)
    num_X_valid = test.drop(object_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

    # Ensure all columns have string type
    # (the encoded frames were built without column names, so their labels are integers)
    OH_X_train.columns = OH_X_train.columns.astype(str)
    OH_X_valid.columns = OH_X_valid.columns.astype(str)