# Include-Prep-Data-For-ML-Models

In [None]:
#### This is a single longish function that creates inputs for any ML model ####
#### For unsupervised models, the entire dataset is returned ####

import math

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils import shuffle

def _scale_numerical(df_part, numerical_features, fitted_scaler):
    """Return a copy of ``df_part`` with its numerical columns scaled.

    ``fitted_scaler`` must already have been fitted.  When there is nothing
    to scale (no numerical columns, or a slice with no rows) the copy is
    returned untouched, because scikit-learn scalers raise on empty input.
    """
    df_scaled = df_part.copy()
    if numerical_features and len(df_part) > 0:
        df_scaled[numerical_features] = fitted_scaler.transform(df_part[numerical_features])
    return df_scaled


def create_model_input(df_dataset, 
                       target_feature,
                       target_feature_type, 
                       categorical_features_list, 
                       numerical_features_list, 
                       scaler='Standard',
                       train_pct_split=0.8,
                       val_pct_split=0.0,
                       random_state=42
                      ):
    """Build train/val/test input and target arrays for an ML model.

    The selected features are one-hot encoded (categorical) and scaled
    (numerical).  The scaler is fitted on the training slice only and then
    applied to every other slice.

    Args:
        df_dataset: the complete pre-processed input DataFrame.
        target_feature: name of the target column, or '' when there is no
            target (unsupervised case).
        target_feature_type: 'categorical', 'numerical', or '' when there is
            no target.  A categorical target is label encoded.
        categorical_features_list: categorical features used by the model
            (may include the target feature; it is filtered out).
        numerical_features_list: numerical features used by the model
            (may include the target feature; it is filtered out).
        scaler: 'Standard' (default) or 'MinMax'.  Any other value falls
            back to the standard scaler.
        train_pct_split: fraction of rows used for training (default 0.8).
        val_pct_split: fraction of rows used for validation (default 0.0,
            i.e., no validation slice).  The rows left over after the
            training and validation slices form the test slice.
        random_state: seed used when shuffling the rows, for replicability.

    Returns:
        A dict with the keys 'X_train', 'X_val', 'X_test', 'X_val_test',
        'X_train_val', 'X_full', 'y_train', 'y_val', 'y_test', 'y_val_test'
        and 'y_train_val'.  With a target, the y entries are float32 arrays
        and 'X_full' is an empty list.  Without a target, the y entries are
        empty lists and 'X_full' holds the entire dataset, one-hot encoded
        and scaled over all of its rows.

    Raises:
        ValueError: if the split fractions are inconsistent or the training
            slice ends up with no rows.
    """
    has_target = target_feature != ''

    #### Step 1: Decide how to split the dataset into train, validate, and test datasets ####
    # A tiny tolerance keeps sums such as 0.7 + 0.3 from failing on float rounding
    if (not 0.0 < train_pct_split <= 1.0
            or val_pct_split < 0.0
            or train_pct_split + val_pct_split > 1.0 + 1e-9):
        raise ValueError('train_pct_split must be in (0, 1], val_pct_split must be >= 0, '
                         'and their sum must not exceed 1')

    #### Step 2: Separate the target feature from the other features ####
    if has_target:
        categorical_features = [x for x in categorical_features_list if x != target_feature]
        numerical_features = [x for x in numerical_features_list if x != target_feature]
    else:
        categorical_features = list(categorical_features_list)
        numerical_features = list(numerical_features_list)

    #### Step 3: Create a dataset with the requisite features for the model from the full dataset ####
    selected_columns = categorical_features + numerical_features
    if has_target:
        selected_columns = selected_columns + [target_feature]
    # Work on an explicit copy so that the caller's dataframe is never modified
    df_model = df_dataset[selected_columns].copy()

    #### Step 4: One-hot-encode the categorical features ####
    # The dtype is pinned because recent pandas versions default to bool dummies;
    #  mixed with float columns, those would turn the .values arrays below into object arrays
    if categorical_features:
        df_model = pd.get_dummies(df_model, columns=categorical_features, dtype='uint8')

    #### Step 5: Label encode the target feature if it's a categorical feature ####
    if has_target and target_feature_type == 'categorical':
        le = LabelEncoder()
        df_model[target_feature] = le.fit_transform(df_model[target_feature])

    #### df_model now contains all the features and the target we need
    ####  in addition, df_model has its categorical features one-hot-encoded and 
    ####  its label/target encoded if needed

    #### Step 6: Shuffle the dataset and split it into train, val, and test ####
    # Shuffle the one-hot-encoded and label-encoded dataset; the seed is set for replicability
    # Why 42 as the default? It's the answer to the "ultimate question of life, the universe,
    ## and everything" as worked out by the supercomputer Deep Thought in Douglas Adams'
    ## The Hitchhiker's Guide to the Galaxy.
    df_shuff = shuffle(df_model, random_state=random_state)

    num_rows = df_shuff.shape[0]
    num_train = math.floor(train_pct_split * num_rows)
    num_val = math.floor(val_pct_split * num_rows)
    if num_train == 0:
        raise ValueError('the training slice is empty; provide more rows or a larger train_pct_split')

    # Train, val, and test dataframes (test consists of the remaining rows of the dataset)
    df_train = df_shuff.iloc[0:num_train]
    df_val = df_shuff.iloc[num_train:num_train+num_val]
    df_test = df_shuff.iloc[num_train+num_val: ]

    # df_val_test combines df_val and df_test in case we don't need them separately
    #  e.g., when using k-fold cross validation with a scikit classifier
    # Typically used when the dataset is small
    df_val_test = pd.concat([df_val, df_test], axis=0)

    # Use df_train_val to (re)train the optimal model once the optimal model 
    #  has been determined using grid search
    df_train_val = pd.concat([df_train, df_val], axis=0)

    # And finally, this is the entire dataset (for unsupervised learning, e.g., clustering analysis)
    df_full = pd.concat([df_train_val, df_test], axis=0)

    #### Step 7: Pick the scaler ####
    # Anything other than 'MinMax' uses StandardScaler, the default scaler
    scaler_class = MinMaxScaler if scaler == 'MinMax' else StandardScaler

    #### Step 8: Scale the numerical features OVER THE TRAINING DATASET ONLY ####
    #### NOTE: copies are made to avoid the pandas SettingWithCopyWarning ####
    #### See https://www.dataquest.io/blog/settingwithcopywarning/ ####
    if has_target:
        df_full_scaled = df_full
    else:
        # No target: scale the entire dataset's numerical features with a scaler
        #  fitted on all the rows
        df_full_scaled = df_full.copy()
        if numerical_features:
            df_full_scaled[numerical_features] = scaler_class().fit_transform(df_full[numerical_features])

    # Fit on just the training dataset; these scaler values are reused for the other datasets
    sc = scaler_class()
    df_train_scaled = df_train.copy()
    if numerical_features:
        df_train_scaled[numerical_features] = sc.fit_transform(df_train[numerical_features])

    #### Step 9: Scale the numerical features of the other datasets using the scaler values
    ####  of the training dataset ####
    # Empty slices (e.g., when val_pct_split is 0) are passed through unscaled by the helper
    df_val_scaled = _scale_numerical(df_val, numerical_features, sc)
    df_test_scaled = _scale_numerical(df_test, numerical_features, sc)
    df_val_test_scaled = _scale_numerical(df_val_test, numerical_features, sc)
    df_train_val_scaled = _scale_numerical(df_train_val, numerical_features, sc)

    #### Step 10: Get the targets for SciKit Learn models as a (num, ) shape array of reals ####
    #### there are no y values for the full dataset because there is no target ####
    if has_target:
        y_train = df_train_scaled[target_feature].values.astype('float32')
        y_val = df_val_scaled[target_feature].values.astype('float32')
        y_test = df_test_scaled[target_feature].values.astype('float32')
        y_val_test = df_val_test_scaled[target_feature].values.astype('float32')
        y_train_val = df_train_val_scaled[target_feature].values.astype('float32')
    else:
        y_train = []
        y_val = []
        y_test = []
        y_val_test = []
        y_train_val = []

    #### Step 11: Create the input and target arrays ####
    #### NOTE: The feature names may have changed when the categorical features
    #### are one-hot-encoded
    # So features are now all column names EXCEPT for the target
    features_list = list(df_train_scaled)
    if has_target:
        features_list.remove(target_feature)

    X_train = df_train_scaled[features_list].values
    X_val = df_val_scaled[features_list].values
    X_test = df_test_scaled[features_list].values
    X_val_test = df_val_test_scaled[features_list].values
    X_train_val = df_train_val_scaled[features_list].values
    if has_target:
        X_full = []
    else:
        X_full = df_full_scaled[features_list].values

    #### OUTPUTS ####
    dict_model_inputs = {'X_train': X_train, 
                         'X_val': X_val, 
                         'X_test': X_test, 
                         'X_val_test': X_val_test, 
                         'X_train_val': X_train_val,
                         'X_full': X_full, 
                         'y_train': y_train, 
                         'y_val': y_val, 
                         'y_test': y_test, 
                         'y_val_test': y_val_test, 
                         'y_train_val': y_train_val
                        }

    return dict_model_inputs