# Data Preprocessing

## Importing Data
---
Complete the following:
- Use the parameter 'usecols' to select all columns from the raw data that are needed
- Use the parameter 'parse_dates' to have Pandas automatically parse date info as it is brought in
- Use the paremeter 'index_col' to set the index to the datetime column if this is time series data
- Use the .query() function to import data that's conditional upon another columns values
- Anonymize or remove sensitive data
- Remove unneeded columns such as timestamps, counts, etc. that are guaranteed to have no relationship on the target

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns # Used for correlation heat map
from sklearn.datasets import make_classification
from skorch import NeuralNetClassifier, NeuralNetRegressor
from torch import nn
import torch.nn.functional as F


raw_data = pd.read_csv('./data/TrainingSet.csv')
if isinstance(raw_data, pd.DataFrame):
    print("Data successfully imported.")
else:
    print("Data failed to import.")


test_set = pd.read_csv('./data/TestSet.csv')
if isinstance(test_set, pd.DataFrame):
    print("Test data successfully imported.")
else:
    print("Test data failed to import.")
    
# Remove unneeded columns
del raw_data['timestamp']
del test_set['timestamp']

# Time series example
# hourly_weather_data = pd.read_csv('./data/raw_weather_data.csv', usecols=['DATE','REPORT_TYPE','HourlyDryBulbTemperature', 'HourlyPrecipitation'] , parse_dates=["DATE"], index_col="DATE").query("REPORT_TYPE == 'FM-15'")

print("Data Shape:",raw_data.shape) 
print("Test Shape:",test_set.shape)


  interactivity=interactivity, compiler=compiler, result=result)


Data successfully imported.


  interactivity=interactivity, compiler=compiler, result=result)


Test data successfully imported.
Data Shape: (20000, 379)
Test Shape: (20000, 379)


## Separate Data into Training, Validation, Test, and Target Sets

In [2]:
from sklearn.model_selection import train_test_split

# Separate data into training, validation, and test sets
train_set, validation_set, train_targets, validation_targets = train_test_split(raw_data, raw_data['job_performance'], test_size=0.2)

# Set target and drop from training/test set data
del train_set['job_performance']
del validation_set['job_performance']
del test_set['job_performance']

print("Training Set Shape:",train_set.shape)
print("Validation Set Shape:",validation_set.shape)
print("Test Set Shape:",test_set.shape)
print(train_targets.head(5))
print(validation_targets.head(5))

Training Set Shape: (16000, 378)
Validation Set Shape: (4000, 378)
Test Set Shape: (20000, 378)
11605    2840.739854
14073    3060.220662
526      2871.751466
16969    2696.647037
14771    2474.468696
Name: job_performance, dtype: float64
18818    2447.550539
16329    2603.743230
9684     3008.842835
6938     2570.544318
8714     3312.652231
Name: job_performance, dtype: float64


## Preprocessing Pipeline

In [3]:
import pipeline_functions
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from pipeline_functions import Print, MissingValueRatioFilter, StartTimer, ForceToNumerical, \
ConvertToDataFrame, HighCorrelationFilter, OutputRunTime, ChangeDType

X = train_set.copy(deep=True)
V = validation_set.copy(deep=True)
T = test_set.copy(deep=True)

X['v71'] = pd.to_numeric(X['v71'], errors='coerce')
V['v71'] = pd.to_numeric(V['v71'], errors='coerce')
T['v71'] = pd.to_numeric(T['v71'], errors='coerce')


# Numerical transformations
numerical_missing_ratio = 0.5
variance_threshold = 0.01
numerical_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']  
numerical_features = list(X.select_dtypes(include=numerical_colums).columns)  
numerical_transformer = Pipeline(steps=[
    ('print1', Print(message="  Preprocessing:")),
    ('print2', Print(message="    Numerical: Missing Value Ratio Filter (>"+str(numerical_missing_ratio)+")")),
    ('print3', Print(message="      Starting Numerical Features: ",columns=True)),
    ('missing_value_ratio_filter', MissingValueRatioFilter(ratio_missing=numerical_missing_ratio)),
    ('print4', Print(message="      Remaining Numerical Features:",columns=True)),
    ('print5', Print(message="    Numerical: Imputation")),
    ('imputer', SimpleImputer(strategy='mean')),
    ('print6', Print(message="    Numerical: Normalization")),
    ('scaler', MinMaxScaler()),
    ('print7', Print(message="    Numerical: Low Variance Filter (>"+str(variance_threshold)+")")),
    ('print8', Print(message="      Starting Numerical Features: ",columns=True)),
    ('variance_threshold', VarianceThreshold(threshold=variance_threshold)),
    ('print9', Print(message="      Remaining Numerical Features:",columns=True))
    ])

# Categorical transformations
categorical_missing_ratio = 0.5
categorical_variance_threshold = 0.01
categorical_features = X.select_dtypes(['object']).columns
categorical_transformer = Pipeline(steps=[
    ('print0', Print(message="    Categorical: Missing Value Ratio Filter (>"+str(categorical_missing_ratio)+")")),
    ('print1', Print(message="      Starting Categorical Features: ",columns=True)),
#     ('missing_value_ratio_filter', MissingValueRatioFilter(ratio_missing=categorical_missing_ratio)),
    ('print2', Print(message="      Remaining Categorical Features:",columns=True)),
    ('print3', Print(message="    Categorical: Imputation")),
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('print4', Print(message="    Categorical: Conversion of Ints to Strings")),
    ('change_dtype', ChangeDType()),
    ('print5', Print(message="    Categorical: One Hot Encoding")),
    ('print6', Print(message="      Starting Categorical Features:",columns=True)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('print7', Print(message="      Remaining Categorical Features:",columns=True)),
    ('print8', Print(message="    Categorical: Low Variance Filter (>"+str(categorical_variance_threshold)+")")),
    ('print9', Print(message="      Starting Categorical Features: ",columns=True)),
#     ('variance_threshold', VarianceThreshold(threshold=categorical_variance_threshold)),
    ('print10', Print(message="      Remaining Categorical Features:",columns=True))
    ])

# Combine numerical and categorical data back together
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Master pipeline
high_correlation_filter_decimal = 0.9
master_pipeline = Pipeline([
    ('start_timer', StartTimer()),
    ('print1', Print(message="\nStarting Shape: ", return_shape=True)),
#     ('print2', Print(message="  Forcing column 'v71' to numerical data.")),
#     ('force_to_numerical', ForceToNumerical()),
    ('preprocessor', preprocessor),
    ('print3', Print(message="  Recombined Numerical & Categorical Shape: ",return_shape=True)),
    ('print4', Print(message="  Dimensionality Reduction: ")),
    ('convert_to_dataframe', ConvertToDataFrame()),
    ('print5', Print(message="    High Correlation Filter (> " + str(high_correlation_filter_decimal) + ")")),
    ('high_correlation_filter', HighCorrelationFilter(correlation_decimal=high_correlation_filter_decimal)),
    ('print6', Print(message="Final Shape:",return_shape=True)),
#     ('output_run_time', OutputRunTime(start_time=master_pipeline.named_steps['start_timer'].start_time))
])

# Run numerical data only
  # X = pd.DataFrame(numerical_transformer.fit_transform(cleaned_train_set[numerical_features]))
  # X.head(10)


##### Run on train set
##### Last runtime = 7,094 seconds
train_set_processed = pd.DataFrame(master_pipeline.fit_transform(X))
validation_set_processed = pd.DataFrame(master_pipeline.transform(V))
test_set_processed = pd.DataFrame(master_pipeline.transform(T))



Starting Shape: (16000, 378) 
  Forcing column 'v71' to numerical data. 
  Preprocessing: 
    Numerical: Missing Value Ratio Filter (>0.5) 
      Starting Numerical Features:  84
      Remaining Numerical Features: 36
    Numerical: Imputation 
    Numerical: Normalization 
    Numerical: Low Variance Filter (>0.01) 
      Starting Numerical Features:  36
      Remaining Numerical Features: 30
    Categorical: Missing Value Ratio Filter (>0.5) 
      Starting Categorical Features:  294
      Remaining Categorical Features: 294
    Categorical: Imputation 
    Categorical: Conversion of Ints to Strings 
    Categorical: One Hot Encoding 
      Starting Categorical Features: 294
      Remaining Categorical Features: 2769
    Categorical: Low Variance Filter (>0.01) 
      Starting Categorical Features:  2769
      Remaining Categorical Features: 2769
  Recombined Numerical & Categorical Shape:  (16000, 2799)
  Dimensionality Reduction:  
    Converted Matrix to DataFrame
    High Corre

## Save Pipeline Fit Values to File

In [5]:
from sklearn.externals import joblib

joblib.dump(master_pipeline, '2222_preprocessing_params.joblib')

['preprocessing_pipeline_v1.joblib']

## Load Pipeline File

In [6]:
pipeline = joblib.load('master_pipeline.joblib') 
test_set_processed = pipeline.transform(T)


Starting Shape: (16000, 378) 
  Forcing column 'v71' to numerical data. 
  Preprocessing: 
    Numerical: Missing Value Ratio Filter (>0.5) 
      Starting Numerical Features:  84
      Remaining Numerical Features: 36
    Numerical: Imputation 
    Numerical: Normalization 
    Numerical: Low Variance Filter (>0.01) 
      Starting Numerical Features:  36
      Remaining Numerical Features: 31
    Categorical: Missing Value Ratio Filter (>0.5) 
      Starting Categorical Features:  294
      Remaining Categorical Features: 294
    Categorical: Imputation 
    Categorical: Conversion of Ints to Strings 
    Categorical: One Hot Encoding 
      Starting Categorical Features: 294
      Remaining Categorical Features: 2762
    Categorical: Low Variance Filter (>0.01) 
      Starting Categorical Features:  2762
      Remaining Categorical Features: 2762
  Recombined Numerical & Categorical Shape:  (20000, 2793)
  Dimensionality Reduction:  
    Converted Matrix to DataFrame
    High Corre

## Save Preprocessed Data to CSV

In [7]:
train_set_processed.to_csv(r'./data/2769_Preprocessed_TrainingSet.csv', index=False)
validation_set_processed.to_csv(r'./data/2769_Preprocessed_ValidationSet.csv', index=False)
test_set_processed.to_csv(r'./data/2769_Preprocessed_TestSet.csv', index=False)
# train_targets.to_csv(r'./data/Preprocessed_TrainingTargets.csv', header=['job_performance'], index=False)
# test_targets.to_csv(r'./data/Preprocessed_TestingTargets.csv', header=['job_performance'], index=False)

## Filled NAs with mean
#1815 (7,094 seconds) was with 0.7 missing ratio filter and .01 variance filter on numerical, no filters on categorical, high correlation filter of 0.9
#1616 (4,155 seconds) was with 0.5 missing ratio filter and .01 variance filter on numerical, no filters on categorical, high correlation filter of 0.7