### Import modules

In [110]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt


### Import Dataset

In [111]:
 pd.set_option('display.max_columns', None)
workers_df = pd.read_csv("../data/raw/factory_workers.csv", sep=',', encoding="mac_roman")

In [112]:
# Preview the first rows of the freshly loaded data
workers_df.head()

### Drop Unnecessary Columns

In [113]:
# Columns excluded from the rest of the analysis
unnecessary_cols = [
    'sub_fname', 'sub_lname', 'sub_coll_IDs', 'sub_colls_same_sex_prtn',
    'sup_fname', 'sup_lname',
    'event_week_in_series', 'event_day_in_series',
    'event_weekday_num', 'event_weekday_name',
    'recorded_note_from_sup', 'record_conf_matrix_h',
    'behav_cause_h', 'record_cause',
]

# Remove them from the working frame
workers_df = workers_df.drop(columns=unnecessary_cols)

In [114]:
# Drop all rows with production director as sub_role.
# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
workers_df = workers_df.loc[workers_df['sub_role'] != 'Production Director'].copy()

In [115]:
# Preview the frame after dropping columns and filtering rows
workers_df.head()

### Create target variable column

In [116]:
# Flag each event row: 1 when the event type is 'Resignation', 0 otherwise
is_resignation = workers_df['behav_comptype_h'] == 'Resignation'
workers_df['Resigned'] = is_resignation.astype(int)

### Feature Engineering: Count Recording Discrepancies per Worker

In [117]:
# Distinct event types, in order of first appearance in the data
event_types = list(workers_df['behav_comptype_h'].unique())

# The relevant events are the first ten types plus the last one.
# NOTE(review): this relies on the row order of the input file, and the last
# type is presumed to be the sabotage event - confirm against the displayed list.
sabotage = event_types[-1]
relevant_events = [*event_types[:10], sabotage]
relevant_events

***Count the number of instances each employee had an underrecorded efficacy score***

In [118]:
# Keep only the rows whose recorded efficacy is below the actual efficacy
is_underrecorded = workers_df['recorded_efficacy'] < workers_df['actual_efficacy_h']
underrecorded_efficacies = workers_df[is_underrecorded]

# Count those rows per worker and give the count column a descriptive name
efficacy = (
    underrecorded_efficacies
    .groupby('sub_ID')['recorded_efficacy']
    .count()
    .reset_index(name='Num Underrecorded Efficacy')
)

# Show the workers with the highest counts
efficacy.sort_values(by='Num Underrecorded Efficacy', ascending=False).head()

***Count the number of instances each employee had a mismatched relevant event recorded***

In [119]:
# Keep only the rows whose actual event type is one of the relevant events
is_relevant_event = workers_df['behav_comptype_h'].isin(relevant_events)
relevant_events_df = workers_df.loc[is_relevant_event]

In [120]:
# Rows where the recorded event type differs from the actual event type
is_mismatch = relevant_events_df['behav_comptype_h'].ne(relevant_events_df['record_comptype'])
mismatched_events = relevant_events_df[is_mismatch]

# Count the mismatched rows per worker under a descriptive column name
events = (
    mismatched_events
    .groupby('sub_ID')['behav_comptype_h']
    .count()
    .reset_index(name='Num Mismatched Events')
)

# Show the workers with the highest counts
events.sort_values('Num Mismatched Events', ascending=False).head()

In [121]:
# Combine both per-worker counts into a single frame.
# An outer join keeps every worker that appears in either table; a left join
# from `efficacy` would silently drop workers who have mismatched events but
# no underrecorded efficacy. A count missing on one side means 0 occurrences.
count_cols = ['Num Underrecorded Efficacy', 'Num Mismatched Events']
new_features = efficacy.merge(events, on='sub_ID', how='outer')
new_features[count_cols] = new_features[count_cols].fillna(0).astype(int)
new_features

In [122]:
# Replace missing values left by the merge with 0
new_features = new_features.fillna(0)

# Store the mismatched-events count as an integer column
new_features = new_features.astype({'Num Mismatched Events': 'int'})
new_features.dtypes

In [123]:
# Attach the per-worker counts to every event row.
# A left join keeps workers that are absent from `new_features`; the default
# inner join would remove all of their rows from the dataset. Those workers
# get 0 for both counts.
count_cols = ['Num Underrecorded Efficacy', 'Num Mismatched Events']
merged_df = workers_df.merge(new_features, on='sub_ID', how='left')
merged_df[count_cols] = merged_df[count_cols].fillna(0).astype(int)

### Drop Already-Encoded Features and Duplicate Rows

In [124]:
# Event-type and efficacy columns are now summarised by the count features
features_to_drop = [
    'behav_comptype_h',
    'record_comptype',
    'actual_efficacy_h',
    'recorded_efficacy',
]
merged_df = merged_df.drop(columns=features_to_drop)

In [125]:
# Parse the dates BEFORE sorting. Sorting the raw strings orders them
# lexicographically, which matches chronological order only for ISO-style dates.
dated_df = merged_df.assign(event_date=pd.to_datetime(merged_df['event_date']))

# The target is a per-worker property: mark a worker as resigned when any of
# their event rows is a resignation. Otherwise the label would depend on which
# of several rows sharing the last date happens to be kept below.
dated_df['Resigned'] = dated_df.groupby('sub_ID')['Resigned'].transform('max')

# Keep one row per worker: the one with the latest event date.
# A stable sort makes the choice among rows sharing that date deterministic.
df = (
    dated_df
    .sort_values('event_date', kind='stable')
    .drop_duplicates('sub_ID', keep='last')
    .sort_values('sub_ID')
)

### Create Dummy Variables for Categorical Features

In [126]:
# Categorical columns to one-hot encode
cat_vars = [
    'sub_sex',
    'sub_shift',
    'sub_team',
    'sub_role',
    'sub_workstyle_h',
    'sup_sex',
    'sup_role',
]

In [127]:
# One-hot encode the categorical columns; each new column is named
# "<original column>_<category>" (the default prefix is the column name)
df = pd.get_dummies(df, columns=cat_vars)

In [128]:
# Preview the frame to inspect the newly created dummy columns
df.head()

In [129]:
# Remove the dummy column for supervisors whose role is production director
df = df.drop(columns='sup_role_Production Director')

In [130]:
# Summarize column dtypes and non-null counts to spot columns that need conversion
df.info()

### Fix data types

In [131]:
# Names of the columns still stored with object dtype
o_types = list(df.select_dtypes(include=['object'], exclude=['datetime']).columns)

# Parse each of them as numbers; values that cannot be parsed become NaN
for col in o_types:
    df[col] = pd.to_numeric(df[col], errors='coerce')

### Create train and test splits

In [132]:
# Identifier, date and target columns are left out of the feature matrix
non_feature_cols = ['sub_ID', 'sup_ID', 'event_date', 'Resigned']
X = df.drop(columns=non_feature_cols)

# Target vector
y = df['Resigned']

In [133]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split (and therefore the exported files)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [134]:
# Preview the first rows of the training features
X_train.head()

### Preprocessing: Standardize Numeric Features

In [135]:
# Numeric columns to standardize
num_features = [
    'sub_age',
    'sub_health_h',
    'sub_commitment_h',
    'sub_perceptiveness_h',
    'sub_dexterity_h',
    'sub_sociality_h',
    'sub_goodness_h',
    'sub_strength_h',
    'sub_openmindedness_h',
    'Num Underrecorded Efficacy',
    'Num Mismatched Events',
    'sup_age',
    'sup_sub_age_diff',
    'sup_commitment_h',
    'sup_perceptiveness_h',
    'sup_goodness_h',
]

In [136]:
# instantiate scaler
SS_scaler = StandardScaler()

# Fit on the training rows only, so no test-set statistics enter preprocessing
SS_scaler.fit(X_train[num_features])

# Work on independent copies: assigning into the frames returned by
# train_test_split can trigger pandas' SettingWithCopyWarning
X_train = X_train.copy()
X_test = X_test.copy()

# Apply the SAME fitted scaler to both sets. Leaving X_test unscaled would
# put the exported train and test features on different scales.
X_train[num_features] = SS_scaler.transform(X_train[num_features])
X_test[num_features] = SS_scaler.transform(X_test[num_features])


In [137]:
# Display the training set now that its numeric columns hold standardized values
X_train

In [138]:
# Select just the standardized numeric columns for plotting
scaled_data = X_train.loc[:, num_features]

In [139]:
# Visualize the distribution of each standardized feature.
# One subplot is drawn per column; the default figure size is too small for
# that many panels, so enlarge it and let matplotlib space out the titles.
scaled_data.hist(figsize=(14, 12))
plt.tight_layout()
plt.show()

In [140]:
# Display the training set again (numeric columns are standardized at this point)
X_train

### Check for Outliers with Boxplots

In [141]:
# Draw one boxplot per numeric feature, each in its own figure
for feature in num_features:
    X_train.boxplot(column=[feature])
    plt.show()

## Export train and test data

In [142]:
# Each split is written to ../data/processed/<name>.csv without the index column
exports = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}

try:
    for name, data in exports.items():
        data.to_csv(f'../data/processed/{name}.csv', index=False)
except OSError as err:
    # Only file-system failures (missing directory, permissions, ...) are
    # handled here, and the reason is reported instead of being hidden.
    # Any other exception indicates a real bug and is allowed to propagate.
    print(f'Could not export: {err}')