## 1. Data Loading and Exploration

#### (a) Load the "employee_attrition.csv" dataset into a Python environment using pandas.

In [1]:
import pandas as pd
from IPython.display import display, HTML  # for showing data frames in a readable way

# Raise the column display limit up front so wide frames are shown in full
pd.set_option('display.max_columns', 999)

# Read the employee attrition CSV into the working DataFrame
df = pd.read_csv('employee_attrition.csv')

#### (b) Explore - show the shape and a few rows

In [13]:
# Report the frame's dimensions as (row count, column count)
print(f'(rows, fields): {df.shape}\n')

# Peek at both ends of the data: the first two and the last two records.
# (An earlier idea, not used here, was to show every 100th record instead.)
display(df.iloc[[0, 1, -2, -1]])

# Print a {column name: dtype name} mapping for every column
print('\nData Types:\n')
print({column: str(dtype) for column, dtype in df.dtypes.items()})

(rows, fields): (1470, 35)



Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,4,Male,63,2,2,Sales Executive,2,Married,5390,13243,2,Y,No,14,3,4,80,0,17,3,2,9,6,0,8
1469,34,No,Travel_Rarely,628,Research & Development,8,3,Medical,1,2068,2,Male,82,4,2,Laboratory Technician,3,Married,4404,10228,2,Y,No,12,3,1,80,0,6,3,4,4,3,1,2



Data Types:

{'Age': 'int64', 'Attrition': 'object', 'BusinessTravel': 'object', 'DailyRate': 'int64', 'Department': 'object', 'DistanceFromHome': 'int64', 'Education': 'int64', 'EducationField': 'object', 'EmployeeCount': 'int64', 'EmployeeNumber': 'int64', 'EnvironmentSatisfaction': 'int64', 'Gender': 'object', 'HourlyRate': 'int64', 'JobInvolvement': 'int64', 'JobLevel': 'int64', 'JobRole': 'object', 'JobSatisfaction': 'int64', 'MaritalStatus': 'object', 'MonthlyIncome': 'int64', 'MonthlyRate': 'int64', 'NumCompaniesWorked': 'int64', 'Over18': 'object', 'OverTime': 'object', 'PercentSalaryHike': 'int64', 'PerformanceRating': 'int64', 'RelationshipSatisfaction': 'int64', 'StandardHours': 'int64', 'StockOptionLevel': 'int64', 'TotalWorkingYears': 'int64', 'TrainingTimesLastYear': 'int64', 'WorkLifeBalance': 'int64', 'YearsAtCompany': 'int64', 'YearsInCurrentRole': 'int64', 'YearsSinceLastPromotion': 'int64', 'YearsWithCurrManager': 'int64'}


#### (c) Identify the target variable (attrition) and the feature variables.

In [15]:
# The column to predict; every other column is kept as a feature
target_variable = 'Attrition'
feature_variables = [column for column in df.columns if column != target_variable]

# Show each selection together with the Python type that holds it
print(f'\nTarget variable: {type(target_variable)}\n\t{target_variable}')
print(f'\nFeature variables:  {type(feature_variables)}\n\t{feature_variables}')


Target variable: <class 'str'>
	Attrition

Feature variables:  <class 'list'>
	['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


## 2. Data Preprocessing

#### (a) Handle missing values, if any, by either dropping the corresponding rows or filling them with appropriate values.

In [17]:
# Only null/NaN entries are treated as missing here; whether zeros or empty
# strings should also count as "missing" is still an open question (TODO).

if df.isnull().any().any():
    # At least one null cell exists: drop every row containing one, in place,
    # and report the shape before and after so the loss is visible.
    print('Null values found! Removing rows with null values.')
    print(f'shape before drop: {df.shape}')
    df.dropna(inplace=True)
    print(f'shape after drop: {df.shape}')
else:
    print('No null values')


No null values


#### (b) Perform categorical encoding for the categorical features using techniques like one-hot encoding or label encoding.

In [18]:
# The features are sorted into three groups (ordered, categorical, unrelated),
# which will be encoded with ordinal, label, and one-hot encoding respectively.
# The grouping is a manual decision, so list every column name to work from.
df.columns.tolist()

['Age',
 'Attrition',
 'BusinessTravel',
 'DailyRate',
 'Department',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

4. Hyperparameter - Basically, anything in machine learning or deep learning whose value or configuration you choose before training begins, and whose value or configuration remains the same when training ends, is a hyperparameter.  https://towardsdatascience.com/parameters-and-hyperparameters-aa609601a9ac?gi=bf02696c756c