# Import danych & normalizacja

In [11]:
import pandas as pd

df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Normalizacja string -> boolean
df = df.replace({ 'Yes': True })
df = df.replace({ 'No': False })

# Normalizacja string -> enum (BusinessTravel)
df = df.replace({ 'Non-Travel': 0 })
df = df.replace({ 'Travel_Rarely': 1 })
df = df.replace({ 'Travel_Frequently': 2 })

# Normalizacja string -> enum (Department)
df['Department'] = df['Department'].replace({ 'Sales': 0 })
df = df['Department'].replace({ 'Research & Development': 1 })
df = df['Department'].replace({ 'Human Resources': 2 })

# Usunięcie kolumn, które dla każdego rekordu posiadają niezmienną, stałą wartość (1, 80, ID, 1)
df = df.drop(['EmployeeCount', 'StandardHours', 'EmployeeNumber', 'Over18'], 1) # y-axis

# Enumeracja nazw kolumn
column_count = 0
for column in df:
    print(column)
    column_count += 1
  
print(column_count)

df.to_csv('dataset-normalized.csv')

Age
Attrition
BusinessTravel
DailyRate
Department
DistanceFromHome
Education
EducationField
EnvironmentSatisfaction
Gender
HourlyRate
JobInvolvement
JobLevel
JobRole
JobSatisfaction
MaritalStatus
MonthlyIncome
MonthlyRate
NumCompaniesWorked
OverTime
PercentSalaryHike
PerformanceRating
RelationshipSatisfaction
StockOptionLevel
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsAtCompany
YearsInCurrentRole
YearsSinceLastPromotion
YearsWithCurrManager
31


# Opis danych

In [31]:
df.describe()['DistanceFromHome']

count    1470.000000
mean        9.192517
std         8.106864
min         1.000000
25%         2.000000
50%         7.000000
75%        14.000000
max        29.000000
Name: DistanceFromHome, dtype: float64

# Korelacja

In [22]:
corr_table = df.corr()

# corr_table = corr_table.drop(['EmployeeCount', 'StandardHours'], 0) # x-axis
# corr_table = corr_table.drop(['EmployeeCount', 'StandardHours'], 1) # y-axis

def corr_color(val):
    if abs(val) == 1:
        color = '#666'
    elif abs(val) > 0.8:
        color = '#9d0208'
    elif abs(val) > 0.5:
        color = '#e85d04'
    elif abs(val) > 0.3:
        color = '#f48c06'
    else:
        color = 'black'
    return 'color: %s' % color

s = corr_table.style.applymap(corr_color).set_caption('Korelacje między danymi pracowników')

s

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.159205,-0.011807,0.010661,0.031882,-0.001686,0.208034,-0.010145,0.010146,0.024287,0.02982,0.509604,-0.004892,0.497855,0.028051,0.299635,0.028062,0.003634,0.001904,0.053535,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
Attrition,-0.159205,1.0,0.127006,-0.056652,-0.063991,0.077924,-0.031373,-0.010577,-0.103369,-0.006846,-0.130016,-0.169105,-0.103481,-0.15984,0.01517,0.043494,0.246118,-0.013478,0.002889,-0.045872,-0.137145,-0.171063,-0.059478,-0.063939,-0.134392,-0.160545,-0.033019,-0.156199
BusinessTravel,-0.011807,0.127006,1.0,-0.015539,0.00264,-0.009696,-0.00867,-0.018538,-0.01131,-0.004164,0.0293,-0.011696,0.008666,-0.01345,-0.00844,-0.030743,0.042752,-0.025727,0.001683,0.008926,-0.028257,0.007972,0.016357,0.004209,0.005212,-0.005336,0.005222,-0.000229
DailyRate,0.010661,-0.056652,-0.015539,1.0,-0.007109,-0.004985,-0.016806,-0.05099,0.018355,0.023381,0.046135,0.002966,0.030571,0.007707,-0.032182,0.038153,0.009135,0.022704,0.000473,0.007846,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
Department,0.031882,-0.063991,0.00264,-0.007109,1.0,-0.017225,-0.007996,0.010895,0.019395,0.004144,0.024586,-0.101963,-0.021001,-0.05313,-0.023642,0.035882,-0.007481,0.00784,0.024604,0.022414,0.012193,0.015762,-0.036875,-0.026383,-0.02292,-0.056315,-0.040061,-0.034282
DistanceFromHome,-0.001686,0.077924,-0.009696,-0.004985,-0.017225,1.0,0.021042,0.032916,-0.016075,0.031131,0.008783,0.005303,-0.003669,-0.017014,0.027473,-0.029251,0.025514,0.040235,0.02711,0.006557,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
Education,0.208034,-0.031373,-0.00867,-0.016806,-0.007996,0.021042,1.0,0.04207,-0.027128,0.016775,0.042438,0.101589,-0.011296,0.094961,-0.026084,0.126317,-0.020322,-0.011111,-0.024539,-0.009118,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
EmployeeNumber,-0.010145,-0.010577,-0.018538,-0.05099,0.010895,0.032916,0.04207,1.0,0.017621,0.035179,-0.006888,-0.018519,-0.046247,-0.014829,0.012648,-0.001251,-0.024037,-0.012944,-0.020359,-0.069861,0.062227,-0.014365,0.023603,0.010309,-0.01124,-0.008416,-0.009019,-0.009197
EnvironmentSatisfaction,0.010146,-0.103369,-0.01131,0.018355,0.019395,-0.016075,-0.027128,0.017621,1.0,-0.049857,-0.008278,0.001212,-0.006784,-0.006259,0.0376,0.012594,0.070132,-0.031701,-0.029548,0.007665,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
HourlyRate,0.024287,-0.006846,-0.004164,0.023381,0.004144,0.031131,0.016775,0.035179,-0.049857,1.0,0.042861,-0.027853,-0.071335,-0.015794,-0.015297,0.022157,-0.007782,-0.009062,-0.002172,0.00133,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
