WARNING!!

You will see a lot of bar plots. Two reasons:

 1. I am a big fan of bar plots.
 2. I am exploring ggplot2 for making bar plots.

In [None]:
# This R environment comes with all of CRAN preinstalled, as well as many other helpful packages
# The environment is defined by the kaggle/rstats docker image: https://github.com/kaggle/docker-rstats
# For example, here's several helpful packages to load in 

library(ggplot2) # Data visualization
library(reshape)
library(readr) # CSV file I/O, e.g. the read_csv function

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory.
# list.files() is used instead of shelling out to `ls` so the listing does not
# depend on a Unix shell and is returned as an ordinary R character vector.

list.files("../input")

# Any results you write to the current directory are saved as output.

In [None]:
# Load the HR dataset from the input directory into a data frame.
df <- read.csv(file = "../input/HR_comma_sep.csv")

# Inspect the data: first rows, per-column summary, then column structure.
head(df, n = 6L)
summary(df)
str(df)

# Total number of missing values across the whole data frame.
sum(is.na(df))

In [None]:
# Thankfully there is no missing data, we can continue with our exploration
# Create some plots to understand the data better

In [None]:
# For each department (column 'sales'), compute that department's share of
# all leavers: leavers in the department / total leavers * 100.
plot1 <- aggregate(
  list(Percentage_left = df$left),
  by = list(Department = df$sales),
  FUN = function(left_in_group) sum(left_in_group) / sum(df$left) * 100
)

# One bar per department; x labels are rotated so they do not overlap.
ggplot(plot1, aes(x = Department, y = Percentage_left, fill = Department)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

In [None]:
# The largest share of the people who left belongs to the sales department (28.3%),
# followed by technical (19.5%) and support (15.5%).
# Below: the percentage of each of those departments' own staff that left.

left_in_sales <- df$left[df$sales == "sales"]
sum(left_in_sales) / length(left_in_sales) * 100

left_in_technical <- df$left[df$sales == "technical"]
sum(left_in_technical) / length(left_in_technical) * 100

left_in_support <- df$left[df$sales == "support"]
sum(left_in_support) / length(left_in_support) * 100

In [None]:
# 24.5% of sales, 25.6% of technical and 24.9% of support staff left.
# Let's explore other variables too.
# For each salary band, compute that band's share of all leavers:
# leavers in the band / total leavers * 100.
plot2 <- aggregate(
  list(Percentage_left = df$left),
  by = list(Salary = df$salary),
  FUN = function(left_in_group) sum(left_in_group) / sum(df$left) * 100
)

# One bar per salary band.
ggplot(plot2, aes(x = Salary, y = Percentage_left, fill = Salary)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

In [None]:
# 60.8% of the people who left had a low salary.
# Below: the percentage of low-salary employees who left.

left_low_salary <- df$left[df$salary == "low"]
sum(left_low_salary) / length(left_low_salary) * 100

In [None]:
# 29.7% of the people with a low salary left.
# Low salary could be one of the reasons why people are leaving

In [None]:
# For each value of 'number_project', compute that group's share of all
# leavers: leavers in the group / total leavers * 100.
plot3 <- aggregate(
  list(Percentage_left = df$left),
  by = list(Projects = df$number_project),
  FUN = function(left_in_group) sum(left_in_group) / sum(df$left) * 100
)

# Projects stays on the x axis as-is; it is converted to a factor only for the
# fill so that each project count gets its own discrete colour.
ggplot(plot3, aes(x = Projects, y = Percentage_left, fill = factor(Projects))) +
  geom_col() +
  labs(fill = "Projects") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

In [None]:
# 43.9% of the people who left did so after just two projects. Why is that?
# Let's look at another plot to get more insight.
# Cross-tabulate 'left' against 'number_project'; the resulting data frame has
# columns Var1 (left), Var2 (number_project) and Freq (count).
plot4 <- as.data.frame(table(df$left, df$number_project))

# Stacked bars: head count per number of projects, split by whether they left.
ggplot(plot4, aes(x = Var2, y = Freq, fill = Var1)) +
  geom_col() +
  labs(x = "No. of projects", y = "No. of people", fill = "Left") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

In [None]:
# From the above graph, it looks like the chances of people with 2 projects staying are slim,
# but what is happening at the last column? All the people who were working on 7 projects have left.

In [None]:
# Cross-tabulate 'left' against 'time_spend_company'; the resulting data frame
# has columns Var1 (left), Var2 (time_spend_company) and Freq (count).
plot5 <- as.data.frame(table(df$left, df$time_spend_company))

# Stacked bars: head count per time spent at the company, split by whether they left.
ggplot(plot5, aes(x = Var2, y = Freq, fill = Var1)) +
  geom_col() +
  labs(x = "Time", y = "No. of people", fill = "Left")

In [None]:
# People who have spent a long time at the company have not left, and neither
# have the recent joiners.
# Enough of bar plots... let us make some scatter plots!
#
# Jittered scatter of average monthly hours against time spent at the company,
# coloured by whether the employee left.
# `left` is referenced by bare name inside aes() rather than as df$left, so the
# mapping is evaluated against the plot's own data.
# `average_montly_hours` is the column name exactly as this script uses it.
ggplot(df, aes(x = factor(time_spend_company), y = average_montly_hours)) +
  geom_point(aes(color = factor(left)), position = "jitter") +
  labs(x = "Time", y = "Average_monthly_hrs", color = "Left")

In [None]:
# Jittered scatter of average monthly hours against number of projects,
# coloured by whether the employee left.
# `left` is referenced by bare name inside aes() rather than as df$left, so the
# mapping is evaluated against the plot's own data.
ggplot(df, aes(x = factor(number_project), y = average_montly_hours)) +
  geom_point(aes(color = factor(left)), position = "jitter") +
  labs(x = "Projects", y = "Average_monthly_hrs", color = "Left")

In [None]:
# Jittered scatter of average monthly hours against satisfaction level,
# coloured by whether the employee left.
# `left` is referenced by bare name inside aes() rather than as df$left, so the
# mapping is evaluated against the plot's own data.
ggplot(df, aes(x = satisfaction_level, y = average_montly_hours)) +
  geom_point(aes(color = factor(left)), position = "jitter") +
  labs(x = "Satisfaction_level", y = "Average_monthly_hrs", color = "Left")

In [None]:
# Jittered scatter of satisfaction level against number of projects,
# coloured by whether the employee left.
# `left` is referenced by bare name inside aes() rather than as df$left, so the
# mapping is evaluated against the plot's own data.
ggplot(df, aes(x = factor(number_project), y = satisfaction_level)) +
  geom_point(aes(color = factor(left)), position = "jitter") +
  labs(x = "Projects", y = "Satisfaction_level", color = "Left")

In [None]:
# Jittered scatter of average monthly hours against last evaluation score,
# coloured by whether the employee left.
# `left` is referenced by bare name inside aes() rather than as df$left, so the
# mapping is evaluated against the plot's own data.
ggplot(df, aes(x = last_evaluation, y = average_montly_hours)) +
  geom_point(aes(color = factor(left)), position = "jitter") +
  labs(x = "last_evaluation", y = "Average_monthly_hrs", color = "Left")

In [None]:
# Good news... No more plots!
# We can conclude that employees who are overworked and paid less will leave even after a high evaluation.
# However, there is a cluster of employees who are not overworked but are leaving as well (or were they fired?).
# The reason for this cannot be determined by existing data.