In [None]:
#https://colab.research.google.com/github/adelnehme/intro-to-data-visualization-Python-live-training/blob/master/Introduction_to_data_visualization_in_Python_live_session_full.ipynb#scrollTo=66IR-NarBKbu
# Importing relevant packages with their aliases
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [None]:
# Location of the trimmed employee churn CSV (served raw from GitHub)
churn_csv_url = 'https://github.com/adelnehme/intro-to-data-visualization-Python-live-training/blob/master/employee_churned_trimmed.csv?raw=true'

# Load the dataset, using the "Unnamed: 0" column as the row index
employee_churn = pd.read_csv(churn_csv_url, index_col="Unnamed: 0")

In [None]:
# Set initial plot options
sns.set_style('white')
plt.figure(figsize=(7,5))

# Compute the median age once; it is reused for the marker line and its label
median_age = employee_churn['Age'].median()

# Create density plot.
# kdeplot replaces the deprecated distplot(hist=False, kde_kws={"shade": True}).
sns.kdeplot(x = employee_churn['Age'],
            # Set color of density plot
            color = "orange",
            # Shade the area under the curve for better aesthetics
            fill = True)

# Add vertical line at the median age
plt.axvline(median_age,
            # Set upper limit for vertical line (as a fraction of the axes height)
            ymax = 0.95,
            # Set linestyle to "--"
            linestyle = "--")

# Annotate the median age of all employees, slightly to the right of the line
plt.text(median_age + 2, 0.04,
         # Create text
         'Median Age: ' + str(median_age),
         # Set text style
         fontsize = 10,
         style = "italic")

# Despine plot
sns.despine(left = True)
# Add finishing touches: hide density ticks, label the axis and title the figure
plt.yticks([])
plt.xlabel('Employee Age', fontsize = 12, fontweight = "semibold")
plt.title('Age Distribution of Employees', fontsize = 14, fontweight = "semibold")
plt.show()

In [None]:
# Male employees only, and the median of their ages
male_churn = employee_churn.loc[employee_churn['Gender'].eq("Male")]
median_male_age = male_churn['Age'].median()

# Female employees only, and the median of their ages
female_churn = employee_churn.loc[employee_churn['Gender'].eq("Female")]
median_female_age = female_churn['Age'].median()

In [None]:
# Set initial plot options
sns.set_style('white')

# Create figure and axes: two stacked panels, male on top and female below
fig, axes = plt.subplots(2, 1, figsize=(10, 8))

# Create plot of male age distribution on first axes.
# kdeplot replaces the deprecated distplot(hist=False, kde_kws={"shade": True}).
sns.kdeplot(x = male_churn['Age'],
            color = "skyblue",
            fill = True,
            ax = axes[0])

# Add vertical line and annotate median age for male employees on first axes
axes[0].axvline(median_male_age, ymax = 0.95, linestyle = "--")
axes[0].text(median_male_age + 3, 0.04, 'Median Age: ' + str(median_male_age) , fontsize = 10, style = "italic")
# Add number of male employees for context
axes[0].text(10, 0.04, str(male_churn.shape[0]) + " employees" , fontsize = 10)
# Format subplot in axes
axes[0].set_yticks([])
axes[0].set_xlabel("Male Employees", fontsize = 12, fontweight = "semibold")


# Create plot of female age distribution on second axes.
# The data must come from female_churn; plotting male_churn here would
# simply repeat the first panel.
sns.kdeplot(x = female_churn['Age'],
            color = "r",
            fill = True,
            ax = axes[1])

# Add vertical line and annotate median age for female employees on second axes
axes[1].axvline(median_female_age, ymax = 0.95, linestyle = "--")
axes[1].text(median_female_age + 3, 0.04, 'Median Age: ' + str(median_female_age) , fontsize = 10, style = "italic")
# Add number of female employees for context
axes[1].text(10, 0.04, str(female_churn.shape[0]) + " employees" , fontsize = 10)
# Format subplot in axes
axes[1].set_yticks([])
axes[1].set_xlabel("Female Employees", fontsize = 12, fontweight = "semibold")

# Despine visualizations
sns.despine(left = True)
# Figure final formatting
fig.suptitle('Age Distribution by gender', fontsize = 14, fontweight = "semibold")
plt.show()

In [None]:
# White seaborn theme on a 12x8 canvas
sns.set_style('white')
plt.figure(figsize = (12, 8))

# Education levels listed from lowest to highest; fixes the x-axis ordering
education_order = ['Below College', 'College', "Bachelor's degree", "Master's degree", 'PhD']
# Shared font settings for both axis labels
axis_font = {"fontsize": 12, "fontweight": "semibold"}

# Count employees per education level, with one bar per gender
sns.countplot(data = employee_churn,
              x = 'Education',
              hue = 'Gender',
              order = education_order,
              # Colour of each hue level, drawn semi-transparent
              palette = ["r", "skyblue"],
              alpha = 0.6)

# Drop the top and right spines
sns.despine()

# Final styling touches
plt.ylabel("Number of employees", **axis_font)
plt.xlabel("Education", **axis_font)
plt.title("Distribution of Education Levels by Gender", fontsize = 14, fontweight = "semibold")
plt.show()

In [None]:
# A note on zip(): it pairs up the elements of several iterables by position
first_names = ['Adel', 'Sara', 'Lis']
last_names = ['Nehme', 'Billen', 'Sulmont']

# Walk through both lists in lockstep, printing each last name in capitals
for first, last in zip(first_names, last_names):
    shouted_last = last.upper()
    print(first, shouted_last)

In [None]:
# White seaborn theme for the whole figure
sns.set_style('white')

# Columns to draw (one per panel) and the human-readable name of each
columns_to_plot = ['Age', "DistanceFromHome"]
titles_to_plot = ["Age", "Distance From Home (km)"]

# One row, two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(22, 8))

# Fill each panel with the boxplot for the column at the same position
for panel_idx, ax in enumerate(axes):
    column = columns_to_plot[panel_idx]
    title = titles_to_plot[panel_idx]

    # Distribution of `column` split by attrition outcome
    sns.boxplot(data = employee_churn,
                x = "Attrition",
                y = column,
                # Fixed category order on the x-axis
                order = ["Churned", "Stayed"],
                # Narrower boxes with thicker outlines
                width = 0.4,
                linewidth = 1.75,
                # Churned in red, stayed in green
                palette = ["r", "g"],
                # Draw on this panel
                ax = ax)

    # Drop the top and right spines
    sns.despine()

    # Labels and title for this panel
    ax.set_xlabel("Attrition", fontsize = 12, fontweight = "semibold")
    ax.set_ylabel(title, fontsize = 12, fontweight = "semibold")
    ax.set_title("Employee churn by " + title, fontweight = "semibold", fontsize = 14)


plt.show()

In [None]:
# Share of each Attrition outcome within every RelationshipSatisfaction level
# (normalize="index" makes each row sum to 1), flattened back to plain columns
relationship_attrition = (
    pd.crosstab(employee_churn['RelationshipSatisfaction'],
                employee_churn['Attrition'],
                normalize = "index")
    .reset_index()
)

# Express the proportions as percentages
outcome_columns = ['Churned', 'Stayed']
relationship_attrition[outcome_columns] = relationship_attrition[outcome_columns].mul(100)

# Row total of both outcomes; serves as the full-length background bar later
relationship_attrition['Total'] = relationship_attrition['Churned'].add(relationship_attrition['Stayed'])

# Display the resulting table
relationship_attrition

In [None]:
# White seaborn theme on a 10x5 canvas
sns.set_style('white')
plt.figure(figsize = (10,5))

# Satisfaction levels in the order they should appear on the y-axis
satisfaction_order = ['Low', 'Good', 'Great', 'Excellent']

# Draw two layers of horizontal bars on the same axes: first the green
# "Total" bars (the full 100%), then the red "Churned" bars over them
for measure, bar_color in (("Total", "g"), ("Churned", "r")):
    sns.barplot(x = measure,
                y = "RelationshipSatisfaction",
                data = relationship_attrition,
                order = satisfaction_order,
                color = bar_color)

# Remove the left and bottom spines as well as the default top/right ones
sns.despine(left = True, bottom = True)

# Final formatting changes
axis_font = {"fontweight": "semibold", "fontsize": 12}
plt.xlim(0,100)
plt.xlabel("Attrition Rate", **axis_font)
plt.ylabel("Relationship Satisfaction", **axis_font)
plt.title("Thoughts on Relationship Satisfaction based on Churn", fontweight = "semibold", fontsize = 14)

plt.show()

In [None]:
# Set initial plot options
sns.set_style('white')

# Create helper lists: survey columns and the display title used for each
survey_columns = ["EnvironmentSatisfaction", "JobSatisfaction", "JobInvolvement", "WorkLifeBalance"]
survey_titles = ["Environment Satisfaction", "Job Satisfaction", "Job Involvement", "Work-life Balance"]

# Create subplots - 2 rows, 2 columns
fig, axes = plt.subplots(2, 2, figsize=(24, 16))

# Pair each axes (flattened into a 1-D sequence) with a survey column and its title
for ax, survey_question, survey_title in zip(axes.flatten(), survey_columns, survey_titles):
  
  # Cross tabulate data and generate proportions for each survey question
  proportions = pd.crosstab(employee_churn[survey_question], 
                            employee_churn['Attrition'], 
                            # Find proportions across rows
                            normalize = "index").reset_index()
  
  # Find percentages instead of proportions
  proportions[['Churned', 'Stayed']] = proportions[['Churned', 'Stayed']] * 100 
  
  # Get total column for each survey question (the full-length background bar)
  proportions['Total'] = proportions['Churned'] + proportions['Stayed']
  
  # Plot Total plot
  sns.barplot(x = "Total", 
              y =  survey_question, 
              data = proportions, 
              order = ['Low', 'Good', 'Great', 'Excellent'], 
              color= "g",
              ax = ax)

  # Plot Churned plot on top of the Total bars
  sns.barplot(x = "Churned", 
              y = survey_question, 
              data = proportions, 
              order = ['Low', 'Good', 'Great', 'Excellent'], 
              color= "r",
              ax = ax)
  
  # Set final formatting
  ax.set_title("Attrition by " + survey_title, fontweight = "semibold", fontsize = 14)
  ax.set_ylabel(survey_title, fontsize = 12, fontweight = "semibold")
  ax.set_xlabel("Attrition Rate (%)", fontsize = 12, fontweight = "semibold")

# Despine every panel of the figure in one call instead of once per iteration
sns.despine(fig = fig, left = True, bottom = True)
plt.show()

In [None]:
# White seaborn theme on a 14x8 canvas
sns.set_style('white')
plt.figure(figsize = (14, 8))

# Shared font settings for both axis labels
axis_font = {"fontsize": 12, "fontweight": "semibold"}

# Monthly income against age; Attrition drives the colour, size and marker
# shape of every dot
sns.scatterplot(data = employee_churn,
                x = "Age",
                y = "MonthlyIncome",
                hue = "Attrition",
                size = "Attrition",
                style = "Attrition",
                # Colours assigned to the hue categories
                palette = ["r", "g"],
                # Semi-transparent dots
                alpha = 0.6)

# Drop the top and right spines
sns.despine()

# Final formatting touches
plt.xlabel("Age", **axis_font)
plt.ylabel("Monthly Income ($)", **axis_font)
plt.title("Monthly Income ($)", fontsize = 14, fontweight = "semibold")
plt.show()

In [None]:
# Bin edges for the salary hike percentage and the label of each resulting bin
raise_bin_edges = [10, 15, 20, 25]
raise_bin_labels = ['10-15%', '15-20%', '20-25%']

# Bucket PercentSalaryHike into the labelled ranges and store it as a new column
employee_churn['raise_category'] = pd.cut(employee_churn['PercentSalaryHike'],
                                          bins = raise_bin_edges,
                                          labels = raise_bin_labels)

In [None]:
# White seaborn theme on a 14x8 canvas
sns.set_style('white')
plt.figure(figsize = (14, 8))

# Shared font settings for both axis labels
axis_font = {"fontsize": 12, "fontweight": "semibold"}

# One dot per employee: monthly income within each raise bracket,
# coloured by attrition outcome
sns.swarmplot(data = employee_churn,
              x = "raise_category",
              y = "MonthlyIncome",
              hue = "Attrition",
              # Bracket order along the x-axis
              order = ['10-15%', '15-20%', '20-25%'],
              # Colours assigned to the hue categories
              palette = ['r','g'],
              # Dot size and transparency
              size = 4,
              alpha = 0.8)

# Drop the top and right spines
sns.despine()

# Final formatting touches
plt.xlabel("Salary hike percentage (%)", **axis_font)
plt.ylabel("Monthly Income ($)", **axis_font)
plt.title("Monthly Income ($)", fontsize = 14, fontweight = "semibold")
plt.show()

In [None]:
# Set initial plot options
sns.set_style('white')
plt.figure(figsize = (18, 8))

# Create a lineplot.
# x is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare 'YearsAtCompany' would be taken as the dataset.
sns.lineplot(x = 'YearsAtCompany', 
             y = "MonthlyIncome", 
             # Set hue and linestyle by Attrition
             hue = "Attrition", 
             style = "Attrition", 
             data = employee_churn,
             # Change colors for each hue category
             palette = ["r","g"], 
             # Add points to lineplot and remove confidence interval
             markers = True, ci = None)

# Add text to elaborate on plot insights (position is in data coordinates)
plt.text(25, 7500, 
         # Add text and style it
         "At almost all levels of experience, monthly income is less for churners", 
         fontsize = 10, style='italic', 
         # Add red bounding box around text
         bbox={'facecolor': 'red', 'alpha': 0.5, 'pad': 10})


# Despine plot
sns.despine()
# Final formatting
plt.xlabel("Total Working Years at the Company", fontsize = 12, fontweight = "semibold")
plt.ylabel("Monthly Income ($)", fontsize = 12, fontweight = "semibold")
plt.title("Monthly Income by Years of Experience at Company", fontsize = 14, fontweight = "semibold")
plt.show()