# EEMP - Python Quiz

1. Import the following packages and alias them as we did before: 
    - *numpy, pandas, matplotlib, seaborn, statsmodels*

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

2. Read in the employee dataset we were using before, saving it as *employee_data*.
    - Show the first 20 rows of the dataset. 
    - Before that, set the maximum number of columns to be displayed to 200 
        - Hint: pd.set_option('display.max_columns', #cols)

In [None]:
# Raise the maximum number of displayed columns to 200 before showing any rows.
pd.set_option('display.max_columns', 200)

# Read the employee CSV from the course repository into a DataFrame.
path_to_data = "https://raw.githubusercontent.com/lemepe/EEMP/master/python_intro/Employee_data.csv"
employee_data = pd.read_csv(path_to_data)

# First 20 rows; kept as the final expression so a notebook cell displays it.
employee_data.head(20)

3. Create a new dataset consisting of only employees that work in the "Research & Development" department, saving it as *employee_data_RD*.
    - Print the shape, i.e. number of rows and columns, of this new dataset.
    - Check that all observations of employees in this dataset are indeed working in the R&D department.
        - Hint: using the *value_counts()* method is one way to check this.

In [None]:
# Keep only the rows whose Department is "Research & Development".
# .copy() makes the subset an independent DataFrame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
employee_data_RD = employee_data.loc[
    employee_data['Department'] == "Research & Development"
].copy()

# Shape of the subset as (number of rows, number of columns).
print(employee_data_RD.shape)

# Sanity check: "Research & Development" should be the only department listed.
employee_data_RD['Department'].value_counts()

4. Create a new variable *dummy_retained*, which indicates whether the employee is still with the employer.
    - Hint: Attrition == 'No'

In [None]:
# dummy_retained is 1 when Attrition == 'No' (employee still with the employer)
# and 0 otherwise. The comparison is vectorised and cast to int in one step.
# assign() returns a new DataFrame instead of writing into a filtered subset,
# which avoids pandas' SettingWithCopyWarning; re-running the cell simply
# overwrites the column.
employee_data_RD = employee_data_RD.assign(
    dummy_retained=(employee_data_RD['Attrition'] == 'No').astype(int)
)

employee_data_RD.shape  # check the new shape (+1 column)

5. Check the descriptive statistics of the *employee_data_RD* dataset.
    - What is the min, max and mean monthly income of employees within this dataset?
    - What is the mean job satisfaction and what is its standard deviation?
    - How many employees within this dataset have left their employer?

In [None]:
# Descriptive statistics of the R&D subset, as produced by DataFrame.describe().
descriptive_stats = employee_data_RD.describe()
descriptive_stats

# Answer 1 (monthly income): min: 1009, max: 19999, mean: 6281
# Answer 2 (job satisfaction): mean: 2.73, std: 1.1 (range 1-4)

In [None]:
# Count employees per value of dummy_retained (1 = retained, 0 = left).
employee_data_RD.dummy_retained.value_counts()

# Answer 3: 133 employees have left their employer

6. Visualize
    - The distribution of job satisfaction within the dataset.
    - The distribution of job satisfaction within the dataset, separately for those who work overtime and those who do not.
        - Hint: *OverTime == 'Yes'*
    - The correlation between job satisfaction and monthly income (also including a regression line).

In [None]:
# Distribution of job satisfaction
# with matplotlib
# Unit-wide bins running from 0.5 to 4.5, so each tick value 1-4 sits in the
# middle of its own bin; density=True normalises the histogram.
plt.hist(
    employee_data_RD['JobSatisfaction'],
    bins=[0.5, 1.5, 2.5, 3.5, 4.5],
    density=True,
    align='mid',
    rwidth=0.8,
    alpha=0.5,
)
plt.xlabel('Job Satisfaction')
plt.xticks([1, 2, 3, 4])

In [None]:
# with seaborn
# sns.distplot is deprecated (since seaborn 0.11). histplot with kde=True and
# stat='density' draws the equivalent normalised histogram plus density curve.
# The x-axis label that distplot took via `axlabel` is set on the returned Axes.
sns.histplot(
    employee_data_RD['JobSatisfaction'],
    kde=True,
    stat='density',
).set_xlabel('Job Satisfaction')

In [None]:
# Distribution of job satisfaction, separately with/without overtime
# with matplotlib
# One normalised histogram per overtime group, drawn on the same axes.
# The 'Yes' bin edges are shifted by 0.25 relative to 'No', which offsets its
# bars (presumably so both groups stay visible).
for overtime_flag, bin_edges in (
    ('No', [0.5, 1.5, 2.5, 3.5, 4.5]),
    ('Yes', [0.75, 1.75, 2.75, 3.75, 4.75]),
):
    plt.hist(
        employee_data_RD.loc[employee_data_RD['OverTime'] == overtime_flag, 'JobSatisfaction'],
        bins=bin_edges,
        rwidth=0.5,
        align='mid',
        density=True,
        alpha=0.5,
    )
plt.xlabel('Job Satisfaction')
plt.xticks([1, 2, 3, 4])

In [None]:
# with seaborn
# Uses the R&D subset (employee_data_RD), matching the matplotlib version of
# this plot; the previous code plotted the full employee_data by mistake.
# sns.distplot is deprecated (since seaborn 0.11), so both groups are drawn
# with a single histplot call split by hue. common_norm=False normalises each
# overtime group on its own, as two separate density plots would.
sns.histplot(
    data=employee_data_RD,
    x='JobSatisfaction',
    hue='OverTime',
    hue_order=['No', 'Yes'],
    stat='density',
    common_norm=False,
    kde=True,
).set_xlabel('Job satisfaction')

In [None]:
# Correlation job satisfaction and monthly income

# Scatter plot of MonthlyIncome against JobSatisfaction for the R&D subset,
# with seaborn's fitted regression line drawn on top.
sns.regplot(
    data=employee_data_RD,
    x='JobSatisfaction',
    y='MonthlyIncome',
)

7. Run a regression with your new variable *dummy_retained* as the response variable and your choice of potentially meaningful input variables (e.g. monthly income, overtime, job satisfaction etc.) to predict whether an employee will stay with her employer.
    - Hint: Use *smf.logit()* as the response variable is discrete.

In [None]:
# Logit regression to predict whether an employee stays
# Right-hand-side terms of the model formula; C(...) marks a term as
# categorical in the formula syntax.
logit_terms = [
    'MonthlyIncome',
    'C(JobSatisfaction)',
    'C(WorkLifeBalance)',
    'C(OverTime)',
    'TrainingTimesLastYear',
    'C(MaritalStatus)',
    'C(Gender)',
]
logit_formula = 'dummy_retained ~ ' + ' + '.join(logit_terms)

# Build the model on the R&D subset, then fit it.
logit_model = smf.logit(logit_formula, data=employee_data_RD)
results_logit = logit_model.fit()

print(results_logit.summary())