In [None]:
import pandas as pd
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: ggplot theme and a 12 x 8 default figure size.
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": [12, 8]})

In [None]:
# Read the turnover CSV into the working frame used by the exercises below.
df = pd.read_csv(filepath_or_buffer="Part 2 - Turnover.csv")

## ex 8

In [None]:
# Recode the yes/no flags as integers (yes -> 1, no -> 0) so they can be
# counted, summed and correlated in the cells below.
# Values other than 'yes'/'no' pass through the mapping unchanged, which keeps
# the cell safe to re-run once the columns already hold 0/1. The final cast
# gives the columns a real integer dtype (instead of a mixed object column) and
# fails loudly if anything unexpected is left in them.
for flag_column in ['left', 'promotion']:
    df[flag_column] = (
        df[flag_column]
        .map(lambda value: {'yes': 1, 'no': 0}.get(value, value))
        .astype(int)
    )

### 8a

In [None]:
# 8a: number of rows whose `left` flag equals 1.
int((df['left'] == 1).sum())

### 8b

In [None]:
# 8b: Pearson correlation between the `projects` and `left` columns, followed
# by its p-value, each rounded to three decimals.
corr, p = pearsonr(df['projects'].tolist(), df['left'].tolist())
for statistic in (corr, p):
    print(round(statistic, 3))

### 8c

In [None]:
# 8c: for every promotion value, how the employees split between
# left == 0 and left == 1 (absolute numbers and share within that promotion value).
# NOTE(review): the head count is the number of non-null `satisfaction` values,
# so rows with a missing satisfaction score would not be counted - confirm the
# column has no gaps.
dfpromoleft = df.groupby(['promotion', 'left'], as_index=False)[['satisfaction']].count()
dfpromoleftnew = dfpromoleft.rename(columns={'satisfaction': 'n_employees'})
# `transform('sum')` broadcasts each promotion value's total back onto its rows,
# so the share is one vectorised division - no per-group loop or concat needed.
promotion_totals = dfpromoleftnew.groupby('promotion')['n_employees'].transform('sum')
dfpromoleftnew['share'] = (dfpromoleftnew['n_employees'] / promotion_totals).round(2)
dfpromoleftnew

### 8d

In [None]:
# 8d: per department, head count (non-null `satisfaction` values) and the sum
# of the `left` flag; turnover is their ratio, listed highest first.
dfdepartment = (
    df.groupby('department', as_index=False)
    .agg(n_employees=('satisfaction', 'count'), left=('left', 'sum'))
)
dfdepartment['turnover'] = (dfdepartment['left'] / dfdepartment['n_employees']).round(4)
dfdepartment.sort_values(by='turnover', ascending=False)

## ex 9

### 9a

In [None]:
import random

In [None]:
# 9a: projects against tenure, coloured by the `left` flag.
# Gaussian noise is added to both axes before plotting - presumably so that
# points sharing the same (tenure, projects) pair do not sit exactly on top of
# each other.
mean = 0
stdev = 0.3/3  # NOTE(review): looks like "3 standard deviations ~ 0.3" - confirm intent
# A dedicated, seeded generator makes the jitter (and therefore the figure)
# identical on every Restart & Run All, without touching the global `random` state.
jitter_rng = random.Random(0)
randomrounds1 = [jitter_rng.gauss(mean, stdev) for _ in range(len(df))]
randomrounds2 = [jitter_rng.gauss(mean, stdev) for _ in range(len(df))]
df['float1'] = randomrounds1
df['float2'] = randomrounds2
df['projects_rnd'] = df['projects'] + df['float1']
df['tenure_rnd'] = df['tenure'] + df['float2']
colors = {1:'red', 0:'blue'}
plt.scatter(x=df['tenure_rnd'], y=df['projects_rnd'], c=df['left'].map(colors), marker='x', s=20, linewidth=1)
# Label the figure so it can be read without the code next to it.
plt.xlabel('tenure (jittered)')
plt.ylabel('projects (jittered)')
plt.title('Projects vs. tenure (red: left = 1, blue: left = 0)')
plt.show()

### 9b

In [None]:
# 9b: evaluation against satisfaction, coloured by the `left` flag.
# Gaussian noise is added to both axes before plotting, mirroring the 9a figure.
mean = 0
stdev = 0.3/3  # NOTE(review): looks like "3 standard deviations ~ 0.3" - confirm intent
# A dedicated, seeded generator makes the jitter (and therefore the figure)
# identical on every Restart & Run All, without touching the global `random` state.
jitter_rng = random.Random(1)
randomrounds1 = [jitter_rng.gauss(mean, stdev) for _ in range(len(df))]
randomrounds2 = [jitter_rng.gauss(mean, stdev) for _ in range(len(df))]
df['float1'] = randomrounds1
df['float2'] = randomrounds2
df['satisfaction_rnd'] = df['satisfaction'] + df['float1']
df['evaluation_rnd'] = df['evaluation'] + df['float2']
colors = {1:'red', 0:'blue'}
plt.scatter(x=df['satisfaction_rnd'], y=df['evaluation_rnd'], c=df['left'].map(colors), marker='x', s=20, linewidth=1)
# Label the figure so it can be read without the code next to it.
plt.xlabel('satisfaction (jittered)')
plt.ylabel('evaluation (jittered)')
plt.title('Evaluation vs. satisfaction (red: left = 1, blue: left = 0)')
plt.show()

## ex 10

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# `plot_confusion_matrix` was removed in scikit-learn 1.2. On newer versions
# bind the name to its documented replacement, which takes the same leading
# (estimator, X, y) arguments, so this cell imports cleanly either way.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later dropped the old names; use whichever one this installation provides.
dark_style = "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
plt.style.use(dark_style)

In [None]:
def yesnostring_to_bool(dataf, columns=['accident', 'left', 'promotion']):
    """Return a copy of ``dataf`` with yes/no columns recoded as integers.

    In each listed column ``'yes'`` becomes 1 and ``'no'`` becomes 0; any
    other value is kept as it is before the column is cast to ``int``.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Input frame. It is not modified, so the same frame can safely be fed
        into several independent ``.pipe`` chains.
    columns : list of str
        Names of the columns to recode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns as integer dtype.

    Raises
    ------
    ValueError
        If a listed column still holds something that cannot be cast to
        ``int`` after the recoding (for example a missing value).
    """
    yes_no = {'yes': 1, 'no': 0}
    recoded = dataf.copy()  # never write into the caller's frame
    for c in columns:
        # Unknown values fall through unchanged, so columns that already hold
        # 0/1 survive a second pass.
        recoded[c] = recoded[c].map(lambda value: yes_no.get(value, value)).astype(int)
    return recoded

def to_dummies(dataf, columns=['department', 'salary']):
    """Return a copy of ``dataf`` with the given columns one-hot encoded.

    For every distinct value ``v`` of a column ``c`` an integer 0/1 column
    named ``c`` followed directly by ``v`` (no separator) is appended, in
    order of first appearance of the values. The source columns themselves
    are dropped from the result.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Input frame. It is not modified, so the indicator columns cannot leak
        into other uses of the same frame.
    columns : list of str
        Names of the categorical columns to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``columns`` and with the indicator columns added.
    """
    expanded = dataf.copy()  # never write into the caller's frame
    for c in columns:
        for value in expanded[c].unique():
            # The f-string also copes with non-string category values, where
            # plain `c + value` would raise a TypeError.
            expanded[f"{c}{value}"] = (expanded[c] == value).astype(int)
    return expanded.drop(columns=columns)

def to_categories(dataf, columns=['department', 'salary']):
    """Return a copy of ``dataf`` with the given columns label-encoded.

    Each listed column is replaced by the integer codes produced by
    scikit-learn's ``LabelEncoder`` fitted on that column alone.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Input frame. It is not modified, so the original category values stay
        available to the caller.
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the listed columns hold integer codes.
    """
    encoded = dataf.copy()  # never write into the caller's frame
    for c in columns:
        # Every column gets its own freshly fitted encoder; codes are not
        # comparable across columns.
        encoded[c] = preprocessing.LabelEncoder().fit_transform(encoded[c])
    return encoded

In [None]:
# Re-read the CSV so the modelling exercises start from the file contents again.
df = pd.read_csv(filepath_or_buffer="Part 2 - Turnover.csv")

### 10a-b

In [None]:
# 10a-b: feature matrix for the logistic regression - yes/no flags as integers,
# department and salary one-hot encoded, every feature standardised.
# Work on a copy so `df` is guaranteed to stay exactly as loaded for the later
# exercises, whatever the helpers do to the frame they receive.
dfprepped = df.copy().pipe(yesnostring_to_bool).pipe(to_dummies)
# Every column except the target is a feature.
xcolumns = [c for c in dfprepped.columns if c != 'left']
X = dfprepped[xcolumns].values
y = dfprepped['left'].values
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Fit the logistic regression on all rows and predict those same rows back,
# i.e. the predictions are in-sample.
model = LogisticRegression(solver='sag', max_iter=10000, random_state=1)
model.fit(X, y)
predictions = model.predict(X)

In [None]:
# One row per observation with a constant count of 1, then summed per
# (true label, predicted label) combination.
dfpredictions = pd.DataFrame({'true': y, 'pred': predictions}).assign(count=1)
dfpredresult = dfpredictions.groupby(['true', 'pred'], as_index=False).agg({'count': 'sum'})

In [None]:
# Confusion matrix of the in-sample predictions (rows: true class, columns: predicted class).
print('-- matrix:')
metrics.confusion_matrix(y_true=y, y_pred=predictions)

In [None]:
# Accuracy reported three ways: number of correct predictions, that number as
# a fraction of all predictions, and the model's own `score` as a cross-check.
correct_rows = dfpredresult[dfpredresult['true'] == dfpredresult['pred']]
n_correct = correct_rows['count'].sum()
print('-- model score:')
print(f"n correct: {n_correct}")
print(n_correct / dfpredresult['count'].sum())
print(model.score(X, y))

## ex 11

### 11a-b

In [None]:
# 11a-b: feature matrix for the decision tree - yes/no flags as integers,
# department and salary label-encoded; no scaling is applied here.
dfprepped = to_categories(yesnostring_to_bool(df))
xcolumns = [name for name in dfprepped.columns if name != 'left']
X = dfprepped[xcolumns].to_numpy()
y = dfprepped['left'].to_numpy()

In [None]:
# Fit a depth-2 decision tree on all rows and predict those same rows back.
model = DecisionTreeClassifier(max_depth=2)
model.fit(X, y)
predictions = model.predict(X)

In [None]:
# One row per observation with a constant count of 1, then summed per
# (true label, predicted label) combination.
dfpred = pd.DataFrame({'true': y, 'pred': predictions}).assign(count=1)
dfpredresult = dfpred.groupby(['true', 'pred'], as_index=False).agg({'count': 'sum'})

In [None]:
# Confusion matrix of the in-sample predictions (rows: true class, columns: predicted class).
print('-- matrix:')
metrics.confusion_matrix(y_true=y, y_pred=predictions)

In [None]:
# Accuracy reported three ways: number of correct predictions, that number as
# a fraction of all predictions, and the model's own `score` as a cross-check.
matching = dfpredresult[dfpredresult['true'] == dfpredresult['pred']]
hits = matching['count'].sum()
print('-- model score:')
print(f"n correct: {hits}")
print(hits / dfpredresult['count'].sum())
print(model.score(X, y))

In [None]:
# Display the (true, pred) count table built from the decision-tree predictions.
dfpredresult

In [None]:
# Draw the fitted tree. The model was trained on a bare array, so without
# explicit names every split would be labelled with an anonymous X[i] index.
tree.plot_tree(
    model,
    feature_names=xcolumns,
    class_names=['stayed', 'left'],  # ascending class order: left == 0, then left == 1
    filled=True,  # shade nodes by majority class
)
plt.show()