In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Посмотрим на данные

In [None]:
# Load the training set, then discard the index column that was saved into the CSV.
data = pd.read_csv("./employee_leave_train.csv")
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,Образование,Год начала работы,Город,Уровень оплаты,Возраст,Пол,Отстранения,Опыт,Увольнение
0,PHD,2013,New Delhi,3,30,Male,No,5,0
1,Bachelors,2018,Bangalore,3,25,Male,No,3,1
2,Masters,2017,New Delhi,3,26,Male,No,4,1
3,PHD,2012,Bangalore,1,38,Male,No,5,0
4,PHD,2018,New Delhi,3,27,Female,No,5,1


Посмотрим на признаки

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4153 entries, 0 to 4152
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Образование        4153 non-null   object
 1   Год начала работы  4153 non-null   int64 
 2   Город              4153 non-null   object
 3   Уровень оплаты     4153 non-null   int64 
 4   Возраст            4153 non-null   int64 
 5   Пол                4153 non-null   object
 6   Отстранения        4153 non-null   int64 
 7   Опыт               4153 non-null   int64 
 8   Увольнение         4153 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 292.1+ KB


In [None]:
# Collect the names of all columns whose dtype is 'object'.
categorial_columns = data.columns[data.dtypes == 'object'].tolist()

In [None]:
# Display the list of object-dtype column names collected above.
categorial_columns

['Образование', 'Город', 'Пол']

In [None]:
# Frequency of each distinct value in the education column.
data['Образование'].value_counts()

Bachelors    3204
Masters       791
PHD           158
Name: Образование, dtype: int64

In [None]:
# Frequency of each distinct value in the city column.
data['Город'].value_counts()

Bangalore    1969
Pune         1130
New Delhi    1054
Name: Город, dtype: int64

In [None]:
# Frequency of each distinct value in the gender column.
data['Пол'].value_counts()

Male      2481
Female    1672
Name: Пол, dtype: int64

In [None]:
# Frequency of each distinct value in the suspension column.
# NOTE(review): the saved output lists 0/1, so this cell was apparently last run
# after the encoding cell below; on a fresh top-to-bottom run it would presumably
# list the raw labels shown by data.head() instead -- confirm by re-running.
data['Отстранения'].value_counts()

0    3721
1     432
Name: Отстранения, dtype: int64

Преобразуем признак «Отстранения» в бинарный вид (Yes → 1, No → 0)

In [None]:
# Encode the suspension flag as 0/1 ('Yes' -> 1, anything else -> 0).
# The previous `1 if x == 'Yes' else 0` lambda was not idempotent: when the cell
# ran a second time the column was already numeric, nothing equalled 'Yes', and
# every value was silently reset to 0. Accepting an existing 1 as positive too
# makes the cell safe to re-run.
data['Отстранения'] = (
    data['Отстранения'].eq('Yes') | data['Отстранения'].eq(1)
).astype(int)
data.head()

Unnamed: 0,Образование,Год начала работы,Город,Уровень оплаты,Возраст,Пол,Отстранения,Опыт,Увольнение
0,PHD,2013,New Delhi,3,30,Male,0,5,0
1,Bachelors,2018,Bangalore,3,25,Male,0,3,1
2,Masters,2017,New Delhi,3,26,Male,0,4,1
3,PHD,2012,Bangalore,1,38,Male,0,5,0
4,PHD,2018,New Delhi,3,27,Female,0,5,1


In [None]:
# Recompute the object-dtype column names and show them.
categorial_columns = data.columns[data.dtypes == 'object'].tolist()
categorial_columns

['Образование', 'Город', 'Пол']

Бинаризуем оставшиеся категориальные признаки

In [None]:
# One-hot encode the remaining string columns.
# The second positional parameter of pd.get_dummies is `prefix`, not `columns`.
# The earlier positional call therefore passed the list as prefixes and only
# worked because it matched the object-dtype columns one-to-one. Passing it by
# keyword encodes exactly the intended columns (default prefixes are the column
# names, so the resulting column names are unchanged).
data = pd.get_dummies(data, columns=categorial_columns)
data.head()

Unnamed: 0,Год начала работы,Уровень оплаты,Возраст,Отстранения,Опыт,Увольнение,Образование_Bachelors,Образование_Masters,Образование_PHD,Город_Bangalore,Город_New Delhi,Город_Pune,Пол_Female,Пол_Male
0,2013,3,30,0,5,0,0,0,1,0,1,0,0,1
1,2018,3,25,0,3,1,1,0,0,1,0,0,0,1
2,2017,3,26,0,4,1,0,1,0,0,1,0,0,1
3,2012,1,38,0,5,0,0,0,1,1,0,0,0,1
4,2018,3,27,0,5,1,0,0,1,0,1,0,1,0


Поделим данные на признаки (X) и целевую переменную (y)

In [None]:
# Feature matrix: every column except the target; target vector: the target column.
X = data.drop(columns=['Увольнение'])
y = data['Увольнение']

Строим модель

In [None]:
from sklearn import ensemble #, classifier_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
# Random forest: 100 trees, depth capped at 10, at least 7 samples to split a node.
# random_state is fixed so that repeated runs build the same forest and therefore
# produce the same predictions (without it every run gives a different model).
rf = ensemble.RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=7,
    random_state=42,
)

In [None]:
# Fit the random forest on the full training feature matrix and target.
# NOTE(review): the saved output of this cell shows a GradientBoostingClassifier
# repr, which does not match `rf` as defined above -- it looks stale; re-run to refresh.
rf.fit(X,y)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=10,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

Предобрабатываем тестовые данные

In [None]:
# Load the test set, then discard the index column that was saved into the CSV.
test_data = pd.read_csv('./employee_leave_test.csv')
test_data = test_data.drop(columns=['Unnamed: 0'])
test_data.head()

Unnamed: 0,Образование,Год начала работы,Город,Уровень оплаты,Возраст,Пол,Отстранения,Опыт
0,Masters,2017,New Delhi,2,36,Male,Yes,2
1,Masters,2015,Bangalore,3,27,Female,No,5
2,Masters,2017,New Delhi,2,33,Male,No,2
3,Bachelors,2015,Bangalore,1,25,Female,Yes,3
4,Bachelors,2015,Pune,3,24,Female,No,2


In [None]:
# Summarise the test frame: column names, non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Образование        500 non-null    object
 1   Год начала работы  500 non-null    int64 
 2   Город              500 non-null    object
 3   Уровень оплаты     500 non-null    int64 
 4   Возраст            500 non-null    int64 
 5   Пол                500 non-null    object
 6   Отстранения        500 non-null    object
 7   Опыт               500 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 31.4+ KB


In [None]:
# Frequency of each distinct value in the test set's education column.
test_data['Образование'].value_counts()

Bachelors    397
Masters       82
PHD           21
Name: Образование, dtype: int64

In [None]:
# Frequency of each distinct value in the test set's city column.
test_data['Город'].value_counts()

Bangalore    259
Pune         138
New Delhi    103
Name: Город, dtype: int64

In [None]:
# Frequency of each distinct value in the test set's suspension column.
test_data['Отстранения'].value_counts()

No     454
Yes     46
Name: Отстранения, dtype: int64

In [None]:
# Frequency of each distinct value in the test set's gender column.
test_data['Пол'].value_counts()

Male      297
Female    203
Name: Пол, dtype: int64

In [None]:
# Encode the suspension flag as 0/1 ('Yes' -> 1, anything else -> 0).
# The previous `1 if x == 'Yes' else 0` lambda was not idempotent: when the cell
# ran a second time the column was already numeric, nothing equalled 'Yes', and
# every value was silently reset to 0. Accepting an existing 1 as positive too
# makes the cell safe to re-run.
test_data['Отстранения'] = (
    test_data['Отстранения'].eq('Yes') | test_data['Отстранения'].eq(1)
).astype(int)
test_data.head()

Unnamed: 0,Образование,Год начала работы,Город,Уровень оплаты,Возраст,Пол,Отстранения,Опыт
0,Masters,2017,New Delhi,2,36,Male,1,2
1,Masters,2015,Bangalore,3,27,Female,0,5
2,Masters,2017,New Delhi,2,33,Male,0,2
3,Bachelors,2015,Bangalore,1,25,Female,1,3
4,Bachelors,2015,Pune,3,24,Female,0,2


In [None]:
# Collect the names of the test frame's object-dtype columns and show them.
categorial_columns = test_data.columns[test_data.dtypes == 'object'].tolist()
categorial_columns

['Образование', 'Город', 'Пол']

In [None]:
# One-hot encode the string columns of the test set.
# The second positional parameter of pd.get_dummies is `prefix`, not `columns`.
# The earlier positional call therefore passed the list as prefixes and only
# worked because it matched the object-dtype columns one-to-one. Passing it by
# keyword encodes exactly the intended columns (default prefixes are the column
# names, so the resulting column names are unchanged).
test_data = pd.get_dummies(test_data, columns=categorial_columns)
test_data.head()

Unnamed: 0,Год начала работы,Уровень оплаты,Возраст,Отстранения,Опыт,Образование_Bachelors,Образование_Masters,Образование_PHD,Город_Bangalore,Город_New Delhi,Город_Pune,Пол_Female,Пол_Male
0,2017,2,36,1,2,0,1,0,0,1,0,0,1
1,2015,3,27,0,5,0,1,0,1,0,0,1,0
2,2017,2,33,0,2,0,1,0,0,1,0,0,1
3,2015,1,25,1,3,1,0,0,1,0,0,1,0
4,2015,3,24,0,2,1,0,0,0,0,1,1,0


In [None]:
# Compare the (rows, columns) shapes of the train and test frames.
# NOTE(review): the saved output reports 9 and 8 columns, which matches the frames
# before one-hot encoding rather than after it -- the output looks stale; re-run.
data.shape, test_data.shape

((4153, 9), (500, 8))

Получаем предсказания моделей

In [None]:
# Predict the target for the preprocessed test rows with the fitted random forest.
y_test_rf = rf.predict(test_data)

In [None]:
# Display the predicted labels.
y_test_rf

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [None]:
# Wrap the random-forest predictions in a one-column DataFrame and save it as CSV.
# to_csv is called with its defaults, so the row index and a header line are written too.
df = pd.DataFrame(y_test_rf)
df.to_csv('test_ans.csv')

In [None]:
import csv

In [None]:
# Write the random-forest predictions to CSV: one value per row, no header line.
# Two fixes:
#   * the loop referenced `y_test_gb`, which is not defined anywhere in the
#     visible notebook (NameError on a fresh kernel); the predictions computed
#     above are stored in `y_test_rf`;
#   * the file was written as 'outputFile1.csv' while the following cell reads
#     './outputFile.csv', so the name now matches what is read back.
with open('outputFile.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerows([prediction] for prediction in y_test_rf)

In [None]:
# Read the saved predictions back for a quick look.
# header=None is required: the saved output shows a column named '0' followed by
# 1, 0, 0, 1, 0, while the prediction array starts 0, 1, 0, 0, 1 -- i.e. the file
# has no header row and pandas was consuming the first prediction as the column
# name, silently dropping it from the data.
data1 = pd.read_csv("./outputFile.csv", header=None)
data1.head()

Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0


In [None]:
# Save the frame as answers.csv. to_csv is called with its defaults, so the row
# index is written as an extra first column alongside the values.
data1.to_csv('answers.csv')

In [None]:
# Sanity check: read answers.csv back and preview the first rows.
data2 = pd.read_csv("./answers.csv")
data2.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1


# Пробуем CatBoost

In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.0-cp37-none-manylinux1_x86_64.whl (76.4 MB)
[K     |████████████████████████████████| 76.4 MB 22 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.0


In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# CatBoost classifier: 500 boosting iterations, depth-3 trees, learning rate 0.1, log-loss.
# `categorial_columns` holds the names of the raw string columns, but X was
# one-hot encoded earlier in the notebook, so those names are no longer columns
# of X and cannot be resolved as cat_features on a top-to-bottom run. Only names
# that are actually present in X are passed; if X still contains the raw columns
# they are used exactly as before.
model = CatBoostClassifier(iterations=500,
                           depth=3,
                           learning_rate=0.1,
                           loss_function='Logloss',
                           cat_features=[col for col in categorial_columns if col in X.columns],
                           verbose=True)
# train model
model.fit(X, y)

0:	learn: 0.6428059	total: 4.47ms	remaining: 2.23s
1:	learn: 0.6058441	total: 8.65ms	remaining: 2.15s
2:	learn: 0.5781230	total: 12.8ms	remaining: 2.12s
3:	learn: 0.5538948	total: 17.3ms	remaining: 2.15s
4:	learn: 0.5293932	total: 21.2ms	remaining: 2.09s
5:	learn: 0.5149518	total: 24.9ms	remaining: 2.05s
6:	learn: 0.5047294	total: 28.4ms	remaining: 2s
7:	learn: 0.4947730	total: 32ms	remaining: 1.97s
8:	learn: 0.4808458	total: 35.5ms	remaining: 1.94s
9:	learn: 0.4703473	total: 39.2ms	remaining: 1.92s
10:	learn: 0.4596403	total: 42.6ms	remaining: 1.89s
11:	learn: 0.4547651	total: 46.7ms	remaining: 1.9s
12:	learn: 0.4506204	total: 50ms	remaining: 1.87s
13:	learn: 0.4453996	total: 53.6ms	remaining: 1.86s
14:	learn: 0.4400621	total: 57.3ms	remaining: 1.85s
15:	learn: 0.4340047	total: 60.6ms	remaining: 1.83s
16:	learn: 0.4305367	total: 64.1ms	remaining: 1.82s
17:	learn: 0.4266538	total: 67.5ms	remaining: 1.81s
18:	learn: 0.4247521	total: 71.3ms	remaining: 1.8s
19:	learn: 0.4217338	total: 75m

<catboost.core.CatBoostClassifier at 0x7fb461019250>

In [None]:
# Predict the target for the preprocessed test rows with the fitted CatBoost model.
catboost_ans = model.predict(test_data)

In [None]:
# Wrap the CatBoost predictions in a DataFrame and save it as CSV.
# to_csv is called with its defaults, so the row index and a header line are written too.
df = pd.DataFrame(catboost_ans)
df.to_csv('catboost_ans.csv')

In [None]:
# Dump the CatBoost predictions to CSV: one value per row, no header line.
with open('catboost_answers.csv', 'w', newline='') as out_file:
    csv.writer(out_file, delimiter=',').writerows(
        [label] for label in catboost_ans
    )

In [None]:
# Read the CatBoost predictions back for a quick look.
# 'catboost_answers.csv' is written by csv.writer with no header row, so
# header=None is required; without it pandas takes the first prediction as the
# column name and drops it from the data.
data1 = pd.read_csv("./catboost_answers.csv", header=None)
data1.head()

Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0
