In [1]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load every competition table from the local `data/` directory.
# The label column DEBT of the training table is renamed to `target`.
train = pd.read_csv('data/train.csv').rename(columns={'DEBT': 'target'})
sample_sub = pd.read_csv('data/sample_submission.csv')
disc = pd.read_csv('data/comp_disc.csv')
marks = pd.read_csv('data/comp_marks.csv')
portrait = pd.read_csv('data/comp_portrait.csv')
# `date_parser` takes a callable (not a column name), has no effect unless
# `parse_dates` is given, and is deprecated in pandas 2.x. `parse_dates` is the
# option that actually converts DATE_START to datetimes.
students = pd.read_csv('data/comp_students.csv', parse_dates=['DATE_START'])
teachers = pd.read_csv('data/comp_teachers.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# Fitted LabelEncoders keyed by the kind of value they encode, kept so the
# integer codes can be mapped back to the raw values if needed.
all_dicts = {}

# Student ids. The encoder is fitted on `portrait` only, so `transform` will
# raise on any student id of the other tables that is missing from `portrait`.
all_dicts['student_id'] = LabelEncoder().fit(portrait['ISU'])
for frame in [marks, train, students, portrait, test]:
    frame['ISU'] = all_dicts['student_id'].transform(frame['ISU'])
    frame.rename({'ISU': 'student_id'}, axis=1, inplace=True)

# Discipline ids: fitted on the union of ids seen in teachers, disc and test.
all_dicts['disc_id'] = LabelEncoder().fit(np.concatenate((
    teachers['DISC_ID'].unique(),
    disc['DISC_ID'].unique(),
    test['DISC_ID'].unique(),
)))
for frame in [train, disc, teachers, marks, test]:
    frame['DISC_ID'] = all_dicts['disc_id'].transform(frame['DISC_ID'])

# Study plans: `disc` calls the column PLAN_ID, the other tables MAIN_PLAN.
all_dicts['plan'] = LabelEncoder().fit(np.concatenate((
    disc['PLAN_ID'].unique(),
    students['MAIN_PLAN'].unique(),
)))
disc['PLAN_ID'] = all_dicts['plan'].transform(disc['PLAN_ID'])
for frame in [marks, students, teachers]:
    frame['MAIN_PLAN'] = all_dicts['plan'].transform(frame['MAIN_PLAN'])

# Teacher ids: `teachers['ISU']` and `marks['PRED_ID']` share one encoder.
all_dicts['teacher'] = LabelEncoder().fit(np.concatenate((
    teachers['ISU'].unique(),
    marks['PRED_ID'].unique(),
)))
teachers['ISU'] = all_dicts['teacher'].transform(teachers['ISU'])
marks['PRED_ID'] = all_dicts['teacher'].transform(marks['PRED_ID'])

# Assessment type: fitted on train, applied to train and test.
all_dicts['type_name'] = LabelEncoder().fit(train['TYPE_NAME'])
train['TYPE_NAME'] = all_dicts['type_name'].transform(train['TYPE_NAME'])
test['TYPE_NAME'] = all_dicts['type_name'].transform(test['TYPE_NAME'])

all_dicts['gender'] = LabelEncoder().fit(portrait['GENDER'])
portrait['GENDER'] = all_dicts['gender'].transform(portrait['GENDER'])

all_dicts['exam_type'] = LabelEncoder().fit(portrait['EXAM_TYPE'])
portrait['EXAM_TYPE'] = all_dicts['exam_type'].transform(portrait['EXAM_TYPE'])

# Composite "<discipline>_<student>" string key.
# Disabled experiment:
# train['DISC+SEMESTER'] = train['DISC_ID'].astype('str')+'_'+train['SEMESTER'].astype('str')
train['DISC+student'] = train['DISC_ID'].astype('str') + '_' + train['student_id'].astype('str')
test['DISC+student'] = test['DISC_ID'].astype('str') + '_' + test['student_id'].astype('str')

# Textual marks are mapped onto the numeric scale (as strings, like the
# numeric marks in the raw column). Presumably: pass / no-show / fail /
# exempt — TODO confirm the translations with the data owner.
MARK_MAP = {'зачет': '5', 'неявка': '2', 'незач': '2', 'осв': '3'}
marks['MARK'] = marks['MARK'].replace(MARK_MAP)
# Convert to a real numeric dtype (NaN stays NaN). Writing ints back into the
# object column left MARK as object dtype, so later groupby means depended on
# the slow object-dtype fallback.
train['MARK'] = pd.to_numeric(train['MARK'].replace(MARK_MAP))
# Disabled experiments:
# marks = marks.drop(['PRED_ID', 'MAIN_PLAN'], axis=1)
# train = train.drop(['TYPE_NAME'], axis=1)

In [4]:
# tmp = train.groupby('DISC_ID')['target'].mean().to_frame()
# hard_disc = tmp[tmp['target']>0].index.tolist()
# train = train[train['DISC_ID'].isin(hard_disc)]

In [5]:
# tmp = train.groupby('student_id')['target'].max().to_frame()
# need_students = tmp[tmp['target']==1].index.tolist()
# train = train[train['student_id'].isin(need_students)]

In [6]:
# Preview the first rows of every loaded table, each preceded by its label.
for label, frame in [
    ('TRAIN', train),
    ('DISC', disc),
    ('MARKS', marks),
    ('STUDENTS', students),
    ('PORTRAIT', portrait),
    ('TEACHERS', teachers),
    ('TEST', test),
]:
    display(label, frame.head())

'TRAIN'

Unnamed: 0,student_id,ST_YEAR,SEMESTER,DISC_ID,TYPE_NAME,MARK,target,DISC+student
0,8197,2020,1,2659,1,5,0,2659_8197
1,8197,2020,1,400,1,5,0,400_8197
2,8197,2020,1,2406,1,5,0,2406_8197
3,8197,2020,1,2245,1,5,0,2245_8197
4,8197,2020,1,4523,3,3,0,4523_8197


'DISC'

Unnamed: 0,PLAN_ID,DISC_ID,СHOICE,SEMESTER,DISC_NAME,DISC_DEP,KEYWORD_NAMES
0,647,3190,1.0,6.0,Методы криптографии,8139328495281102994,"криптография, криптография, криптография, клас..."
1,647,2105,1.0,6.0,Анализ социальных сетей,8139328495281102994,"технологии работы информацией, алгоритмы обраб..."
2,647,2985,1.0,6.0,Эффективное участие в научных и практических к...,1687590892619214362,
3,647,4614,1.0,6.0,Управление карьерой,1687590892619214362,
4,647,4511,1.0,6.0,Обработка изображений,8139328495281102994,"классическое машинное обучение, генерация изоб..."


'MARKS'

Unnamed: 0,student_id,ST_YEAR,SEMESTER,TYPE_NAME,MARK,MAIN_PLAN,DISC_ID,PRED_ID
0,8197,2020,1,Зачет,5,234,2659,390
1,8197,2020,1,Зачет,5,234,400,390
2,8197,2020,1,Зачет,5,234,2406,390
3,8197,2020,1,Зачет,5,234,2245,390
4,8197,2020,1,Экзамен,3,234,4523,390


'STUDENTS'

Unnamed: 0,student_id,KURS,DATE_START,DATE_END,PRIZNAK,MAIN_PLAN
0,1854,2,2017-09-01,2018-08-31 00:00:00,обучен,82
1,1854,3,2018-09-01,2018-11-02 00:00:00,обучен,82
2,1854,3,2018-11-03,2019-08-31 00:00:00,обучен,82
3,1854,4,2019-09-01,2020-07-06 00:00:00,обучен,82
4,2727,2,2017-09-01,2018-08-31 00:00:00,обучен,101


'PORTRAIT'

Unnamed: 0,student_id,GENDER,CITIZENSHIP,EXAM_TYPE,EXAM_SUBJECT_1,EXAM_SUBJECT_2,EXAM_SUBJECT_3,ADMITTED_EXAM_1,ADMITTED_EXAM_2,ADMITTED_EXAM_3,ADMITTED_SUBJECT_PRIZE_LEVEL,REGION_ID
0,4024,0,15601729049989747827,1,70786669040476600,5533732657842394915,8388269026169219461,88.0,91.0,81.0,ЕГЭ,4877310761925081124
1,8506,1,15601729049989747827,0,70786669040476600,5533732657842394915,8388269026169219461,84.0,89.0,96.0,ЕГЭ,4877310761925081124
2,8187,1,15601729049989747827,0,70786669040476600,5533732657842394915,8388269026169219461,66.0,78.0,93.0,ЕГЭ,4877310761925081124
3,1854,1,15601729049989747827,0,70786669040476600,5533732657842394915,8388269026169219461,100.0,89.0,80.0,ЕГЭ,4877310761925081124
4,2727,1,15601729049989747827,0,70786669040476600,5533732657842394915,8388269026169219461,84.0,86.0,91.0,ЕГЭ,4877310761925081124


'TEACHERS'

Unnamed: 0,ISU,GENDER,DATE_BIRTH,ST_YEAR,SEMESTER,DISC_ID,MAIN_PLAN,TYPE_NAME,MARK
0,843,М,1975,2020/2021,4.0,3346,511,Дифференцированный зачет,3.471698
1,843,М,1975,2018/2019,2.0,2281,780,Дифференцированный зачет,4.243902
2,843,М,1975,2019/2020,4.0,1224,165,Экзамен,4.652174
3,843,М,1975,2018/2019,2.0,631,165,Дифференцированный зачет,4.266667
4,843,М,1975,2018/2019,2.0,631,165,Курсовой проект,4.266667


'TEST'

Unnamed: 0,student_id,ST_YEAR,DISC_ID,TYPE_NAME,DISC+student
0,8,2021,134,3,134_8
1,8,2021,1406,1,1406_8
2,8,2021,1911,1,1911_8
3,8,2021,2245,1,2245_8
4,8,2021,2279,1,2279_8


In [7]:
# SUM_EXAMS = sum of the three admission exam scores, filled only for rows
# where all three scores are strictly positive (other rows stay NaN).
exam_cols = ['ADMITTED_EXAM_1', 'ADMITTED_EXAM_2', 'ADMITTED_EXAM_3']
has_all_scores = portrait[exam_cols].gt(0).all(axis=1)
scored = portrait[has_all_scores]
portrait.loc[has_all_scores, 'SUM_EXAMS'] = (
    scored['ADMITTED_EXAM_1'] + scored['ADMITTED_EXAM_2'] + scored['ADMITTED_EXAM_3']
)
# Rows with encoded EXAM_TYPE == 2 get the maximum total of 300.
# NOTE(review): 2 is a LabelEncoder code assigned earlier in the notebook; which
# raw exam type it stands for is not visible here — TODO confirm.
portrait.loc[portrait['EXAM_TYPE'].eq(2), 'SUM_EXAMS'] = 300
del exam_cols, has_all_scores, scored

In [8]:
# Per-student aggregates over the training rows: mean and count of `target`
# and `MARK`, flattened to columns named student_<column>_<stat>.
students_info = train.groupby('student_id')[['target', 'MARK']].agg(['mean', 'count'])
students_info.columns = [f"student_{field}_{stat}" for field, stat in students_info.columns]

# Per-discipline aggregates: mean `MARK` and mean `target`, flattened to
# columns named disc_<column>_mean.
disc_info = train.groupby('DISC_ID')[['MARK', 'target']].agg(['mean'])
disc_info.columns = [f"disc_{field}_{stat}" for field, stat in disc_info.columns]

In [9]:
# Attach the per-discipline and then the per-student aggregates to every
# training row (left joins against the aggregate tables' indexes).
train = train.merge(disc_info, how='left', left_on='DISC_ID', right_index=True)
train = train.merge(students_info, how='left', left_on='student_id', right_index=True)

In [10]:
# For each discipline, describe the students who have a debt in it
# (target == 1): mean and count of their per-student target / MARK means.
disc_bad_stud_info = (
    train.loc[train['target'].eq(1)]
    .groupby('DISC_ID')[['student_target_mean', 'student_MARK_mean']]
    .agg(['mean', 'count'])
)
disc_bad_stud_info.columns = [
    f"disc_bad_stud_{field}_{stat}" for field, stat in disc_bad_stud_info.columns
]

# Same statistics for the students without a debt in the discipline (target == 0).
disc_good_stud_info = (
    train.loc[train['target'].eq(0)]
    .groupby('DISC_ID')[['student_target_mean', 'student_MARK_mean']]
    .agg(['mean', 'count'])
)
disc_good_stud_info.columns = [
    f"disc_good_stud_{field}_{stat}" for field, stat in disc_good_stud_info.columns
]

In [11]:
# For each student, describe the disciplines in which they have a debt
# (target == 1): mean and count of the per-discipline MARK / target means.
student_disc_bad_info = (
    train.loc[train['target'].eq(1)]
    .groupby('student_id')[['disc_MARK_mean', 'disc_target_mean']]
    .agg(['mean', 'count'])
)
student_disc_bad_info.columns = [
    f"student_disc_bad_info_{field}_{stat}" for field, stat in student_disc_bad_info.columns
]

# Same statistics over the disciplines without a debt (target == 0).
student_disc_good_info = (
    train.loc[train['target'].eq(0)]
    .groupby('student_id')[['disc_MARK_mean', 'disc_target_mean']]
    .agg(['mean', 'count'])
)
student_disc_good_info.columns = [
    f"student_disc_good_info_{field}_{stat}" for field, stat in student_disc_good_info.columns
]

In [12]:
# Left-join the four bad/good statistic tables onto the training rows, in the
# same order as the feature columns are expected downstream.
for stats, key in [
    (disc_bad_stud_info, 'DISC_ID'),
    (disc_good_stud_info, 'DISC_ID'),
    (student_disc_bad_info, 'student_id'),
    (student_disc_good_info, 'student_id'),
]:
    train = train.merge(stats, how='left', left_on=key, right_index=True)

In [13]:
# Modelling table: drop the identifiers, the time columns and the raw MARK;
# what remains is TYPE_NAME, the engineered aggregates and the label.
train_data = train.drop(
    columns=['student_id', 'ST_YEAR', 'SEMESTER', 'DISC_ID', 'MARK', 'DISC+student']
)

# Random 80/20 hold-out split.
# NOTE(review): the aggregate features were computed on all of `train`
# (including each row's own target and the rows that land in X_valid), so the
# validation F1 reported below is presumably optimistic — TODO confirm.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(columns='target'),
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

model = CatBoostClassifier(depth=4, iterations=3000, eval_metric='F1', random_seed=42)
train_pool = Pool(X_train, y_train, cat_features=['TYPE_NAME'])
valid_pool = Pool(X_valid, y_valid, cat_features=['TYPE_NAME'])
# Progress is logged every 100 iterations; the fitted model is the cell output.
model.fit(train_pool, eval_set=valid_pool, verbose=100)

Learning rate set to 0.066987
0:	learn: 0.4272879	test: 0.4251607	best: 0.4251607 (0)	total: 82.2ms	remaining: 4m 6s
100:	learn: 0.6852848	test: 0.6795367	best: 0.6807018 (99)	total: 2.16s	remaining: 1m 2s
200:	learn: 0.7204534	test: 0.7131091	best: 0.7134021 (199)	total: 4.12s	remaining: 57.3s
300:	learn: 0.7370245	test: 0.7216774	best: 0.7227857 (299)	total: 6.06s	remaining: 54.3s
400:	learn: 0.7494468	test: 0.7272727	best: 0.7281913 (395)	total: 7.89s	remaining: 51.1s
500:	learn: 0.7571976	test: 0.7280672	best: 0.7314286 (464)	total: 9.79s	remaining: 48.9s
600:	learn: 0.7618583	test: 0.7321488	best: 0.7326401 (556)	total: 11.6s	remaining: 46.5s
700:	learn: 0.7673099	test: 0.7314784	best: 0.7326401 (556)	total: 13.5s	remaining: 44.1s
800:	learn: 0.7730250	test: 0.7316583	best: 0.7329983 (785)	total: 15.3s	remaining: 41.9s
900:	learn: 0.7776789	test: 0.7328653	best: 0.7344482 (879)	total: 17.2s	remaining: 40.1s
1000:	learn: 0.7837162	test: 0.7344897	best: 0.7358239 (955)	total: 19.2s	

<catboost.core.CatBoostClassifier at 0x14b699820>

In [14]:
# Feature importances of the fitted model, most important first
# (`x` = feature name, `y` = importance).
(
    pd.DataFrame({
        'x': train_data.columns.drop('target'),
        'y': model.get_feature_importance(),
    })
    .sort_values(by='y', ascending=False)
)

Unnamed: 0,x,y
16,student_disc_bad_info_disc_MARK_mean_count,19.356334
9,disc_bad_stud_student_MARK_mean_mean,13.560696
2,disc_target_mean,9.242188
3,student_target_mean,9.111713
17,student_disc_bad_info_disc_target_mean_mean,8.424039
18,student_disc_bad_info_disc_target_mean_count,8.263108
1,disc_MARK_mean,7.191583
15,student_disc_bad_info_disc_MARK_mean_mean,5.213734
11,disc_good_stud_student_target_mean_mean,3.573455
0,TYPE_NAME,2.826322


In [15]:
# Attach the per-discipline and per-student aggregates (computed on train) to
# the test rows; keys unseen in train yield NaN features.
test = test.merge(disc_info, how='left', left_on='DISC_ID', right_index=True)
test = test.merge(students_info, how='left', left_on='student_id', right_index=True)

In [16]:
# Left-join the four bad/good statistic tables onto the test rows, in the same
# order as was used for the training table.
for stats, key in [
    (disc_bad_stud_info, 'DISC_ID'),
    (disc_good_stud_info, 'DISC_ID'),
    (student_disc_bad_info, 'student_id'),
    (student_disc_good_info, 'student_id'),
]:
    test = test.merge(stats, how='left', left_on=key, right_index=True)

In [18]:
# Probability of the positive class (column 1 of predict_proba) per test row.
# The features are selected by the names the model was fitted with, in the
# fitted order, rather than by dropping a fixed list of columns. The cell
# therefore stays re-runnable after `predictions` / `final_predictions` have
# been added to `test`, and cannot silently feed columns in the wrong order.
test['predictions'] = model.predict_proba(test[model.feature_names_])[:, 1]

In [19]:
# Replace missing probabilities with 0 (i.e. "no debt").
# Presumably a safeguard only — TODO confirm NaN can actually occur here.
test['predictions'] = test['predictions'].where(test['predictions'].notna(), 0)

In [24]:
# Binarise: a row is flagged as a debt when its probability exceeds 0.03.
# NOTE(review): the cut-off is far below 0.5; presumably tuned so the number
# of positives lands near the ~2200 mentioned in the notebook — TODO confirm.
test['final_predictions'] = test['predictions'].gt(0.03).astype(int)

In [25]:
# How many test rows fall into each predicted class.
test.loc[:, 'final_predictions'].value_counts()

0    33824
1     2425
Name: final_predictions, dtype: int64

In [26]:
# Fill the submission template with the binary predictions and write it out.
# The assignment aligns on the index, so `test` and `sample_sub` are assumed
# to share the same row index — TODO confirm.
sample_sub = sample_sub.assign(DEBT=test['final_predictions'])
sample_sub.to_csv('submit.csv', index=False)

Должно быть приблизительно 2200 единиц (строк с DEBT = 1).

In [31]:
# Back-of-the-envelope F1 for two equally long 0/1 vectors whose blocks of
# ones only partially overlap. `f1_score` and `np` come from the import cell
# at the top of the notebook.
# NOTE(review): presumably used to estimate how many predicted positives are
# correct for a given score — TODO confirm the intent and the source of
# the 2211 figure.
N_ROWS = 36249                # length of both vectors (33824 + 2425 predicted rows)
N_TRUE_POSITIVES = 2211       # ones in the simulated ground truth
N_PREDICTED_POSITIVES = 2425  # ones in the simulated prediction
pad = 1560                    # offset of the predicted block; overlap = 2211 - pad

true = np.zeros(N_ROWS)
true[:N_TRUE_POSITIVES] = 1
preds = np.zeros(N_ROWS)
preds[pad:pad + N_PREDICTED_POSITIVES] = 1
f1_score(true, preds)

0.28084555651423637