In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier

In [2]:
# Load the competition tables from ./data.
# The DEBT column of the training set is renamed to `target`, the label
# name used by the later cells.
train = pd.read_csv('data/train.csv').rename({'DEBT': 'target'}, axis=1)
sample_sub = pd.read_csv('data/sample_submission.csv')
disc = pd.read_csv('data/comp_disc.csv')
marks = pd.read_csv('data/comp_marks.csv')
portrait = pd.read_csv('data/comp_portrait.csv')
# `date_parser` expects a callable (and is deprecated in recent pandas); given
# a column name and no `parse_dates` it parsed nothing. Listing the column in
# `parse_dates` is what actually turns DATE_START into datetimes.
students = pd.read_csv('data/comp_students.csv', parse_dates=['DATE_START'])
teachers = pd.read_csv('data/comp_teachers.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# Registry of fitted encoders, keyed by the kind of identifier they map.
# The student encoder learns its classes from the ISU column of `portrait`.
all_dicts = {'student_id': LabelEncoder().fit(portrait['ISU'])}

student_frames = [marks, train, students, portrait, test]

# First pass: replace every raw ISU value with its integer code.
for frame in student_frames:
    frame['ISU'] = all_dicts['student_id'].transform(frame['ISU'])

# Second pass: expose the encoded column under the name `student_id`.
# In-place on purpose: rebinding the loop variable would not update the
# module-level frames.
for frame in student_frames:
    frame.rename(columns={'ISU': 'student_id'}, inplace=True)

# Fit the discipline encoder on the DISC_ID values of *every* frame it is
# applied to. It used to be fitted on teachers/disc/test only, so a discipline
# that occurs solely in train or marks made `transform` fail on an unseen
# label. LabelEncoder sorts its classes, so whenever the old fit already
# covered all ids the resulting codes are exactly the same as before.
disc_frames = [train, disc, teachers, marks, test]
all_dicts['disc_id'] = LabelEncoder().fit(
    np.concatenate([frame['DISC_ID'].unique() for frame in disc_frames])
)
for frame in disc_frames:
    frame['DISC_ID'] = all_dicts['disc_id'].transform(frame['DISC_ID'])

# Study-plan ids live in disc['PLAN_ID'] and in the MAIN_PLAN column of
# marks / students / teachers. The encoder is fitted on all four columns it
# transforms; it used to see only disc and students, so a plan id found only
# in marks or teachers made `transform` fail on an unseen label. Codes are
# unchanged whenever the old fit already covered every id (classes are sorted).
plan_columns = [
    (disc, 'PLAN_ID'),
    (marks, 'MAIN_PLAN'),
    (students, 'MAIN_PLAN'),
    (teachers, 'MAIN_PLAN'),
]
all_dicts['plan'] = LabelEncoder().fit(
    np.concatenate([frame[column].unique() for frame, column in plan_columns])
)
for frame, column in plan_columns:
    frame[column] = all_dicts['plan'].transform(frame[column])

# The 'teacher' encoder is fitted on teachers['ISU'] together with
# marks['PRED_ID'] (presumably both hold teacher identifiers), so the two
# columns end up in one shared code space.
teacher_ids = np.concatenate((teachers['ISU'].unique(), marks['PRED_ID'].unique()))
teacher_encoder = LabelEncoder().fit(teacher_ids)
all_dicts['teacher'] = teacher_encoder
teachers['ISU'] = teacher_encoder.transform(teachers['ISU'])
marks['PRED_ID'] = teacher_encoder.transform(marks['PRED_ID'])

# Composite key "<DISC_ID>_<student_id>" built from the encoded ids; the same
# column is added to both the training and the test frame.
for frame in (train, test):
    frame['DISC+student'] = frame['DISC_ID'].astype('str') + '_' + frame['student_id'].astype('str')

# Textual grades and the numeric grade (as a string) each one is replaced by.
# Defined once so train and marks cannot drift apart.
MARK_ALIASES = {'зачет': '5', 'неявка': '2', 'незач': '2', 'осв': '3'}

marks['MARK'] = marks['MARK'].replace(MARK_ALIASES)

# Convert the whole training column to a numeric dtype in one step (missing
# marks stay NaN). The earlier row-wise `.loc[...] = ...astype('int')` wrote
# ints into the existing column without changing its dtype, leaving an object
# column of mixed values for the later groupby means to chew on.
train['MARK'] = pd.to_numeric(train['MARK'].replace(MARK_ALIASES))

# Columns that are not needed past this point.
marks = marks.drop(['PRED_ID', 'MAIN_PLAN'], axis=1)
train = train.drop(['TYPE_NAME'], axis=1)

In [4]:
# Show a caption followed by the first rows of every table, in a fixed order.
previews = [
    ('TRAIN', train),
    ('DISC', disc),
    ('MARKS', marks),
    ('STUDENTS', students),
    ('PORTRAIT', portrait),
    ('TEACHERS', teachers),
    ('TEST', test),
]
for caption, frame in previews:
    display(caption, frame.head())

'TRAIN'

Unnamed: 0,student_id,ST_YEAR,SEMESTER,DISC_ID,MARK,target,DISC+student
0,8197,2020,1,2659,5,0,2659_8197
1,8197,2020,1,400,5,0,400_8197
2,8197,2020,1,2406,5,0,2406_8197
3,8197,2020,1,2245,5,0,2245_8197
4,8197,2020,1,4523,3,0,4523_8197


'DISC'

Unnamed: 0,PLAN_ID,DISC_ID,СHOICE,SEMESTER,DISC_NAME,DISC_DEP,KEYWORD_NAMES
0,647,3190,1.0,6.0,Методы криптографии,8139328495281102994,"криптография, криптография, криптография, клас..."
1,647,2105,1.0,6.0,Анализ социальных сетей,8139328495281102994,"технологии работы информацией, алгоритмы обраб..."
2,647,2985,1.0,6.0,Эффективное участие в научных и практических к...,1687590892619214362,
3,647,4614,1.0,6.0,Управление карьерой,1687590892619214362,
4,647,4511,1.0,6.0,Обработка изображений,8139328495281102994,"классическое машинное обучение, генерация изоб..."


'MARKS'

Unnamed: 0,student_id,ST_YEAR,SEMESTER,TYPE_NAME,MARK,DISC_ID
0,8197,2020,1,Зачет,5,2659
1,8197,2020,1,Зачет,5,400
2,8197,2020,1,Зачет,5,2406
3,8197,2020,1,Зачет,5,2245
4,8197,2020,1,Экзамен,3,4523


'STUDENTS'

Unnamed: 0,student_id,KURS,DATE_START,DATE_END,PRIZNAK,MAIN_PLAN
0,1854,2,2017-09-01,2018-08-31 00:00:00,обучен,82
1,1854,3,2018-09-01,2018-11-02 00:00:00,обучен,82
2,1854,3,2018-11-03,2019-08-31 00:00:00,обучен,82
3,1854,4,2019-09-01,2020-07-06 00:00:00,обучен,82
4,2727,2,2017-09-01,2018-08-31 00:00:00,обучен,101


'PORTRAIT'

Unnamed: 0,student_id,GENDER,CITIZENSHIP,EXAM_TYPE,EXAM_SUBJECT_1,EXAM_SUBJECT_2,EXAM_SUBJECT_3,ADMITTED_EXAM_1,ADMITTED_EXAM_2,ADMITTED_EXAM_3,ADMITTED_SUBJECT_PRIZE_LEVEL,REGION_ID
0,4024,Ж,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,88.0,91.0,81.0,ЕГЭ,4877310761925081124
1,8506,М,15601729049989747827,ВИ,70786669040476600,5533732657842394915,8388269026169219461,84.0,89.0,96.0,ЕГЭ,4877310761925081124
2,8187,М,15601729049989747827,ВИ,70786669040476600,5533732657842394915,8388269026169219461,66.0,78.0,93.0,ЕГЭ,4877310761925081124
3,1854,М,15601729049989747827,ВИ,70786669040476600,5533732657842394915,8388269026169219461,100.0,89.0,80.0,ЕГЭ,4877310761925081124
4,2727,М,15601729049989747827,ВИ,70786669040476600,5533732657842394915,8388269026169219461,84.0,86.0,91.0,ЕГЭ,4877310761925081124


'TEACHERS'

Unnamed: 0,ISU,GENDER,DATE_BIRTH,ST_YEAR,SEMESTER,DISC_ID,MAIN_PLAN,TYPE_NAME,MARK
0,843,М,1975,2020/2021,4.0,3346,511,Дифференцированный зачет,3.471698
1,843,М,1975,2018/2019,2.0,2281,780,Дифференцированный зачет,4.243902
2,843,М,1975,2019/2020,4.0,1224,165,Экзамен,4.652174
3,843,М,1975,2018/2019,2.0,631,165,Дифференцированный зачет,4.266667
4,843,М,1975,2018/2019,2.0,631,165,Курсовой проект,4.266667


'TEST'

Unnamed: 0,student_id,ST_YEAR,DISC_ID,TYPE_NAME,DISC+student
0,8,2021,134,Экзамен,134_8
1,8,2021,1406,Зачет,1406_8
2,8,2021,1911,Зачет,1911_8
3,8,2021,2245,Зачет,2245_8
4,8,2021,2279,Зачет,2279_8


In [5]:
# Aggregate features derived from the training table:
#   per discipline - mean target and mean mark;
#   per student    - mean mark, mean target and number of non-missing marks.
# Each result is a one-column frame indexed by its grouping key.
# NOTE(review): the target-based aggregates are computed on all of `train`,
# including the rows that the random split in the next cell holds out for
# validation, so the validation F1 is probably optimistic - consider
# computing them on the training fold only.
avg_debts_disc = train.groupby('DISC_ID')['target'].mean().to_frame('avg_debts_disc')
avg_marks_disc = train.groupby('DISC_ID')['MARK'].mean().to_frame('avg_mark_disc')
avg_stud_marks = train.groupby('student_id')['MARK'].mean().to_frame('avg_stud_mark')
avg_stud_target = train.groupby('student_id')['target'].mean().to_frame('avg_stud_target')
num_marks = train.groupby('student_id')['MARK'].count().to_frame('num_marks')

# (join key, feature table) pairs, in the order the columns are appended.
train_feature_tables = [
    ('DISC_ID', avg_marks_disc),
    ('DISC_ID', avg_debts_disc),
    ('student_id', avg_stud_marks),
    ('student_id', avg_stud_target),
    ('student_id', num_marks),
]

# Make the cell safe to re-run: without this, a second execution merges the
# same columns again and pandas renames them with `_x` / `_y` suffixes,
# which breaks the column selection in the training cell.
stale_columns = [name for _, table in train_feature_tables for name in table.columns]
train = train.drop(columns=stale_columns, errors='ignore')

for key, table in train_feature_tables:
    train = train.merge(table, how='left', left_on=key, right_index=True)

In [6]:
# Modelling table: the label plus the five aggregate features.
train_data = train[['target', 'avg_mark_disc', 'avg_debts_disc', 'avg_stud_mark', 'avg_stud_target', 'num_marks']]

# Random 80/20 hold-out split with a fixed seed.
features = train_data.drop('target', axis=1)
labels = train_data['target']
X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Depth-3 CatBoost classifier; F1 is tracked on the hold-out set and the
# training log is printed every 100 iterations.
model = CatBoostClassifier(depth=3, eval_metric='F1', random_seed=42)
train_pool = Pool(X_train, y_train)
valid_pool = Pool(X_valid, y_valid)
model.fit(train_pool, eval_set=valid_pool, verbose=100)

Learning rate set to 0.108029
0:	learn: 0.5013790	test: 0.4992200	best: 0.4992200 (0)	total: 68.1ms	remaining: 1m 8s
100:	learn: 0.5830866	test: 0.5885006	best: 0.5893865 (97)	total: 918ms	remaining: 8.17s
200:	learn: 0.6049959	test: 0.6038159	best: 0.6038719 (167)	total: 1.64s	remaining: 6.52s
300:	learn: 0.6189268	test: 0.6119403	best: 0.6124580 (299)	total: 2.35s	remaining: 5.45s
400:	learn: 0.6264430	test: 0.6145251	best: 0.6150985 (396)	total: 3.06s	remaining: 4.57s
500:	learn: 0.6323898	test: 0.6185185	best: 0.6189770 (475)	total: 3.76s	remaining: 3.75s
600:	learn: 0.6382675	test: 0.6183967	best: 0.6205109 (538)	total: 4.47s	remaining: 2.97s
700:	learn: 0.6439266	test: 0.6208165	best: 0.6212735 (636)	total: 5.19s	remaining: 2.21s
800:	learn: 0.6505599	test: 0.6205635	best: 0.6218487 (796)	total: 5.9s	remaining: 1.47s
900:	learn: 0.6559916	test: 0.6232626	best: 0.6239941 (896)	total: 6.63s	remaining: 728ms
999:	learn: 0.6583304	test: 0.6221249	best: 0.6239941 (896)	total: 7.33s	re

<catboost.core.CatBoostClassifier at 0x111447370>

In [7]:
# Attach the aggregates computed from `train` to the test rows. Left joins
# keep every test row; disciplines or students absent from the aggregate
# tables get NaN features.
test_feature_tables = [
    ('DISC_ID', avg_marks_disc),
    ('DISC_ID', avg_debts_disc),
    ('student_id', avg_stud_marks),
    ('student_id', avg_stud_target),
    ('student_id', num_marks),
]

# Make the cell safe to re-run: merging the same columns twice would make
# pandas rename them with `_x` / `_y` suffixes and break the prediction cell.
stale_columns = [name for _, table in test_feature_tables for name in table.columns]
test = test.drop(columns=stale_columns, errors='ignore')

for key, table in test_feature_tables:
    test = test.merge(table, how='left', left_on=key, right_index=True)

In [9]:
# Score the test rows with the same five features the model was trained on
# and keep column 1 of `predict_proba` (the second class, target = 1).
model_inputs = test[['avg_mark_disc', 'avg_debts_disc', 'avg_stud_mark', 'avg_stud_target', 'num_marks']]
test['predictions'] = model.predict_proba(model_inputs)[:, 1]

In [10]:
# Binarise the scores: anything strictly above the 0.05 cut-off becomes 1.
test['final_predictions'] = test['predictions'].gt(0.05).astype('int')

In [11]:
# How many test rows ended up in each predicted class.
predicted_classes = test['final_predictions']
predicted_classes.value_counts()

0    34252
1     1997
Name: final_predictions, dtype: int64

In [12]:
# Copy the binary predictions into the submission template and write it out.
# The values are assigned by position: a plain Series assignment aligns on
# the index and would silently write NaN if the two frames' indexes ever
# differed. The length check makes a row-count mismatch fail loudly instead.
assert len(sample_sub) == len(test), (
    f"submission template has {len(sample_sub)} rows but test has {len(test)}"
)
sample_sub['DEBT'] = test['final_predictions'].to_numpy()
sample_sub.to_csv('submit.csv', index=False)

Среди предсказаний для теста должно быть приблизительно 2200 единиц (строк с `DEBT = 1`).

In [None]:
# NOTE(review): this repeats the write already done in the cell that fills
# `DEBT`; it only matters if `sample_sub` was changed in between.
sample_sub.to_csv(path_or_buf='submit.csv', index=False)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

# Synthetic F1 check: 36249 rows of which the first 2211 are labelled
# positive, scored against a block of 2000 positive predictions that starts
# at offset `pad`.
n_rows = 36249
n_positive = 2211
n_predicted = 2000
pad = 850

true = np.zeros(n_rows)
true[:n_positive] = 1

preds = np.zeros(n_rows)
preds[pad:pad + n_predicted] = 1

f1_score(true, preds)