In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the competition tables from the local data/ directory.
# The training label column DEBT is renamed to `target` right after loading.
train = pd.read_csv('data/train.csv').rename({'DEBT': 'target'}, axis=1)
sample_sub = pd.read_csv('data/sample_submission.csv')
disc = pd.read_csv('data/comp_disc.csv')
marks = pd.read_csv('data/comp_marks.csv')
portrait = pd.read_csv('data/comp_portrait.csv')
# `date_parser` takes a callable, not a column name, so passing 'DATE_START' to it
# never parsed the column. `parse_dates` is the argument that names date columns.
# Assumes comp_students.csv has a DATE_START column, as the original argument implies.
students = pd.read_csv('data/comp_students.csv', parse_dates=['DATE_START'])
teachers = pd.read_csv('data/comp_teachers.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# Every fitted LabelEncoder is stored in `all_dicts` under the name of the
# entity it encodes.
all_dicts = {}

# Students: classes are taken from portrait['ISU']; in each table the ISU
# column is replaced by its integer code and then renamed to `student_id`.
student_enc = LabelEncoder().fit(portrait['ISU'])
all_dicts['student_id'] = student_enc
for frame in (marks, train, students, portrait, test):
    frame['ISU'] = student_enc.transform(frame['ISU'])
    frame.rename(columns={'ISU': 'student_id'}, inplace=True)

# Disciplines: classes are the union of the ids seen in teachers, disc and test.
disc_values = np.concatenate((
    teachers['DISC_ID'].unique(),
    disc['DISC_ID'].unique(),
    test['DISC_ID'].unique(),
))
disc_enc = LabelEncoder().fit(disc_values)
all_dicts['disc_id'] = disc_enc
for frame in (train, disc, teachers, marks, test):
    frame['DISC_ID'] = disc_enc.transform(frame['DISC_ID'])

# Study plans: disc calls the column PLAN_ID, the other tables MAIN_PLAN.
plan_values = np.concatenate((
    disc['PLAN_ID'].unique(),
    students['MAIN_PLAN'].unique(),
))
plan_enc = LabelEncoder().fit(plan_values)
all_dicts['plan'] = plan_enc
disc['PLAN_ID'] = plan_enc.transform(disc['PLAN_ID'])
for frame in (marks, students, teachers):
    frame['MAIN_PLAN'] = plan_enc.transform(frame['MAIN_PLAN'])

# Teachers: teachers['ISU'] and marks['PRED_ID'] share one encoder.
teacher_values = np.concatenate((
    teachers['ISU'].unique(),
    marks['PRED_ID'].unique(),
))
teacher_enc = LabelEncoder().fit(teacher_values)
all_dicts['teacher'] = teacher_enc
teachers['ISU'] = teacher_enc.transform(teachers['ISU'])
marks['PRED_ID'] = teacher_enc.transform(marks['PRED_ID'])

# Assessment type: fitted on train only, applied to both train and test.
type_enc = LabelEncoder().fit(train['TYPE_NAME'])
all_dicts['type_name'] = type_enc
for frame in (train, test):
    frame['TYPE_NAME'] = type_enc.transform(frame['TYPE_NAME'])

# Portrait categoricals: each column gets its own encoder, fitted on itself.
for key, column in (('gender', 'GENDER'), ('exam_type', 'EXAM_TYPE')):
    all_dicts[key] = LabelEncoder().fit(portrait[column])
    portrait[column] = all_dicts[key].transform(portrait[column])

# Composite string key "<semester>_<discipline>_<type>" built from the encoded columns.
key_parts = [train[col].astype('str') for col in ('SEMESTER', 'DISC_ID', 'TYPE_NAME')]
train['SEM+DISC+TYPE'] = key_parts[0] + '_' + key_parts[1] + '_' + key_parts[2]

<h3>СТУДЕНТЫ БЕЗ ИНФОРМАЦИИ</h3>

In [5]:
# Training rows recorded for semester 1.
is_first_semester = train['SEMESTER'].eq(1)
problem_stud = train.loc[is_first_semester]

In [6]:
# Display the semester-1 subset of train (pandas truncates the rendered frame).
problem_stud

Unnamed: 0,student_id,ST_YEAR,SEMESTER,DISC_ID,TYPE_NAME,MARK,target,SEM+DISC+TYPE
0,8197,2020,1,2659,1,зачет,0,1_2659_1
1,8197,2020,1,400,1,зачет,0,1_400_1
2,8197,2020,1,2406,1,зачет,0,1_2406_1
3,8197,2020,1,2245,1,зачет,0,1_2245_1
4,8197,2020,1,4523,3,3,0,1_4523_3
...,...,...,...,...,...,...,...,...
179164,1818,2019,1,1539,3,4,0,1_1539_3
179166,1818,2019,1,134,3,3,0,1_134_3
179168,1818,2019,1,1937,1,зачет,0,1_1937_1
179170,1818,2019,1,1692,3,5,0,1_1692_3


In [30]:
# SUM_EXAMS = total of the three admission exam scores, filled only for rows
# where all three scores are positive; other rows are left as NaN.
exam_1, exam_2, exam_3 = (portrait[f'ADMITTED_EXAM_{n}'] for n in (1, 2, 3))
has_all_exams = (exam_1 > 0) & (exam_2 > 0) & (exam_3 > 0)
# `tmp` still holds the rows with three positive scores, as before.
tmp = portrait[has_all_exams]
portrait.loc[has_all_exams, 'SUM_EXAMS'] = (exam_1 + exam_2 + exam_3)[has_all_exams]
# Rows whose encoded EXAM_TYPE is 2 get a fixed total of 300, overriding any sum above.
# NOTE(review): 2 is a LabelEncoder code; which raw EXAM_TYPE label it stands for
# is not visible here — confirm via all_dicts['exam_type'].classes_.
portrait.loc[portrait['EXAM_TYPE'] == 2, 'SUM_EXAMS'] = 300


In [33]:
# Attach each student's SUM_EXAMS to the training rows; the left join keeps every train row.
# A SUM_EXAMS column left over from an earlier run is dropped first, so re-running this
# cell refreshes the column instead of producing SUM_EXAMS_x / SUM_EXAMS_y duplicates.
train = (
    train
    .drop(columns='SUM_EXAMS', errors='ignore')
    .merge(portrait[['student_id', 'SUM_EXAMS']], how='left', on='student_id')
)

In [53]:
# Display the semester-1 training rows, now including the merged SUM_EXAMS column.
train.loc[train['SEMESTER'].eq(1)]

Unnamed: 0,student_id,ST_YEAR,SEMESTER,DISC_ID,TYPE_NAME,MARK,target,DISC+SEMESTER,SUM_EXAMS
0,8197,2020,1,2659,1,зачет,0,2659_1,248.0
1,8197,2020,1,400,1,зачет,0,400_1,248.0
2,8197,2020,1,2406,1,зачет,0,2406_1,248.0
3,8197,2020,1,2245,1,зачет,0,2245_1,248.0
4,8197,2020,1,4523,3,3,0,4523_1,248.0
...,...,...,...,...,...,...,...,...,...
179164,1818,2019,1,1539,3,4,0,1539_1,256.0
179166,1818,2019,1,134,3,3,0,134_1,256.0
179168,1818,2019,1,1937,1,зачет,0,1937_1,256.0
179170,1818,2019,1,1692,3,5,0,1692_1,256.0


In [48]:
# SEMESTER of the test rows whose student never appears in train.
unseen_in_train = ~test['student_id'].isin(train['student_id'])
test.loc[unseen_in_train, 'SEMESTER']

163      1
164      1
165      1
166      1
167      1
        ..
36232    1
36233    1
36234    1
36235    1
36236    1
Name: SEMESTER, Length: 5229, dtype: int64

In [44]:
# Display the encoded test frame (pandas truncates the rendered frame).
test

Unnamed: 0,student_id,ST_YEAR,SEMESTER,DISC_ID,TYPE_NAME
0,8,2021,3,134,3
1,8,2021,3,1406,1
2,8,2021,3,1911,1
3,8,2021,3,2245,1
4,8,2021,3,2279,1
...,...,...,...,...,...
36244,8580,2021,5,2279,1
36245,8580,2021,5,2851,1
36246,8580,2021,5,2916,1
36247,8580,2021,5,2988,3


In [None]:
# Pairwise "cross" features: two encoded columns joined with '_' into one string key.
# They are built in a single loop so train and test always get the same columns.
CROSS_FEATURES = {
    'DISC+student': ('DISC_ID', 'student_id'),
    'DISC+TYPE': ('DISC_ID', 'TYPE_NAME'),
    'student+TYPE': ('student_id', 'TYPE_NAME'),
    # Previously created for train only, which left test without this column.
    'DISC+SEMESTER': ('DISC_ID', 'SEMESTER'),
}
for frame in (train, test):
    for feature, (left, right) in CROSS_FEATURES.items():
        frame[feature] = frame[left].astype('str') + '_' + frame[right].astype('str')

# Map textual marks onto the numeric scale, still as strings at this point:
# 'зачет' -> '5', 'неявка' and 'незач' -> '2', 'осв' -> missing.
# NOTE(review): 'осв' presumably means "exempt"; confirm that treating it as
# missing is intended.
MARK_MAP = {'зачет': '5', 'неявка': '2', 'незач': '2', 'осв': None}
marks['MARK'] = marks['MARK'].replace(MARK_MAP)
train['MARK'] = train['MARK'].replace(MARK_MAP)

# Cast the non-missing train marks to int; missing marks stay missing.
# NOTE(review): marks['MARK'] is not cast here and keeps its mapped values as-is.
graded = train['MARK'].notna()
train.loc[graded, 'MARK'] = train.loc[graded, 'MARK'].astype('int')