In [107]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [108]:
events_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
submission_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')

events_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/event_data_train.zip')
submission_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/submissions_data_train.zip')

In [118]:
def time_df(df):    
    user_time = df.groupby('user_id').agg({'timestamp':'min'}).rename(columns = {'timestamp':'first_timestamp'}).reset_index()
    2*24*60*60
    
    data = df.merge(user_time, on = 'user_id', how = 'outer').query('timestamp <= first_timestamp + 2*24*60*60')
    
    return data

In [110]:
def feature(events_data, submission_data):
    users_scores = submission_data.pivot_table(index = 'user_id', columns = 'submission_status', values = 'step_id', aggfunc = 'count', fill_value = 0)\
.reset_index()
    users_events_data = events_data.pivot_table(index = 'user_id', columns = 'action', values = 'step_id', aggfunc = 'count', fill_value = 0)\
.reset_index()
    
    users_data = users_scores.merge(users_events_data,on = 'user_id', how = 'outer').fillna(0)
    
    return users_data

In [111]:
def target(submission_data, count_pass = 40):
    user_count_correct = submission_data.query('submission_status == "correct"').groupby('user_id').agg({'submission_status':'count'})\
    .reset_index().rename(columns = {'submission_status':'correct_count'})
    
    user_count_correct['passed_course'] = user_count_correct.correct_count > count_pass
    
    return user_count_correct

In [112]:
def steps_count(submission_data):
    steps_count = submission_data.groupby('user_id').step_id.nunique().to_frame().reset_index().rename(
    columns = {'step_id':'steps_count'})
    return steps_count

In [113]:
def correct_ratio(data):
    data['correct_ratio'] = (data.correct / (data.correct + data.wrong)).fillna(0)
    return data

In [114]:
def create_df(events_data, submission_data):
    event_to_date = time_df(events_data)
    submit_to_date = time_df(submission_data)
    
    users_data = feature(event_to_date, submit_to_date)
    
    users_target = target(submission_data)
    
    users_step_tried = steps_count(submit_to_date)
    users_data = correct_ratio(users_data)
    
    X = users_data.merge(users_step_tried, how = 'outer').fillna(0)
    X = X.merge(users_target, how = 'outer').fillna(0)
    
    y = X['passed_course'].map(int)
    X = X.drop('passed_course', axis = 1)
    return X,y

In [119]:
X_train, y = create_df(events_data, submission_data)

In [120]:
X_train

Unnamed: 0,user_id,correct,wrong,discovered,passed,started_attempt,viewed,correct_ratio,steps_count,correct_count
0,2,2.0,0.0,9,9,2,9,1.0,2.0,2.0
1,3,4.0,4.0,15,15,4,20,0.5,4.0,29.0
2,5,2.0,2.0,1,1,0,1,0.5,2.0,2.0
3,8,9.0,21.0,109,84,37,154,0.3,11.0,9.0
4,14,0.0,1.0,4,3,1,9,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
19229,26773,0.0,0.0,1,1,0,1,0.0,0.0,0.0
19230,26774,0.0,0.0,1,1,0,1,0.0,0.0,0.0
19231,26788,0.0,0.0,1,1,0,1,0.0,0.0,0.0
19232,26789,0.0,0.0,2,2,0,2,0.0,0.0,0.0


In [121]:
X_test, p = create_df(events_data_test, submission_data_test)

In [122]:
X_test

Unnamed: 0,user_id,correct,wrong,discovered,passed,started_attempt,viewed,correct_ratio,steps_count,correct_count
0,12,1.0,0.0,11,9,4,14,1.000000,1.0,1.0
1,13,29.0,36.0,70,70,35,105,0.446154,29.0,29.0
2,15,10.0,30.0,1,1,0,1,0.250000,11.0,10.0
3,21,24.0,103.0,74,68,70,98,0.188976,30.0,24.0
4,35,7.0,35.0,34,30,11,70,0.166667,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...
6179,26745,0.0,0.0,1,1,0,1,0.000000,0.0,0.0
6180,26768,0.0,0.0,1,1,0,1,0.000000,0.0,0.0
6181,26791,0.0,0.0,1,1,0,1,0.000000,0.0,0.0
6182,26795,0.0,0.0,1,1,0,1,0.000000,0.0,0.0


In [93]:
def fit(train_data, y, size = 0.2):
    X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=size)
    rf_clf = RandomForestClassifier()
    parameters = {'max_depth':range(1,20,5), 'n_estimators':range(50,300,75),'min_samples_split':range(20,221,50),'min_samples_leaf':range(10,101,20)}
    gs = GridSearchCV(rf_clf,parameters,n_jobs = -1, cv = 5)
    
    gs.fit(X_train, y_train)
    
    best_clf = gs.best_estimator_
    
    ypred = gs.predict(X_test)
    roc_score = roc_auc_score(ypred, y_test)
    score = gs.score(X_test, y_test)
    
    print(score)
    print(roc_score)

In [94]:
fit(X_train, y)

1.0
1.0


In [125]:
def final_fit(train_data, y, test_data, size = 0.2):
    X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=size)
    rf_clf = RandomForestClassifier()
    parameters = {'max_depth':range(1,50,5), 'n_estimators':range(50,500,50),'min_samples_split':range(20,521,50),'min_samples_leaf':range(10,201,10)}
    gs = GridSearchCV(rf_clf,parameters,n_jobs = -1, cv = 5)
    
    gs.fit(X_train, y_train)
    
    best_clf = gs.best_estimator_
    
    ypred_prob = best_clf.predict_proba(X_test)
    
    roc_score = roc_auc_score(y_test, ypred_prob[:, 1])
    score = best_clf.score(X_test, y_test)
    print(f"Правильность на валид наборе: {score:.3f}")
    print(f"Roc_auc_score на валид наборе: {roc_score:.5f}")
    
    ypred_prob_final = best_clf.predict_proba(test_data)
    result = test_data['user_id'].to_frame()
    result['is_gone'] = ypred_prob_final[:, 1]
    result[['user_id', 'is_gone']].to_csv(f'my_predict_{roc_score:.5f}.csv', index=False)
    print(f'Результы записанны в файл my_predict_{roc_score:.5f}.csv')

In [126]:
final_fit(X_train, y, X_test)

Правильность на валид наборе: 1.000
Roc_auc_score на валид наборе: 1.00000
Результы записанны в файл my_predict_1.00000.csv
