In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline
import matplotlib as plt
import seaborn as sns

In [3]:
# Default figure size for every plot drawn in this notebook.
figure_rc = {'figure.figsize': (18, 6)}
sns.set(rc=figure_rc)

## 1. Analysis

In [4]:
# Training logs read from the local data directory.
events_csv = '~/Documents/Stepik + ODS/data/event_data_train.csv'
submissions_csv = '~/Documents/Stepik + ODS/data/submissions_data_train.csv'
events_data = pd.read_csv(events_csv)
submissions_data = pd.read_csv(submissions_csv)

In [5]:
events_data['date'] = pd.to_datetime(events_data['timestamp'], unit='s')
submissions_data['date'] = pd.to_datetime(submissions_data['timestamp'], unit='s')

In [6]:
events_data['day'] = events_data['date'].dt.date
submissions_data['day'] = submissions_data['date'].dt.date

In [7]:
#events_data.groupby('day')['user_id'].nunique().plot();

In [8]:
users_events_data = events_data.pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count', fill_value=0).\
                        reset_index()

In [9]:
users_scores = submissions_data.pivot_table(index='user_id', columns='submission_status',
                                           values='step_id', aggfunc='count', fill_value=0).reset_index()

In [10]:
gap_data = events_data[['user_id', 'day', 'timestamp']].drop_duplicates(subset=['user_id', 'day']).\
            groupby('user_id')['timestamp'].apply(list).\
            apply(np.diff).values

In [11]:
gap_data = pd.Series(np.concatenate(gap_data, axis=0))

In [12]:
gap_data = gap_data / (24*60*60)

In [13]:
#gap_data[gap_data < 30].hist();

In [14]:
gap_data.quantile(0.90)

18.325995370370403

In [15]:
# 20 users with the most event rows up to timestamp 1451606400
# (2016-01-01 00:00:00 UTC).
early_events = events_data[events_data['timestamp'] <= 1451606400]
events_per_user = early_events.groupby('user_id')['day'].count()
karpov = events_per_user.sort_values(ascending=False).head(20)

## 2. Data processing

### 2.1. X_train, y_train ###

In [16]:
# Timestamp of the most recent event of every user.
last_event = events_data.groupby('user_id', as_index=False).agg({'timestamp': 'max'})
users_data = last_event.rename(columns={'timestamp': 'last_timestamp'})

In [17]:
# Reference "current" moment as a Unix timestamp: Saturday, 2018-05-19 23:33:31 GMT.
now = 1526772811
# Inactivity longer than this many seconds (30 days) is treated as dropping out.
drop_out_threshold = 30 * 24 * 60 * 60

In [18]:
users_data['is_gone_user'] = (now - users_data.last_timestamp) > drop_out_threshold

In [19]:
users_data = users_data.merge(users_scores, how='outer')

In [20]:
users_data = users_data.fillna(0)

In [21]:
users_data = users_data.merge(users_events_data, how='outer')

In [22]:
users_days = events_data.groupby('user_id').day.nunique().to_frame().reset_index()

In [23]:
users_data = users_data.merge(users_days, how='outer')

In [24]:
users_data['passed_corse'] = users_data.passed >= 20

In [25]:
users_data[users_data['passed_corse'] == True]['day'].median()

9.0

In [26]:
user_min_time = events_data.groupby('user_id', as_index=False).\
                agg({'timestamp' : 'min'}).rename({'timestamp' : 'min_timestamp'}, axis=1)

In [27]:
users_data = users_data.merge(user_min_time, how='outer')

In [28]:
event_data_train = events_data

In [29]:
events_data['user_time'] = events_data.user_id.map(str) + "_" + events_data.timestamp.map(str)

In [30]:
# Length of the observation window counted from a user's first event: 5 days in seconds.
learning_time_threshold = 5 * 86400 #!!!!!!!

In [31]:
user_learning_time_threshold = user_min_time.user_id.map(str) + "_" \
                            + (user_min_time.min_timestamp + learning_time_threshold).map(str)

In [32]:
user_min_time['user_learning_time_threshold'] = user_learning_time_threshold

In [33]:
events_data = events_data.merge(user_min_time[['user_id', 'user_learning_time_threshold']], 
                                                    how='outer')

In [34]:
events_data_train = events_data[events_data.user_time <= events_data.user_learning_time_threshold]

In [35]:
events_data_train.head()

Unnamed: 0,step_id,timestamp,action,user_id,date,day,user_time,user_learning_time_threshold
0,32815,1434340848,viewed,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848,17632_1434772848
1,32815,1434340848,passed,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848,17632_1434772848
2,32815,1434340848,discovered,17632,2015-06-15 04:00:48,2015-06-15,17632_1434340848,17632_1434772848
3,32811,1434340895,discovered,17632,2015-06-15 04:01:35,2015-06-15,17632_1434340895,17632_1434772848
4,32811,1434340895,viewed,17632,2015-06-15 04:01:35,2015-06-15,17632_1434340895,17632_1434772848


In [36]:
# Steps ranked by their number of wrong submissions, most-failed first.
wrong_submissions = submissions_data[submissions_data['submission_status'] == 'wrong']
wrong_submissions.groupby('step_id').count().sort_values(by='timestamp', ascending=False)

Unnamed: 0_level_0,timestamp,submission_status,user_id,date,day
step_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
31978,16084,16084,16084,16084,16084
32031,13373,13373,13373,13373,13373
32202,13002,13002,13002,13002,13002
33481,10300,10300,10300,10300,10300
32075,10003,10003,10003,10003,10003
...,...,...,...,...,...
33482,707,707,707,707,707
33681,703,703,703,703,703
33362,673,673,673,673,673
32175,342,342,342,342,342


In [37]:
# Sanity check: largest number of distinct active days any user has inside the window.
events_data_train.groupby('user_id').day.nunique().max()

6

In [38]:
# "<user_id>_<timestamp>" key, kept for parity with the `user_time` column of the events log.
submissions_data['users_time'] = submissions_data.user_id.map(str) + '_' + submissions_data.timestamp.map(str)
# Left join: an outer join appends an all-NaN row for every user who has events but
# no submissions, which also upcasts the integer columns to float.
submissions_data = submissions_data.merge(user_min_time[['user_id', 'user_learning_time_threshold']], how='left')
# Keep submissions made inside each user's learning window. Timestamps are compared
# numerically; the string keys only order correctly while all timestamps have the
# same number of digits. Users without a known first event map to NaN and are dropped.
first_event_ts = submissions_data.user_id.map(user_min_time.set_index('user_id').min_timestamp)
submissions_data_train = submissions_data[submissions_data.timestamp <= first_event_ts + learning_time_threshold]
# Sanity check: largest number of distinct submission days any user has inside the window.
submissions_data_train.groupby('user_id').day.nunique().max()

6

***MOST IMPORTANT STEP: building the feature matrix `X` and the target `y`***
___

In [39]:
X = submissions_data_train.groupby('user_id')['day'].nunique().to_frame().reset_index()\
                    .rename(columns={'day' : 'days'})

In [40]:
steps_tried = submissions_data_train.groupby('user_id')['step_id'].nunique().to_frame().reset_index()\
                    .rename(columns={'step_id' : 'steps_tried'})

In [41]:
X = X.merge(steps_tried, on='user_id', how='outer')

In [42]:
X = X.merge(submissions_data_train.pivot_table(index='user_id', columns='submission_status',
                                           values='step_id', aggfunc='count', fill_value=0).reset_index())

In [43]:
X['correct_ratio'] = X.correct/(X.correct + X.wrong)

In [44]:
X = X.merge(events_data_train.pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count',\
                                          fill_value=0).reset_index()[['user_id', 'viewed']], how='outer')

In [45]:
X = X.fillna(0)

In [46]:
X = X.merge(users_data[['user_id', 'passed_corse', 'is_gone_user']], how='outer')

In [47]:
X = X[~((X.is_gone_user == False) & (X.passed_corse == False))]

In [48]:
y = X.passed_corse.map(int)

In [49]:
X = X.drop(['passed_corse', 'is_gone_user'], axis=1)

In [50]:
X = X.set_index(X.user_id)

In [51]:
X = X.drop(['user_id'], axis=1)

In [52]:
from sklearn.model_selection import train_test_split

### 2.2. X_test, y_test ###

In [53]:
# Test logs are downloaded straight from the course attachments.
test_events_url = 'https://stepik.org/media/attachments/course/4852/events_data_test.csv'
test_submissions_url = 'https://stepik.org/media/attachments/course/4852/submission_data_test.csv'
test_events_data = pd.read_csv(test_events_url)
test_submissions_data = pd.read_csv(test_submissions_url)

In [54]:
# Same `date` / `day` columns as for the training logs.
for test_frame in (test_events_data, test_submissions_data):
    test_frame['date'] = pd.to_datetime(test_frame['timestamp'], unit='s')
    test_frame['day'] = test_frame['date'].dt.date

In [55]:
X_test = test_submissions_data.groupby('user_id')['day'].nunique().to_frame().reset_index()\
                    .rename(columns={'day' : 'days'})

In [56]:
test_steps_tried = test_submissions_data.groupby('user_id')['step_id'].nunique().to_frame().reset_index()\
                    .rename(columns={'step_id' : 'steps_tried'})

In [57]:
X_test = X_test.merge(test_steps_tried, on='user_id', how='outer')

In [58]:
X_test = X_test.merge(test_submissions_data.pivot_table(index='user_id', columns='submission_status',
                                           values='step_id', aggfunc='count', fill_value=0).reset_index())

In [59]:
X_test['correct_ratio'] = X_test.correct/(X_test.correct + X_test.wrong)

In [60]:
X_test = X_test.merge(test_events_data.pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count',\
                                          fill_value=0).reset_index()[['user_id', 'viewed']], how='outer')

In [61]:
X_test = X_test.fillna(0)

In [62]:
X_test = X_test.set_index(X_test.user_id)
X_test = X_test.drop(['user_id'], axis=1)

## 3. Prediction

### 3.1. Decision Tree ###

In [95]:
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [96]:
# Tree depths tried by the grid search: 1 through 4.
parameters = dict(max_depth=range(1, 5))

In [97]:
tree_clf = tree.DecisionTreeClassifier(criterion='entropy')
grid = GridSearchCV(tree_clf, parameters, cv=5, n_jobs=-1)

In [98]:
%%time
grid.fit(X, y)

CPU times: user 215 ms, sys: 153 ms, total: 368 ms
Wall time: 3.64 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='entropy',
                                              max_depth=None, max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1, param_grid={'max_depth': range(1, 5)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring

In [99]:
# Parameter setting that achieved the best cross-validated score in the search.
grid.best_params_

{'max_depth': 4}

In [100]:
best_clf = grid.best_estimator_

In [101]:
pd.DataFrame({'features':list(X.columns), 'feature_importances':  best_clf.feature_importances_})\
                        .sort_values(by='feature_importances')

Unnamed: 0,features,feature_importances
3,wrong,0.0
0,days,0.00146
2,correct,0.005557
4,correct_ratio,0.015783
1,steps_tried,0.094061
5,viewed,0.883138


In [102]:
tree.export_graphviz(best_clf, feature_names=list(X.columns), 
out_file='_tree.dot', filled=True)
!dot -Tpng '_tree.dot' -o '_tree.png'

![Decision tree visualization](_tree.png)

In [103]:
y_test = best_clf.predict(X_test)

In [104]:
y_predicted_prob = best_clf.predict_proba(X_test)

In [107]:
submission = pd.DataFrame(y_predicted_prob[:, 1], index=X_test.index,columns=['is_gone'])

In [108]:
submission.to_csv('out_decision_tree.csv')

***Your ROC score is 0.8623791376575091 with Decision Tree*** 

### 3.2. Random Forest ###

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [110]:
# Base estimator for the search below; every hyper-parameter is left at its library default here.
clf_rf = RandomForestClassifier()

In [111]:
# Random Forest grid: depth 6-9, 80-180 trees in steps of 20, 2 or 3 features per split.
parameters = dict(
    max_depth=range(6, 10),
    n_estimators=range(80, 200, 20),
    max_features=range(2, 4),
)

In [112]:
#grid_rf = RandomizedSearchCV(clf_rf, parameters, cv=10, n_jobs=-1)
grid_rf = GridSearchCV(clf_rf, parameters, cv=skf, n_jobs=-1, scoring='roc_auc')

In [113]:
%%time
grid_rf.fit(X, y)

CPU times: user 3.04 s, sys: 150 ms, total: 3.19 s
Wall time: 1min 27s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
         

In [115]:
# Parameter setting that achieved the best cross-validated ROC AUC in the search.
grid_rf.best_params_

{'max_depth': 7, 'max_features': 2, 'n_estimators': 160}

In [116]:
best_rf_clf = grid_rf.best_estimator_

In [117]:
pd.DataFrame({'features':list(X.columns), 'feature_importances':  best_rf_clf.feature_importances_})\
                        .sort_values(by='feature_importances')

Unnamed: 0,features,feature_importances
0,days,0.024214
4,correct_ratio,0.035888
3,wrong,0.10015
2,correct,0.196458
1,steps_tried,0.273353
5,viewed,0.369936


In [118]:
y_test = best_rf_clf.predict(X_test)

In [119]:
y_predicted_prob = best_rf_clf.predict_proba(X_test)

In [122]:
submission = pd.DataFrame(y_predicted_prob[:, 1], index=X_test.index,columns=['is_gone'])

In [123]:
submission.to_csv('out_random_forest.csv')

***Your best ROC score is 0.8831885535293477***



### 3.3. KNN ###

In [102]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [103]:
# Standardise the features, then classify with k-nearest neighbours on all cores.
knn_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
]
knn_pipe = Pipeline(steps=knn_steps)

In [104]:
# Neighbour counts 1..19 for the pipeline step named 'knn'.
knn_params = dict(knn__n_neighbors=range(1, 20))

In [105]:
knn_grid = GridSearchCV(knn_pipe, knn_params, cv=5, n_jobs=-1, verbose=True)

In [106]:
knn_grid.fit(X, y)

Fitting 5 folds for each of 19 candidates, totalling 95 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapsed:   27.4s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=-1,
                                                             n_neighbors=5, p=2,
                                                        

In [107]:
# Winning neighbour count and its mean cross-validated score.
knn_grid.best_params_, knn_grid.best_score_

({'knn__n_neighbors': 18}, 0.8803308626060802)

In [108]:
best_knn_clf = knn_grid.best_estimator_

In [109]:
y_test = best_knn_clf.predict(X_test)

In [110]:
y_predicted_prob = best_knn_clf.predict_proba(X_test)

In [111]:
submission = pd.DataFrame(y_predicted_prob[:, 1], index=X_test.index,columns=['is_gone'])

In [112]:
submission.to_csv('out_kNN.csv')

***Your ROC score is 0.8306130334586076***

### 3.4. Logit model ###

In [80]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler

In [81]:
# 10 candidate regularisation strengths, log-spaced between 1e-3 and 1e1.
c_values = np.logspace(-3, 1, num=10)
# Standardise the training features; the same fitted scaler is reused for X_test later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Logistic regression that picks C from `c_values` with 5-fold cross-validation.
logit = LogisticRegressionCV(Cs=c_values, cv=5, verbose=1, n_jobs=-1)
logit.fit(X_scaled, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.0s finished


LogisticRegressionCV(Cs=array([1.00000000e-03, 2.78255940e-03, 7.74263683e-03, 2.15443469e-02,
       5.99484250e-02, 1.66810054e-01, 4.64158883e-01, 1.29154967e+00,
       3.59381366e+00, 1.00000000e+01]),
                     class_weight=None,
                     cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
                     dual=False, fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=None, max_iter=100, multi_class='warn',
                     n_jobs=-1, penalty='l2', random_state=None, refit=True,
                     scoring=None, solver='lbfgs', tol=0.0001, verbose=1)

In [82]:
# Value of C selected by the cross-validation.
logit.C_

array([0.05994843])

In [84]:
y_predicted_prob = logit.predict_proba(scaler.transform(X_test))

In [85]:
submission = pd.DataFrame(y_predicted_prob[:, 1], index=X_test.index,columns=['is_gone'])

In [86]:
submission.to_csv('out_logit.csv')

### 3.5. SGD model ###

In [66]:
from sklearn.linear_model import SGDClassifier

In [140]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [141]:
%%time
sgd_log  = SGDClassifier(loss='log')
sgd_log.fit(X_train_scaled, y)

CPU times: user 61.4 ms, sys: 71 µs, total: 61.4 ms
Wall time: 60.5 ms


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [142]:
y_predicted_prob = sgd_log.predict_proba(X_test_scaled)

In [143]:
submission = pd.DataFrame(y_predicted_prob[:, 1], index=X_test.index,columns=['is_gone'])

In [144]:
submission.to_csv('sgd.csv')

***Your ROC score is 0.8770699053569371***

In [63]:
from xgboost import XGBClassifier

In [66]:
# Baseline XGBoost model fitted with library defaults.
# NOTE(review): `xgb` is rebound to a new, unfitted classifier two cells later and this
# fitted model is never used in the visible cells - looks like a leftover experiment.
xgb = XGBClassifier().fit(X, y)

In [72]:
# Candidate values sampled by the randomized search over XGBoost hyper-parameters.
params = dict(
    min_child_weight=[10, 15, 20],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.4, 0.6, 0.8, 1.0],
    colsample_bytree=[0.1, 0.5, 1.0],
    max_depth=[3, 5, 10],
    # 0.1, 0.3, 0.5, 0.7, 0.9
    learning_rate=np.arange(0.1, 1, 0.2),
)

In [73]:
xgb = XGBClassifier(n_estimators = 500, objective='binary:logistic')

In [78]:
random_search = RandomizedSearchCV(xgb, param_distributions=params, 
                                   scoring='roc_auc', 
                                   cv=5, 
                                   verbose=0)

In [79]:
random_search.fit(X, y)

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=500,...
                                           random_state=None, reg_alpha=None,
                                           reg_lambda=None,
                                           

In [82]:
random_search.best_params_
best_xgb = random_search.best_estimator_

In [83]:
submission = pd.DataFrame(best_xgb.predict_proba(X_test)[:, 1], index=X_test.index,columns=['is_gone']).to_csv('xgb.csv')