In [62]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [64]:
# Load the raw logs: the training pair is read from zip archives, the test pair from plain CSV.
# NOTE(review): the file names mix "event"/"events" and "submission"/"submissions" --
# confirm each one matches the file actually on disk.
event_data_train, submission_data_train = (
    pd.read_csv(archive, compression='zip')
    for archive in ('event_data_train.zip', 'submissions_data_train.zip')
)
event_data_test, submission_data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('events_data_test.csv', 'submission_data_test.csv')
)

In [65]:
def add_info(df):
    """Return a copy of ``df`` enriched with date, day and per-user first timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column and a ``timestamp`` column holding
        seconds since the Unix epoch.

    Returns
    -------
    pandas.DataFrame
        A new frame with three extra columns:
        ``date`` (``timestamp`` converted to datetime), ``day`` (calendar date
        of ``date``) and ``min_timestamp`` (smallest ``timestamp`` of the row's
        ``user_id`` within ``df``).

    Notes
    -----
    The input frame is left untouched, and a pre-existing ``min_timestamp``
    column is replaced rather than duplicated, so re-running the calling cell
    does not produce ``min_timestamp_x`` / ``min_timestamp_y`` columns.
    """
    # drop() returns a new frame, so nothing below writes into the caller's object.
    enriched = df.drop(columns='min_timestamp', errors='ignore')
    enriched = enriched.assign(date=pd.to_datetime(enriched['timestamp'], unit='s'))
    enriched = enriched.assign(day=enriched['date'].dt.date)

    # One row per user holding that user's earliest timestamp in this frame.
    first_seen = (
        enriched.groupby('user_id', as_index=False)
        .agg({'timestamp': 'min'})
        .rename(columns={'timestamp': 'min_timestamp'})
    )
    return enriched.merge(first_seen, on='user_id', how='outer')

# Enrich all four raw frames with date, day and each user's first timestamp.
event_data_train, submission_data_train, event_data_test, submission_data_test = map(
    add_info,
    (event_data_train, submission_data_train, event_data_test, submission_data_test),
)

In [66]:
# Build the training target: one row per user with is_gone = 1 when the user has
# more than 40 'passed' events in event_data_train as it stands at this point, else 0.
# NOTE(review): the threshold is strict (> 40) and counts 'passed' events --
# confirm this matches the intended "earns 40 points" definition.
train_user_score = (
    event_data_train
    .pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count', fill_value=0)
    ['passed']
    .gt(40)
    .astype(int)
    .rename('is_gone')
    .reset_index()
)

In [67]:
# Select the first days of activity (2 by default) for every user
def select_two_days(df, days=2):
    """Keep only rows that fall within ``days`` days of the row's ``min_timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp`` and ``min_timestamp`` columns, both in seconds.
    days : int or float, default 2
        Length of the window. The boundary is inclusive: a row exactly
        ``days * 24 * 60 * 60`` seconds after ``min_timestamp`` is kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows as a new frame; ``df`` itself is not modified.
    """
    threshold = days * 24 * 60 * 60  # window length in seconds
    elapsed = df['timestamp'] - df['min_timestamp']
    return df[elapsed <= threshold]
# Restrict both training frames to each user's first two days.
# Each frame is cut relative to its own min_timestamp column.
event_data_train, submission_data_train = (
    select_two_days(event_data_train),
    select_two_days(submission_data_train),
)

In [68]:
def calculate_points(event, submission):
    """Assemble one feature row per user from the event and submission logs.

    The result has a ``user_id`` column followed by:
    one count column per distinct ``action`` in ``event``, one count column per
    distinct ``submission_status`` in ``submission``, ``days`` (number of
    distinct ``day`` values per user in ``event``) and ``step_tried`` (number
    of distinct ``step_id`` values per user in ``submission``).
    Every table is outer-merged on ``user_id`` and remaining gaps are filled
    with 0.
    """
    # Per-user counts of each action / submission status (0 where a user has none).
    action_counts = event.pivot_table(
        index='user_id', columns='action', values='step_id', aggfunc='count', fill_value=0
    )
    status_counts = submission.pivot_table(
        index='user_id', columns='submission_status', values='step_id', aggfunc='count', fill_value=0
    )
    # Per-user distinct days seen in events and distinct steps seen in submissions.
    active_days = event.groupby('user_id')['day'].nunique().rename('days').to_frame()
    steps_attempted = submission.groupby('user_id')['step_id'].nunique().rename('step_tried').to_frame()

    # Start from the users seen in the event log and attach each table in turn.
    features = pd.DataFrame({'user_id': event.user_id.unique()})
    for table in (action_counts, status_counts, active_days, steps_attempted):
        features = features.merge(table.reset_index(), on='user_id', how='outer')

    # Users missing from a table get NaN from the outer merge; treat that as zero.
    return features.fillna(0)
    
# Build the per-user feature tables for the training and test logs.
train_data, test_data = (
    calculate_points(event_data_train, submission_data_train),
    calculate_points(event_data_test, submission_data_test),
)

In [89]:
# Random forest: tune hyper-parameters with a 3-fold grid search, then score the test users.

# Attach the target by user_id instead of relying on both frames sharing row order.
rf_train = train_data.merge(train_user_score, on='user_id', how='inner')
# user_id is an identifier, not a behavioural feature, so it is kept out of the model.
rf_X_train = rf_train.drop(columns=['user_id', 'is_gone'])
rf_y_train = rf_train['is_gone']
# Give the test matrix exactly the training columns in the same order;
# a count column that never occurs in the test logs becomes 0.
rf_X_test = test_data.drop(columns='user_id').reindex(columns=rf_X_train.columns, fill_value=0)

rf = RandomForestClassifier(random_state=42)  # fixed seed so a re-run reproduces the result
params = {'n_estimators': range(10, 31, 5),
          'max_depth': range(1, 9, 2),
          'min_samples_leaf': range(1, 7),
          'min_samples_split': range(2, 7)}
search = GridSearchCV(rf, params, cv=3)

search.fit(rf_X_train, rf_y_train)

predicted = search.predict_proba(rf_X_test)
# predict_proba columns follow search.classes_; pick the column of the positive class
# (is_gone == 1) instead of column 0, which is the probability of class 0.
rf_positive_col = list(search.classes_).index(1)
submission = pd.DataFrame({'user_id': test_data.user_id, 'is_gone': predicted[:, rf_positive_col]})
# NOTE(review): the decision-tree cell writes to the same 'submission.csv' path,
# so whichever cell runs last determines the file's content.
submission.to_csv('submission.csv', index=False)

In [87]:
# Decision tree: tune hyper-parameters with a 3-fold grid search, then score the test users.

# Attach the target by user_id instead of relying on both frames sharing row order.
dt_train = train_data.merge(train_user_score, on='user_id', how='inner')
# user_id is an identifier, not a behavioural feature, so it is kept out of the model.
dt_X_train = dt_train.drop(columns=['user_id', 'is_gone'])
dt_y_train = dt_train['is_gone']
# Give the test matrix exactly the training columns in the same order;
# a count column that never occurs in the test logs becomes 0.
dt_X_test = test_data.drop(columns='user_id').reindex(columns=dt_X_train.columns, fill_value=0)

dt = DecisionTreeClassifier(random_state=42)  # fixed seed so a re-run reproduces the result
params = {'max_depth': range(1, 9, 2),
          'min_samples_leaf': range(1, 7),
          'min_samples_split': range(2, 7)}
search = GridSearchCV(dt, params, cv=3)

search.fit(dt_X_train, dt_y_train)

predicted = search.predict_proba(dt_X_test)
# predict_proba columns follow search.classes_; pick the column of the positive class
# (is_gone == 1) instead of column 0, which is the probability of class 0.
dt_positive_col = list(search.classes_).index(1)
submission = pd.DataFrame({'user_id': test_data.user_id, 'is_gone': predicted[:, dt_positive_col]})
# NOTE(review): the random-forest cell writes to the same 'submission.csv' path,
# so whichever cell runs last determines the file's content.
submission.to_csv('submission.csv', index=False)