In [1]:
import ast
import warnings

import numpy as np
import pandas as pd

pd.options.display.max_rows = 100
warnings.filterwarnings('ignore')

# Seed passed as random_state to the KMeans clustering further down.
SEED = 800

# Threshold compared against number_distinct_keydown to derive distinct_key_type.
MEDIAN_TRAIN_DISTINCT_KEYDOWN = 8 # we reuse the median obtained on the training set

In [2]:
# Load the test set; reset_index() turns the row position into a regular
# column so every row carries an explicit identifier (index_origin).
test = pd.read_csv('../desafio/test.csv').reset_index()
test.columns = ['index_origin', 'inputs']

In [3]:
# Expand the serialized `inputs` of every test row into two long tables
# (df_keydown / df_keyup): one line per keyboard event, tagged with the
# index of the test row it belongs to.
_EVENT_COLUMNS = ['code', 'tick', 'index_origin']
_EMPTY_EVENTS = pd.DataFrame(columns=_EVENT_COLUMNS)


def _events_frame(events, origin):
    """Return the events of one test row as an integer DataFrame.

    Concatenating with the empty template guarantees that the columns in
    _EVENT_COLUMNS exist. Every missing value - in particular the whole
    `index_origin` column - is filled with `origin` before the cast to int.
    Raises ValueError/TypeError when the events cannot be converted.
    """
    frame = pd.concat([pd.DataFrame.from_dict(events), _EMPTY_EVENTS], sort=False)
    return frame.fillna(origin).astype(int)


def _stack_newest_first(frames):
    """Concatenate the per-row frames in a single call, last test row first.

    Row order is kept last-row-first (and the per-row index is not reset)
    because the KMeans fit further down can depend on the row order of
    df_keydown.
    """
    if not frames:
        return _EMPTY_EVENTS.copy()
    return pd.concat(frames[::-1], sort=False)


keydown_frames = []
keyup_frames = []
rows_without_keyup = []  # index of every row whose keyup events were skipped

for index, raw_inputs in zip(test.index, test['inputs']):
    # SECURITY: the cell content comes from an external CSV. literal_eval only
    # accepts Python literals, so a crafted cell cannot execute code the way
    # eval() would. Each row is parsed once and reused for both event types.
    keyboard = ast.literal_eval(raw_inputs)['keyboard']

    # keydown is mandatory: a malformed row raises instead of being skipped.
    keydown_frames.append(_events_frame(keyboard['keydown'], index))

    # keyup is best effort: rows where it is absent or unusable are skipped,
    # but only for the expected failure modes, and the row is remembered.
    try:
        keyup_frames.append(_events_frame(keyboard['keyup'], index))
    except (KeyError, TypeError, ValueError):
        rows_without_keyup.append(index)

df_keydown = _stack_newest_first(keydown_frames)
df_keyup = _stack_newest_first(keyup_frames)

In [4]:
# Row counts of the test set and of the two event tables.
# NOTE(review): the first label reads "train" although the counted frame is
# `test`; the string is kept unchanged here.
count_reports = [
    ('Quantidade de train {}', test['index_origin']),
    ('Quantidade de Keydown {}', df_keydown['tick']),
    ('Quantidade de Keyup {}', df_keyup['tick']),
]
for template, column in count_reports:
    print(template.format(column.count()))

Quantidade de train 17907
Quantidade de Keydown 177656
Quantidade de Keyup 169939


In [5]:
# Sequence number of every event inside its test row, ordered by tick
# (method='first': ties keep their order of appearance).
for events in (df_keydown, df_keyup):
    events['seq'] = events.groupby(['index_origin'])['tick'].rank(ascending=True, method='first')

# Time between key presses: pair each keydown with every keydown of the same
# row, keep the pairs whose second press is strictly later, and take the
# earliest of those as the "next" press.
df_keydown_merged = df_keydown.merge(df_keydown, on='index_origin')
later_presses = df_keydown_merged[df_keydown_merged['tick_y'] > df_keydown_merged['tick_x']]
next_tick = later_presses.groupby(['index_origin', 'code_x', 'tick_x', 'seq_x'])['tick_y'].min()
df_keydown_grouped = next_tick.reset_index().sort_values(by=['index_origin', 'tick_x'])

# Gap, in ticks, between a press and the next one.
df_keydown_grouped['diff_next_key'] = df_keydown_grouped['tick_y'].sub(df_keydown_grouped['tick_x'])
df_keydown_grouped.columns = ['index_origin', 'code', 'tick', 'seq', 'tick_next', 'diff_next_key']

In [6]:
# Time until the next key press: attach tick_next / diff_next_key to every keydown.
df_keydown = df_keydown.merge(df_keydown_grouped, how='outer', on=['index_origin', 'code', 'seq'])

# Presses with no later keydown in the same row have no tick_next; their gap is 0.
# .loc assigns on df_keydown itself, whereas the chained form df[col][mask] = ...
# may write to a temporary copy and leave the frame unchanged.
df_keydown.loc[df_keydown['tick_next'].isnull(), 'diff_next_key'] = 0

In [7]:
# Basis for the key hold time: attach to every keydown the keyup that has the
# same (index_origin, seq). Keydowns without a matching keyup keep NaN in the
# keyup columns (left join).
df_keydown = pd.merge(df_keydown, df_keyup, how='left', on=['index_origin', 'seq'])

In [8]:
# Hold time of each key: `tick` (brought in by the keyup merge) minus `tick_x`
# (the keydown tick). The result is NaN where no keyup was matched.
df_keydown['diff_keyup'] = df_keydown['tick'].sub(df_keydown['tick_x'])

In [9]:
# Keydowns without a keyup (diff_keyup is NaN), counted per test row.
keydown_unreleased = df_keydown[df_keydown['diff_keyup'].isnull()]
df_missing_keyup = (
    keydown_unreleased.groupby(['index_origin'])['code_x']
    .count()
    .reset_index(name='number_missing_keyup')
)

keydown_by_origin = df_keydown.groupby(['index_origin'])

# Number of keydown and keyup events per test row.
df_number_keydown = keydown_by_origin['code_x'].count().reset_index(name='number_keydown')
df_number_keyup = (
    df_keyup.groupby(['index_origin'])['code']
    .count()
    .reset_index(name='number_keyup')
)

# Total typing time: the largest keydown tick of the row (column is named 'max').
df_max_keydown = keydown_by_origin['tick_x'].max().reset_index(name='max')

# Number of distinct key codes pressed per test row.
df_distinct_keydown = keydown_by_origin['code_x'].nunique().reset_index(name='number_distinct_keydown')

In [10]:
# Mean, median and standard deviation, per test row, of the gap to the next
# key press and of the key hold time.
summary_stats = ['mean', 'median', 'std']
keydown_per_row = df_keydown.groupby(['index_origin'])

df_mean_media_next_key = (
    keydown_per_row['diff_next_key']
    .agg(summary_stats)
    .add_suffix('_next_key')
    .reset_index()
)

df_mean_media_keyup = (
    keydown_per_row['diff_keyup']
    .agg(summary_stats)
    .add_suffix('_keyup')
    .reset_index()
)

In [11]:
# Consolidate every per-row feature table into `test`.
# NOTE(review): this first merge uses the default inner join, so a test row
# with no keydown event is dropped here; the predictions written at the end
# carry no identifier column - confirm that no row is lost.
test = test.merge(df_number_keydown, on=['index_origin'])

# The remaining feature tables are left-joined, in this fixed order (the order
# determines the column order of the feature matrix).
left_joined_features = (
    df_number_keyup,
    df_missing_keyup,
    df_max_keydown,
    df_distinct_keydown,
    df_mean_media_next_key,
    df_mean_media_keyup,
)
for feature_table in left_joined_features:
    test = test.merge(feature_table, on=['index_origin'], how='left')

In [12]:
# Typing profile split: 1 when the row uses more distinct keys than the
# training-set median, 0 otherwise (the median comes from train, not test).
test['distinct_key_type'] = 0
# .loc assigns on `test` itself; the chained form test[col][mask] = ... may
# write to a temporary copy and leave the frame unchanged.
test.loc[test['number_distinct_keydown'] > MEDIAN_TRAIN_DISTINCT_KEYDOWN, 'distinct_key_type'] = 1

In [13]:
# Missing values: rows without keyup information get 0 in every keyup-derived
# feature. Assigning the filled columns back avoids chained indexing
# (test[col][mask] = 0), which may write to a temporary copy.
keyup_feature_columns = [
    'number_missing_keyup',
    'number_keyup',
    'mean_keyup',
    'median_keyup',
    'std_keyup',
]
test[keyup_feature_columns] = test[keyup_feature_columns].fillna(0)

In [14]:
# Clustering used to separate outlier key presses.
from sklearn import preprocessing
from sklearn.cluster import KMeans

cluster_inputs = ['seq', 'diff_next_key']
scaled_columns = ['seq_scaled', 'diff_next_key_scaled']

# Create the target columns first, then overwrite them with the standardized values.
for scaled_column in scaled_columns:
    df_keydown[scaled_column] = 0

scaler = preprocessing.StandardScaler()
df_keydown[scaled_columns] = scaler.fit_transform(df_keydown[cluster_inputs])

# NOTE(review): the standardized columns are stored on df_keydown, but KMeans
# is fitted on the unscaled seq / diff_next_key columns. Confirm this matches
# how the features of the persisted model were built before changing it.
X = df_keydown[cluster_inputs]

kmeans = KMeans(n_clusters=5, random_state=SEED).fit(X)
df_keydown['CLUSTER'] = kmeans.labels_

In [16]:
# Flag as outlier every test row that has at least one key press outside cluster 0.
# NOTE(review): KMeans label numbers are arbitrary - confirm that label 0 is
# the intended "regular" cluster.
index_outliers = df_keydown[df_keydown.CLUSTER > 0].index_origin.values
test['outlier'] = 0
# .loc assigns on `test` itself; the chained form test[col][mask] = ... may
# write to a temporary copy and leave the frame unchanged.
test.loc[test['index_origin'].isin(index_outliers), 'outlier'] = 1

## Model

In [21]:
# Load the persisted classifier.
# scikit-learn removed its vendored copy of joblib (sklearn.externals.joblib)
# in 0.23; fall back to the standalone package when the vendored one is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load('classification_rf_model_20180729.pkl')

In [22]:
# Feature matrix: everything in `test` except the raw payload and the row id.
# Note that `X` is re-bound here; it previously held the KMeans input.
columns_to_remove = ['inputs', 'index_origin']

X = test.drop(labels=columns_to_remove, axis='columns')

In [23]:
# Predict one target per remaining row of `test` with the loaded model.
y_pred = model.predict(X)

In [24]:
# Write the predictions as a single-column CSV (header `target`, no index).
submission = pd.DataFrame(y_pred, columns=['target'])
submission.to_csv('../desafio/result.csv', index=False)