In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Desactivo warning molesto
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
eventos = pd.read_csv('/home/lautaro/Desktop/events_up_to_01062018.csv', low_memory=False)

In [4]:
labels = pd.read_csv('/home/lautaro/Desktop/labels_training_set.csv')

In [5]:
eventos["timestamp"] = pd.to_datetime(eventos["timestamp"])

In [6]:
eventos['marca'] = eventos['model'].str.split(' ', n = 1, expand = True)[0]

In [7]:
eventos['Samsung'] = (eventos['marca'] == 'Samsung')
eventos['iPhone'] = (eventos['marca'] == 'iPhone')
eventos['LG'] = (eventos['marca'] == 'LG')
eventos['Motorola'] = (eventos['marca'] == 'Motorola')
eventos['iPad'] = (eventos['marca'] == 'iPad')
eventos['Sony'] = (eventos['marca'] == 'Sony')
eventos['Lenovo'] = (eventos['marca'] == 'Lenovo')
eventos['Quantum'] = (eventos['marca'] == 'Quantum')
eventos['Asus'] = (eventos['marca'] == 'Asus')
eventos['Xiaomi'] = (eventos['marca'] == 'Xiaomi')
eventos['Outros'] = (eventos['marca'] == 'Outros')

In [8]:
eventos['viewed_product'] = (eventos['event'] == 'viewed product')
eventos['brand_listing'] = (eventos['event'] == 'brand listing')
eventos['visited_site'] = (eventos['event'] == 'visited site')
eventos['ad_campaign_hit'] = (eventos['event'] == 'ad campaign hit')
eventos['generic_listing'] = (eventos['event'] == 'generic listing')
eventos['searched_products'] = (eventos['event'] == 'searched products')
eventos['search_engine_hit'] = (eventos['event'] == 'search engine hit')
eventos['checkout'] = (eventos['event'] == 'checkout')
eventos['staticpage'] = (eventos['event'] == 'staticpage')
eventos['conversion'] = (eventos['event'] == 'conversion')
eventos['lead'] = (eventos['event'] == 'lead')

In [9]:
eventos["mes"] = eventos["timestamp"].dt.month
eventos["dia"] = eventos["timestamp"].dt.dayofyear
eventos["semana"] = eventos["timestamp"].dt.week

In [10]:
# Obtengo la ultima fecha disponible en el csv para tener un punto de partida
ultima_fecha = eventos["timestamp"].max()
ultimo_dia = ultima_fecha.dayofyear
ultimo_mes = ultima_fecha.month
ultima_semana = ultima_fecha.week

In [11]:
# Agrego columnas para los m meses más cercanos
for m in range (0, ultimo_mes):
    eventos["mes_pasado_" + str(m)] = (eventos["mes"] == (ultimo_mes - m))

In [12]:
# Agrego columnas para las s semanas más cercanas
for s in range (0, 5):
    eventos["semana_pasada_" + str(s)] = (eventos["semana"] == (ultima_semana - s))

In [13]:
# Agrego columnas para los d días más cercanos
for d in range (0, 7):
    eventos["dia_pasado_" + str(d)] = (eventos["dia"] == (ultimo_dia - d))

In [14]:
eventos = eventos.loc[:,['person', 'viewed_product', 'brand_listing', 'visited_site',
       'ad_campaign_hit', 'generic_listing', 'searched_products',
       'Samsung', 'iPhone', 'LG', 'Motorola', 'iPad', 'Sony',
       'Lenovo', 'Quantum', 'Asus', 'Xiaomi', 'Outros',
       'search_engine_hit', 'checkout', 'conversion', 'lead',
       'mes_pasado_0', 'mes_pasado_1', 'mes_pasado_2', 'mes_pasado_3', 'mes_pasado_4'
       'semana_pasada_0', 'semana_pasada_1', 'semana_pasada_2',
       'semana_pasada_3', 'semana_pasada_4',
       'dia_pasado_0', 'dia_pasado_1', 'dia_pasado_2', 'dia_pasado_3', 'dia_pasado_4', 
        'dia_pasado_5', 'dia_pasado_6']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [17]:
usuarios = eventos.groupby('person', as_index=False).sum()

In [18]:
# Incluyo solo los eventos de los usuarios que tengo labels
usuarios_con_labels = usuarios.loc[usuarios['person'].isin(labels['person'])]

In [19]:
usuarios_con_labels = usuarios_con_labels.set_index('person')

In [20]:
labels = labels.set_index('person')

In [22]:
usuarios_con_labels = usuarios_con_labels.join(labels, lsuffix='person', rsuffix='person')

In [23]:
X = usuarios_con_labels.iloc[:,:-1]

In [24]:
Y = usuarios_con_labels.iloc[:,-1]

In [27]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

In [21]:
# FIN DE LA PREPARACION DE LOS DATAFRAMES

In [42]:
from sklearn.linear_model import LogisticRegression

In [43]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [57]:
preds = logreg.predict_proba(x_test)[:,1]

In [58]:
preds

array([0.026709  , 0.05572017, 0.04391651, ..., 0.05452287, 0.0418393 ,
       0.04552204])

In [59]:
logreg.score(x_test, y_test)

0.949781097089879

In [60]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)
metrics.auc(fpr, tpr)

0.7223535534592979

In [61]:
# Ahora tengo que predecir los de prueba de Kaggle
ids_usuarios_kaggle = pd.read_csv('/home/lautaro/Desktop/trocafone_kaggle_test.csv')

In [62]:
x_test_kaggle = usuarios.loc[usuarios['person'].isin(ids_usuarios_kaggle['person']), :]

In [63]:
x_test_kaggle = x_test_kaggle.set_index('person')

In [64]:
# Predecimos
preds_kaggle = logreg.predict_proba(x_test_kaggle)[:,1]

In [65]:
preds_kaggle

array([0.03460014, 0.05076453, 0.0196846 , ..., 0.05068812, 0.08932465,
       0.04287326])

In [66]:
pd.DataFrame({'person': x_test_kaggle.index, 'label': preds_kaggle}).to_csv('test.csv', index=False, columns=['person', 'label'])

In [67]:
# Score en Kaggle = 0.68951