# Proceo de Limpieza

In [1]:
# imports
import pandas as pd
from collections import Counter
import numpy as np

In [2]:
data_test = pd.read_csv('../data/test_format2.csv')
data_train = pd.read_csv('../data/train_format2.csv')

data_test.head()

Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_log
0,163968,0.0,0.0,4378,-1.0,101206:812:6968:0614:0
1,163968,0.0,0.0,2300,-1.0,588758:844:3833:0618:0#71782:844:3833:1111:2#7...
2,163968,0.0,0.0,1551,-1.0,312747:243:1954:0627:0#312747:243:1954:0627:0#...
3,163968,0.0,0.0,4343,-1.0,932390:1612:3201:0628:0
4,163968,0.0,0.0,4911,-1.0,957657:662:3089:0612:0


In [3]:
data_train.head()

Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_log
0,34176,6.0,0.0,944,-1,408895:1505:7370:1107:0
1,34176,6.0,0.0,412,-1,17235:1604:4396:0818:0#954723:1604:4396:0818:0...
2,34176,6.0,0.0,1945,-1,231901:662:2758:0818:0#231901:662:2758:0818:0#...
3,34176,6.0,0.0,4752,-1,174142:821:6938:1027:0
4,34176,6.0,0.0,643,-1,716371:1505:968:1024:3


# Validaciones rapidas para entender como estan los datos

## Visión rápida: tamaños, tipos y nulos

In [4]:
# Tamaños y tipos
for name, df in {"train": data_train, "test": data_test}.items():
    print(f"=== {name.upper()} ===")
    print("shape:", df.shape)
    print(df.dtypes)
    print()

=== TRAIN ===
shape: (7030723, 6)
user_id           int64
age_range       float64
gender          float64
merchant_id       int64
label             int64
activity_log     object
dtype: object

=== TEST ===
shape: (7027943, 6)
user_id           int64
age_range       float64
gender          float64
merchant_id       int64
label           float64
activity_log     object
dtype: object



In [5]:
# Conteo de nulos por columna
print("=== NULLS TRAIN ===")
print(data_train.isna().sum().sort_values(ascending=False))
print("\n=== NULLS TEST ===")
print(data_test.isna().sum().sort_values(ascending=False))

=== NULLS TRAIN ===
gender          61712
age_range       19380
activity_log     2975
user_id             0
merchant_id         0
label               0
dtype: int64

=== NULLS TEST ===
label           261477
gender           63250
age_range        19420
activity_log      3006
user_id              0
merchant_id          0
dtype: int64


## Variables clave: distribución de label (train), age_range, gender

In [6]:
# Distribución de label en TRAIN (incluye -1 si hay)
print("=== LABEL value_counts (TRAIN) ===")
print(data_train['label'].value_counts(dropna=False))

# Edad y género (TRAIN y TEST)
def vc_with_na(s):
    return s.value_counts(dropna=False).sort_index()

print("\n=== age_range (TRAIN) ===")
print(vc_with_na(data_train['age_range']))

print("\n=== age_range (TEST) ===")
print(vc_with_na(data_test['age_range']))

print("\n=== gender (TRAIN) ===")
print(vc_with_na(data_train['gender']))

print("\n=== gender (TEST) ===")
print(vc_with_na(data_test['gender']))


=== LABEL value_counts (TRAIN) ===
label
-1    6769859
 0     244912
 1      15952
Name: count, dtype: int64

=== age_range (TRAIN) ===
age_range
0.0    1351842
1.0        286
2.0     731938
3.0    1913722
4.0    1459923
5.0     752927
6.0     655922
7.0     124493
8.0      20290
NaN      19380
Name: count, dtype: int64

=== age_range (TEST) ===
age_range
0.0    1345565
1.0        260
2.0     733323
3.0    1916611
4.0    1460542
5.0     752608
6.0     650358
7.0     128644
8.0      20612
NaN      19420
Name: count, dtype: int64

=== gender (TRAIN) ===
gender
0.0    5101730
1.0    1618110
2.0     249171
NaN      61712
Name: count, dtype: int64

=== gender (TEST) ===
gender
0.0    5062667
1.0    1643382
2.0     258644
NaN      63250
Name: count, dtype: int64


## activity_log: calidad básica (vacíos, longitud, ejemplos)

In [7]:
# Longitud de log por registro (nº de interacciones por fila)
def count_interactions(x):
    if pd.isna(x) or x == '':
        return 0
    return len(str(x).split('#'))

for name, df in {"train": data_train, "test": data_test}.items():
    df['activity_len'] = df['activity_log'].apply(count_interactions)
    print(f"=== {name.upper()} activity_len ===")
    print(df['activity_len'].describe())
    print("Ceros (sin interacciones):", (df['activity_len'] == 0).sum())
    print()

# Muestra de 3 filas con logs no vacíos
print("=== EJEMPLOS activity_log (TRAIN) ===")
print(data_train.loc[data_train['activity_len']>0, 'activity_log'].head(3).to_list())

=== TRAIN activity_len ===
count    7.030723e+06
mean     3.894925e+00
std      1.214883e+01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      6.963000e+03
Name: activity_len, dtype: float64
Ceros (sin interacciones): 2975

=== TEST activity_len ===
count    7.027943e+06
mean     3.905862e+00
std      1.275945e+01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      7.688000e+03
Name: activity_len, dtype: float64
Ceros (sin interacciones): 3006

=== EJEMPLOS activity_log (TRAIN) ===
['408895:1505:7370:1107:0', '17235:1604:4396:0818:0#954723:1604:4396:0818:0#275437:1604:4396:0818:0#548906:1577:4396:1031:0#368206:662:4396:0818:0#480007:1604:4396:0818:0#954723:1604:4396:0818:0#236488:1505:4396:1024:0', '231901:662:2758:0818:0#231901:662:2758:0818:0#108465:662:2758:0820:0#231901:662:2758:0820:0#231901:662:2758:0820:0#840446:1142:2758:0820:0#231901:662:2758:0819:0']


## Parseo mínimo del activity_log: validar formato y action_type

In [8]:
def parse_records(log):
    """
    Devuelve lista de tuplas (item_id, category_id, brand_id, time_stamp, action_type)
    o [] si vacío/malformado.
    """
    if pd.isna(log) or log == '':
        return []
    recs = []
    for rec in str(log).split('#'):
        parts = rec.split(':')
        if len(parts) == 5:
            recs.append(tuple(parts))
        else:
            recs.append(None)
    return recs

def sample_action_types(df, n_rows=2000):
    atypes = Counter()
    malformed = 0
    for log in df['activity_log'].head(n_rows):
        recs = parse_records(log)
        for r in recs:
            if r is None:
                malformed += 1
            else:
                atypes.update([r[4]])
    return atypes, malformed

print("=== ACTION TYPES (TRAIN, muestra 2000 filas) ===")
atypes_train, malformed_train = sample_action_types(data_train, 2000)
print("malformados (TRAIN muestra):", malformed_train)
print("action_types (TRAIN muestra):", atypes_train.most_common(20))

print("\n=== ACTION TYPES (TEST, muestra 2000 filas) ===")
atypes_test, malformed_test = sample_action_types(data_test, 2000)
print("malformados (TEST muestra):", malformed_test)
print("action_types (TEST muestra):", atypes_test.most_common(20))

=== ACTION TYPES (TRAIN, muestra 2000 filas) ===
malformados (TRAIN muestra): 0
action_types (TRAIN muestra): [('0', 6551), ('3', 503), ('2', 463)]

=== ACTION TYPES (TEST, muestra 2000 filas) ===
malformados (TEST muestra): 0
action_types (TEST muestra): [('0', 6860), ('3', 601), ('2', 417)]


## Validación de time_stamp: rango temporal, parseo y errores

In [9]:
def extract_timestamps(df, n_rows=2000):
    tvals = []
    bad = 0
    for log in df['activity_log'].head(n_rows):
        recs = parse_records(log)
        for r in recs:
            if r is None:
                continue
            tvals.append(r[3])
    ts = pd.to_datetime(pd.Series(tvals), errors='coerce', infer_datetime_format=True)
    bad = ts.isna().sum()
    return ts, bad

print("=== TIMESTAMPS (TRAIN muestra 2000 filas) ===")
ts_train, bad_train = extract_timestamps(data_train, 2000)
print("parseables:", ts_train.notna().sum(), "no parseables:", bad_train)
if ts_train.notna().any():
    print("min:", ts_train.min(), "max:", ts_train.max())

print("\n=== TIMESTAMPS (TEST muestra 2000 filas) ===")
ts_test, bad_test = extract_timestamps(data_test, 2000)
print("parseables:", ts_test.notna().sum(), "no parseables:", bad_test)
if ts_test.notna().any():
    print("min:", ts_test.min(), "max:", ts_test.max())

=== TIMESTAMPS (TRAIN muestra 2000 filas) ===
parseables: 0 no parseables: 7517

=== TIMESTAMPS (TEST muestra 2000 filas) ===
parseables: 0 no parseables: 7878


  ts = pd.to_datetime(pd.Series(tvals), errors='coerce', infer_datetime_format=True)
  ts = pd.to_datetime(pd.Series(tvals), errors='coerce', infer_datetime_format=True)
  ts = pd.to_datetime(pd.Series(tvals), errors='coerce', infer_datetime_format=True)
  ts = pd.to_datetime(pd.Series(tvals), errors='coerce', infer_datetime_format=True)


## Duplicados potenciales (por par usuario–comerciante)

In [10]:
print("=== Duplicados exactos ===")
print("TRAIN:", data_train.duplicated().sum())
print("TEST:", data_test.duplicated().sum())

def dup_pairs(df):
    if all(c in df.columns for c in ['user_id', 'merchant_id']):
        return df.duplicated(subset=['user_id','merchant_id']).sum()
    return None

print("\n=== Duplicados por (user_id, merchant_id) ===")
print("TRAIN:", dup_pairs(data_train))
print("TEST:", dup_pairs(data_test))

=== Duplicados exactos ===
TRAIN: 0
TEST: 0

=== Duplicados por (user_id, merchant_id) ===
TRAIN: 0
TEST: 0


1) Dimensiones: Ambos datasets son muy grandes (~7 millones de filas cada uno), con estructura consistente.

2) Nulos:

    - gender y age_range tienen decenas de miles de valores faltantes.

    - activity_log también presenta algunos registros vacíos (~3k).

    - En test, label aparece con NULL en ~261k filas (esperado).

3) Distribución de la etiqueta (label):

    - En train la mayoría de registros tienen label = -1 (no son clientes nuevos → se usan como contexto, no para entrenar directamente).

    - Los repetidores (1) son muy pocos comparados con los no repetidores (0), lo que genera fuerte desbalance.

4) Edad y género:

    - age_range: muchos usuarios con valor 0 (desconocido) o NaN.

    - gender: predominan mujeres (0), con una proporción notable de desconocidos (2 o NaN).

5) Activity log:

    - Casi todos los registros tienen entre 1–3 interacciones, pero hay outliers con miles de eventos.

    - action_type está limpio y toma solo valores 0, 2, 3.

6) Timestamps:

    - Ninguno se parseó como fecha → parecen codificados en un formato distinto (ejemplo: 0818, 1031, 1107).

    - Indica probablemente formato MMDD (mes-día).

7) Duplicados: No existen duplicados exactos ni por par user_id-merchant_id.

# Limpieza

## Tratamiento de nulos en age_range y gender
Son campos categóricos con valores desconocidos. En lugar de eliminarlos (porque perderíamos datos), se asigna una categoría especial.

In [11]:
data_train['age_range'] = data_train['age_range'].fillna(0)
data_test['age_range'] = data_test['age_range'].fillna(0)

data_train['gender'] = data_train['gender'].fillna(2)
data_test['gender'] = data_test['gender'].fillna(2)

## Revisar y normalizar categorías de age_range y gender
- age_range: ya trae un "0" explícito para desconocido.

- gender: 2 y NaN significan desconocido → hacerlos 2.

In [12]:
data_train['age_range'] = data_train['age_range'].astype(int)
data_test['age_range'] = data_test['age_range'].astype(int)

data_train['gender'] = data_train['gender'].astype(int)
data_test['gender'] = data_test['gender'].astype(int)

## Filtrar label
- En train, -1 no es parte de la predicción.

- Se mantiene aparte como información contextual, pero para entrenar el modelo usamos solo 0 y 1.

In [13]:
train_clean = data_train[data_train['label'].isin([0, 1])].copy()

## Tratar activity_log vacío
- Los casos con activity_log vacío (≈3k filas) no aportan comportamiento.

- Se conservan asignando longitud = 0, porque representan usuarios sin actividad.

In [14]:
data_train['activity_log'] = data_train['activity_log'].fillna('')
data_test['activity_log'] = data_test['activity_log'].fillna('')

## Manejo de outliers en activity_len
- La mayoría tiene ≤ 10 interacciones, pero existen usuarios con miles.

- Recortar (cap) el valor máximo a un percentil alto (ej. P99).

In [15]:
cap_train = data_train['activity_len'].quantile(0.99)
cap_test = data_test['activity_len'].quantile(0.99)

data_train['activity_len'] = data_train['activity_len'].clip(upper=cap_train)
data_test['activity_len'] = data_test['activity_len'].clip(upper=cap_test)

In [16]:
data_train.head()

Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_log,activity_len
0,34176,6,0,944,-1,408895:1505:7370:1107:0,1
1,34176,6,0,412,-1,17235:1604:4396:0818:0#954723:1604:4396:0818:0...,8
2,34176,6,0,1945,-1,231901:662:2758:0818:0#231901:662:2758:0818:0#...,7
3,34176,6,0,4752,-1,174142:821:6938:1027:0,1
4,34176,6,0,643,-1,716371:1505:968:1024:3,1


In [17]:
data_test.head()

Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_log,activity_len
0,163968,0,0,4378,-1.0,101206:812:6968:0614:0,1
1,163968,0,0,2300,-1.0,588758:844:3833:0618:0#71782:844:3833:1111:2#7...,11
2,163968,0,0,1551,-1.0,312747:243:1954:0627:0#312747:243:1954:0627:0#...,5
3,163968,0,0,4343,-1.0,932390:1612:3201:0628:0,1
4,163968,0,0,4911,-1.0,957657:662:3089:0612:0,1


In [18]:
SRC_LOG_COL = 'activity_log'

# Helper para resumir un log (string) a un dict con métricas
def summarize_log_string(s: str):
    if not s:
        return {
            'actions_0': 0, 'actions_2': 0, 'actions_3': 0,
            'unique_items': 0, 'unique_categories': 0, 'unique_brands': 0,
            'date_min': pd.NaT, 'date_max': pd.NaT, 'day_span': np.int64(0),
            'has_1111': 0
        }
    a0 = a2 = a3 = 0
    items, cats, brands = set(), set(), set()
    dmin = dmax = None
    has1111 = 0

    for rec in s.split('#'):
        toks = rec.split(':')
        if len(toks) != 5:
            continue
        it, cat, br, dt, act = toks

        # acciones
        if act == '0':
            a0 += 1
        elif act == '2':
            a2 += 1
        elif act == '3':
            a3 += 1

        # sets
        if it.isdigit():  items.add(int(it))
        if cat.isdigit(): cats.add(int(cat))
        if br.isdigit():  brands.add(int(br))

        # fechas (si ya vienen YYYY-MM-DD; si no, quedarán NaT)
        try:
            d = pd.to_datetime(dt, errors='raise')
            dmin = d if (dmin is None or d < dmin) else dmin
            dmax = d if (dmax is None or d > dmax) else dmax
            if (d.month, d.day) == (11, 11):
                has1111 = 1
        except Exception:
            pass

    day_span = int((dmax - dmin).days) if (dmin is not None and dmax is not None) else 0

    return {
        'actions_0': a0, 'actions_2': a2, 'actions_3': a3,
        'unique_items': len(items), 'unique_categories': len(cats), 'unique_brands': len(brands),
        'date_min': dmin if dmin is not None else pd.NaT,
        'date_max': dmax if dmax is not None else pd.NaT,
        'day_span': np.int64(day_span),
        'has_1111': has1111
    }

In [19]:
# Aplicar en batches para evitar problemas de memoria/tiempo
def expand_activity_columns_inplace(df: pd.DataFrame, src_col: str = SRC_LOG_COL, batch_size: int = 500_000):
    # Asegurar string y no nulos
    df[src_col] = df[src_col].fillna('').astype(str)

    n = len(df)
    parts = []
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        chunk = df.iloc[start:end].copy()

        # aplicar resumen fila a fila
        res = chunk[src_col].apply(summarize_log_string).apply(pd.Series)

        # concat chunk + nuevas columnas
        chunk_out = pd.concat([chunk.reset_index(drop=True), res.reset_index(drop=True)], axis=1)
        parts.append(chunk_out)
        print(f"Batch {start:,}..{end:,} procesado -> {chunk_out.shape}")

    out = pd.concat(parts, axis=0, ignore_index=True)

    # 4) Insertar nuevas columnas DESPUÉS de activity_len y eliminar el src_col
    new_cols = ['actions_0','actions_2','actions_3',
                'unique_items','unique_categories','unique_brands',
                'date_min','date_max','day_span','has_1111']

    cols = list(out.columns)
    # quitar src_col para no duplicarlo al reordenar
    cols.remove(src_col)

    # asegurar que activity_len exista (recalcular si no)
    if 'activity_len' not in out.columns:
        out['activity_len'] = out[src_col].apply(lambda x: 0 if x == '' else len(x.split('#')))

    # construir orden: todo hasta activity_len, luego nuevas columnas, luego el resto sin duplicar
    idx = cols.index('activity_len')
    left  = cols[:idx+1]
    right = [c for c in cols[idx+1:] if c not in new_cols]
    ordered = left + new_cols + right

    out = out[ordered]

    return out

In [20]:
# Ejecutar para TRAIN y TEST
data_train_expanded = expand_activity_columns_inplace(data_train, src_col=SRC_LOG_COL, batch_size=500_000)
data_test_expanded  = expand_activity_columns_inplace(data_test,  src_col=SRC_LOG_COL, batch_size=500_000)

print("TRAIN final:", data_train_expanded.shape)
print("TEST  final:", data_test_expanded.shape)

Batch 0..500,000 procesado -> (500000, 17)
Batch 500,000..1,000,000 procesado -> (500000, 17)
Batch 1,000,000..1,500,000 procesado -> (500000, 17)
Batch 1,500,000..2,000,000 procesado -> (500000, 17)
Batch 2,000,000..2,500,000 procesado -> (500000, 17)
Batch 2,500,000..3,000,000 procesado -> (500000, 17)
Batch 3,000,000..3,500,000 procesado -> (500000, 17)
Batch 3,500,000..4,000,000 procesado -> (500000, 17)
Batch 4,000,000..4,500,000 procesado -> (500000, 17)
Batch 4,500,000..5,000,000 procesado -> (500000, 17)
Batch 5,000,000..5,500,000 procesado -> (500000, 17)
Batch 5,500,000..6,000,000 procesado -> (500000, 17)
Batch 6,000,000..6,500,000 procesado -> (500000, 17)
Batch 6,500,000..7,000,000 procesado -> (500000, 17)
Batch 7,000,000..7,030,723 procesado -> (30723, 17)
Batch 0..500,000 procesado -> (500000, 17)
Batch 500,000..1,000,000 procesado -> (500000, 17)
Batch 1,000,000..1,500,000 procesado -> (500000, 17)
Batch 1,500,000..2,000,000 procesado -> (500000, 17)
Batch 2,000,000..2

In [24]:
display(data_train_expanded.head())
display(data_test_expanded.head())

Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_len,actions_0,actions_2,actions_3,unique_items,unique_categories,unique_brands,date_min,date_max,day_span,has_1111
0,34176,6,0,944,-1,1,1,0,0,1,1,1,NaT,NaT,0,0
1,34176,6,0,412,-1,8,8,0,0,7,4,1,NaT,NaT,0,0
2,34176,6,0,1945,-1,7,7,0,0,3,2,1,NaT,NaT,0,0
3,34176,6,0,4752,-1,1,1,0,0,1,1,1,NaT,NaT,0,0
4,34176,6,0,643,-1,1,0,0,1,1,1,1,NaT,NaT,0,0


Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_len,actions_0,actions_2,actions_3,unique_items,unique_categories,unique_brands,date_min,date_max,day_span,has_1111
0,163968,0,0,4378,-1.0,1,1,0,0,1,1,1,NaT,NaT,0,0
1,163968,0,0,2300,-1.0,11,9,2,0,4,1,1,NaT,NaT,0,0
2,163968,0,0,1551,-1.0,5,4,1,0,1,1,1,NaT,NaT,0,0
3,163968,0,0,4343,-1.0,1,1,0,0,1,1,1,NaT,NaT,0,0
4,163968,0,0,4911,-1.0,1,1,0,0,1,1,1,NaT,NaT,0,0


In [28]:
from datetime import datetime

# --- helpers ---
YEAR = 2014

def mmdd_to_date_safe(s, year=YEAR):
    s = '' if s is None else str(s)
    if len(s) == 4 and s.isdigit():
        try:
            return datetime.strptime(f"{year}{s}", "%Y%m%d")
        except ValueError:
            return None
    # si viniera YYYY-MM-DD o YYYYMMDD, intenta parsear:
    try:
        return pd.to_datetime(s, errors='raise').to_pydatetime()
    except Exception:
        return None

def recompute_dates_from_log_tuple(log: str):
    if not log:
        return (None, None, 0, 0)
    dmin = dmax = None
    has1111 = 0
    for rec in str(log).split('#'):
        toks = rec.split(':')
        if len(toks) != 5:
            continue
        d = mmdd_to_date_safe(toks[3])
        if d:
            dmin = d if dmin is None or d < dmin else dmin
            dmax = d if dmax is None or d > dmax else dmax
            if d.month == 11 and d.day == 11:
                has1111 = 1
    span = (dmax - dmin).days if (dmin and dmax) else 0
    return (dmin, dmax, span, has1111)

def fill_dates_inplace(dest_df: pd.DataFrame, src_logs: pd.Series, batch=200_000):
    # crea columnas si no existen
    for c in ['date_min','date_max','day_span','has_1111']:
        if c not in dest_df.columns:
            dest_df[c] = pd.NaT if c in ('date_min','date_max') else 0

    n = len(dest_df)
    for start in range(0, n, batch):
        end = min(start + batch, n)
        logs = src_logs.iloc[start:end].fillna('')

        # calcular sin DataFrames intermedios
        mins, maxs, spans, flags = [], [], [], []
        for s in logs:
            dmin, dmax, span, flag = recompute_dates_from_log_tuple(s)
            mins.append(dmin); maxs.append(dmax); spans.append(span); flags.append(flag)

        idx = dest_df.index[start:end]
        dest_df.loc[idx, 'date_min']  = pd.to_datetime(mins, errors='coerce')
        dest_df.loc[idx, 'date_max']  = pd.to_datetime(maxs, errors='coerce')
        dest_df.loc[idx, 'day_span']  = np.asarray(spans, dtype='int32')
        dest_df.loc[idx, 'has_1111']  = np.asarray(flags, dtype='int8')

        print(f"Fechas {start:,}..{end:,} OK")

In [29]:
# --- elegir columna fuente de logs en los originales ---
SRC_TRAIN = 'activity_log_norm' if 'activity_log_norm' in data_train.columns else 'activity_log'
SRC_TEST  = 'activity_log_norm' if 'activity_log_norm' in data_test.columns  else 'activity_log'

# --- aplicar SOLO a las columnas malas, en bloque ---
fill_dates_inplace(data_train_expanded, data_train[SRC_TRAIN], batch=200_000)
fill_dates_inplace(data_test_expanded,  data_test[SRC_TEST],  batch=200_000)

# verificación rápida
print("train non-null date_min:", data_train_expanded['date_min'].notna().sum())
print("test  non-null date_min:",  data_test_expanded['date_min'].notna().sum())

Fechas 0..200,000 OK
Fechas 200,000..400,000 OK
Fechas 400,000..600,000 OK
Fechas 600,000..800,000 OK
Fechas 800,000..1,000,000 OK
Fechas 1,000,000..1,200,000 OK
Fechas 1,200,000..1,400,000 OK
Fechas 1,400,000..1,600,000 OK
Fechas 1,600,000..1,800,000 OK
Fechas 1,800,000..2,000,000 OK
Fechas 2,000,000..2,200,000 OK
Fechas 2,200,000..2,400,000 OK
Fechas 2,400,000..2,600,000 OK
Fechas 2,600,000..2,800,000 OK
Fechas 2,800,000..3,000,000 OK
Fechas 3,000,000..3,200,000 OK
Fechas 3,200,000..3,400,000 OK
Fechas 3,400,000..3,600,000 OK
Fechas 3,600,000..3,800,000 OK
Fechas 3,800,000..4,000,000 OK
Fechas 4,000,000..4,200,000 OK
Fechas 4,200,000..4,400,000 OK
Fechas 4,400,000..4,600,000 OK
Fechas 4,600,000..4,800,000 OK
Fechas 4,800,000..5,000,000 OK
Fechas 5,000,000..5,200,000 OK
Fechas 5,200,000..5,400,000 OK
Fechas 5,400,000..5,600,000 OK
Fechas 5,600,000..5,800,000 OK
Fechas 5,800,000..6,000,000 OK
Fechas 6,000,000..6,200,000 OK
Fechas 6,200,000..6,400,000 OK
Fechas 6,400,000..6,600,000 OK
F

In [30]:
display(data_train_expanded.head())
display(data_test_expanded.head())

Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_len,actions_0,actions_2,actions_3,unique_items,unique_categories,unique_brands,date_min,date_max,day_span,has_1111
0,34176,6,0,944,-1,1,1,0,0,1,1,1,2014-11-07,2014-11-07,0,0
1,34176,6,0,412,-1,8,8,0,0,7,4,1,2014-08-18,2014-10-31,74,0
2,34176,6,0,1945,-1,7,7,0,0,3,2,1,2014-08-18,2014-08-20,2,0
3,34176,6,0,4752,-1,1,1,0,0,1,1,1,2014-10-27,2014-10-27,0,0
4,34176,6,0,643,-1,1,0,0,1,1,1,1,2014-10-24,2014-10-24,0,0


Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_len,actions_0,actions_2,actions_3,unique_items,unique_categories,unique_brands,date_min,date_max,day_span,has_1111
0,163968,0,0,4378,-1.0,1,1,0,0,1,1,1,2014-06-14,2014-06-14,0,0
1,163968,0,0,2300,-1.0,11,9,2,0,4,1,1,2014-06-18,2014-11-11,146,1
2,163968,0,0,1551,-1.0,5,4,1,0,1,1,1,2014-06-27,2014-06-27,0,0
3,163968,0,0,4343,-1.0,1,1,0,0,1,1,1,2014-06-28,2014-06-28,0,0
4,163968,0,0,4911,-1.0,1,1,0,0,1,1,1,2014-06-12,2014-06-12,0,0


In [32]:
data_train_expanded.to_csv('../data/train_clean.csv', index=False) 
data_test_expanded.to_csv('../data/test_clean.csv', index=False)