In [None]:
import pandas as pd
import numpy as np
from google.colab import drive

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Directory that holds the input CSV files (a fixed absolute path).
DATA_PATH = "/content/"

# 1. Load train_labels (semicolon-separated) and show its shape and first rows.
train_labels = pd.read_csv(DATA_PATH + "train_labels.csv", sep=";")
print(f"train_labels: {train_labels.shape}")
print(train_labels.head())

# 2. Загрузка train и test с обработкой user_agent
def parse_user_agent(ua_str):
    """Turn a stringified user-agent dict into a real ``dict``.

    The raw text is first handed to ``literal_eval`` unchanged, so ``None``
    values stay ``None`` and values containing apostrophes are parsed
    correctly. Only if that fails is the legacy quote/``None`` normalisation
    tried as a fallback.

    Args:
        ua_str: Raw cell value. Usually a ``str``; an already-parsed ``dict``
            is returned as is, which makes re-applying the parser harmless.

    Returns:
        The parsed ``dict``. When the value is not a string, cannot be parsed,
        or does not evaluate to a dict, a dict with the keys ``browser``,
        ``browser_version``, ``os`` and ``os_version`` all set to ``None``.
    """
    if isinstance(ua_str, dict):
        return ua_str

    if isinstance(ua_str, str):
        # Raw text first; the rewritten text is only a best-effort fallback
        # because it converts None into the string "None".
        candidates = (
            ua_str,
            ua_str.replace("'", '"').replace('None', '"None"'),
        )
        for text in candidates:
            try:
                parsed = literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed

    # A fresh dict per call so callers can never share mutable state.
    return {'browser': None, 'browser_version': None, 'os': None, 'os_version': None}

# Read the request logs; on_bad_lines='skip' is passed so malformed rows do not abort the read.
train = pd.read_csv(DATA_PATH + "train.csv", sep=";", on_bad_lines='skip')
test = pd.read_csv(DATA_PATH + "test.csv", sep=";", on_bad_lines='skip')

# Apply the parser to user_agent (each cell becomes a dict)
train['user_agent'] = train['user_agent'].apply(parse_user_agent)
test['user_agent'] = test['user_agent'].apply(parse_user_agent)

print(f"\ntrain: {train.shape}")
print(train.head())
print(f"\ntest: {test.shape}")
print(test.head())

# 3. Load referer_vectors
referer_vectors = pd.read_csv(DATA_PATH + "referer_vectors.csv", sep=";")
print(f"\nreferer_vectors: {referer_vectors.shape}")
print(referer_vectors.head())

# 4. Load geo_info (every column is read as a string because of dtype=str)
geo_info = pd.read_csv(DATA_PATH + "geo_info.csv", sep=";", dtype=str)
print(f"\ngeo_info: {geo_info.shape}")
print(geo_info.head())

# 5. Load test_users
test_users = pd.read_csv(DATA_PATH + "test_users.csv", sep=";")
print(f"\ntest_users: {test_users.shape}")
print(test_users.head())

# 6. Split train_labels into separate columns.
# Only runs when the file was parsed as a single 'user_id;target' column.
if 'user_id;target' in train_labels.columns:
    train_labels[['user_id', 'target']] = train_labels['user_id;target'].str.split(';', expand=True)
    train_labels = train_labels[['user_id', 'target']]
    train_labels['target'] = train_labels['target'].astype(int)

print("\nОбновленный train_labels:")
print(train_labels.head())

train_labels: (500000, 2)
                            user_id  target
0  fb858e8e0a2bec074450eaf94b627fd3       0
1  46a5f128fd569c764a92c2eaa788095e       0
2  5a74e9ac53ffb21a20cce117c0ad77ba       0
3  af735816ca19115431ae3d89518c8c91       0
4  364f0ae0a3f29a685c4fb5bae6033b9a       0

train: (321150, 5)
   request_ts                           user_id                   referer  \
0  1701011363  fb858e8e0a2bec074450eaf94b627fd3          https://9b48ee5/   
1  1700986581  46a5f128fd569c764a92c2eaa788095e          https://9b48ee5/   
2  1701011071  5a74e9ac53ffb21a20cce117c0ad77ba  https://9634fd0/1409e548   
3  1700992803  af735816ca19115431ae3d89518c8c91          https://9b48ee5/   
4  1701021666  364f0ae0a3f29a685c4fb5bae6033b9a          https://9b48ee5/   

   geo_id                                         user_agent  
0    4799  {'browser': 'Chrome Mobile', 'browser_version'...  
1    8257  {'browser': 'Chrome Mobile', 'browser_version'...  
2    3150  {'browser': 'Yandex Browser

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Улучшенная обработка user_agent
def process_user_agent(df):
    """Expand the ``user_agent`` dict column into flat feature columns.

    Args:
        df: Frame with a ``user_agent`` column holding dicts (keys such as
            ``browser``, ``browser_version``, ``os``, ``os_version``).

    Returns:
        A new frame (the input is not modified) without ``user_agent`` and
        with the expanded dict keys plus ``is_ios``, ``os_version_major`` and
        one ``browser_<name>`` 0/1 column per browser in the list below.
        The row index of ``df`` is preserved.
    """
    # Non-dict cells would make json_normalize raise; treat them as "no data".
    records = [ua if isinstance(ua, dict) else {} for ua in df['user_agent']]
    df_ua = pd.json_normalize(records)
    # json_normalize numbers rows 0..n-1; realign with df so the column-wise
    # concat below matches rows one-to-one even for a non-default index.
    df_ua.index = df.index
    # Guarantee the columns used below exist even if no record carried them.
    for key in ('browser', 'browser_version', 'os', 'os_version'):
        if key not in df_ua.columns:
            df_ua[key] = None

    df = pd.concat([df.drop('user_agent', axis=1), df_ua], axis=1)

    # Hypothesis: more women on iOS
    df['is_ios'] = (df['os'] == 'iOS').astype(int)

    def safe_version_convert(version):
        """Return the leading integer of a dotted version string, or -1."""
        if pd.isna(version):
            return -1
        parts = str(version).split('.')
        if parts and parts[0].isdigit():
            return int(parts[0])
        return -1

    df['os_version_major'] = df['os_version'].apply(safe_version_convert)

    # One 0/1 flag per popular browser.
    # NOTE(review): the comparison is an exact match, so e.g. 'Chrome Mobile'
    # does not set browser_chrome - confirm this is intended.
    browsers = ['Safari', 'Chrome', 'Firefox', 'Edge', 'Opera', 'Yandex Browser']
    for browser in browsers:
        flag_name = f'browser_{browser.lower().replace(" ", "_")}'
        df[flag_name] = (df['browser'] == browser).astype(int)

    return df

# 2. Обработка времени (без изменений)
def process_timestamp(df):
    """Derive hour, weekday and part-of-day flags from ``request_ts``.

    ``request_ts`` is interpreted as seconds since the Unix epoch. The new
    columns are written onto ``df`` itself; the returned frame is a copy of
    it without the temporary ``datetime`` column.

    Returns:
        Frame with ``hour``, ``day_of_week`` and the 0/1 flags ``is_night``
        (hour < 6), ``is_morning`` (6-11), ``is_day`` (12-17), ``is_evening``
        (>= 18) and ``is_weekend`` (day_of_week >= 5).
    """
    df['datetime'] = pd.to_datetime(df['request_ts'], unit='s')
    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek

    hour = df['hour']
    # Insertion order of this dict fixes the order of the flag columns.
    part_of_day = {
        'is_night': hour < 6,
        'is_morning': (hour >= 6) & (hour < 12),
        'is_day': (hour >= 12) & (hour < 18),
        'is_evening': hour >= 18,
    }
    for flag_name, mask in part_of_day.items():
        df[flag_name] = mask.astype(int)

    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    return df.drop(columns='datetime')

# 3. Обработка referer (с обработкой ошибок)
def process_referer(df):
    """Split ``referer`` URLs into domain and path and score path depth.

    Adds three columns to ``df`` in place and returns the same frame:

    * ``referer_domain`` - third ``/``-separated piece of the URL, or ``''``.
    * ``referer_path``   - everything after the domain joined by ``/``, or ``''``.
    * ``url_complexity`` - number of ``/``-separated path pieces, 0 for an
      empty path.

    Non-string values (e.g. NaN) yield an empty domain and path. The work is
    done with plain Python lists rather than one ``pd.Series`` per row, which
    is far cheaper on large frames and also works on an empty frame.
    """
    def safe_url_split(url):
        """Return ``(domain, path)`` for a URL, ``('', '')`` if not a string."""
        if not isinstance(url, str):
            return '', ''
        parts = url.split('/')
        domain = parts[2] if len(parts) > 2 else ''
        path = '/'.join(parts[3:]) if len(parts) > 3 else ''
        return domain, path

    pairs = [safe_url_split(url) for url in df['referer']]
    # Lists are assigned positionally, so the frame's index does not matter.
    df['referer_domain'] = [domain for domain, _ in pairs]
    df['referer_path'] = [path for _, path in pairs]

    df['url_complexity'] = [
        len(path.split('/')) if path else 0 for _, path in pairs
    ]

    return df

# 4. Улучшенная обработка geo_info
def process_geo_info(geo_df):
    """Normalise ``region_id`` and ``timezone_id`` in the geo lookup table.

    ``region_id`` values written in exponential notation (containing ``E+``)
    are converted to plain integer strings; missing or unparseable values
    become ``'unknown'``. Missing ``timezone_id`` values are also replaced by
    ``'unknown'``. The frame is modified in place and returned.
    """
    def convert_exp_value(x):
        """Return ``x`` as a plain string id, or ``'unknown'``."""
        # Check for missing values first: str(nan) would otherwise produce the
        # literal text 'nan' and hide the gap from any later fillna.
        if pd.isna(x):
            return 'unknown'
        text = str(x)
        if 'E+' not in text:
            return text
        try:
            return str(int(float(text)))
        except (ValueError, OverflowError):
            return 'unknown'

    geo_df['region_id'] = geo_df['region_id'].apply(convert_exp_value)
    geo_df['timezone_id'] = geo_df['timezone_id'].fillna('unknown')

    return geo_df

# Apply the per-table preprocessing steps.
#
# This cell previously also tried to merge geo_info / referer_vectors and to
# impute the component columns. That merge always raised
# "You are trying to merge on int64 and object columns for key 'geo_id'"
# (geo_info is read with dtype=str) and the broad except below swallowed it.
# The merge is carried out by the following cell, after geo_id has been cast
# to a numeric dtype, so it is no longer attempted here.
try:
    print("Обработка user_agent...")
    train = process_user_agent(train)
    test = process_user_agent(test)

    print("Обработка timestamp...")
    train = process_timestamp(train)
    test = process_timestamp(test)

    print("Обработка referer...")
    train = process_referer(train)
    test = process_referer(test)

    print("Обработка geo_info...")
    geo_info = process_geo_info(geo_info)

    print("✅ Предобработка завершена успешно!")
    print(f"Train колонки: {train.columns.tolist()}")

except Exception as e:
    # Report the failure with its traceback instead of stopping the notebook.
    print(f"❌ Ошибка при обработке: {e}")
    import traceback
    traceback.print_exc()

Обработка user_agent...
Обработка timestamp...
Обработка referer...
Обработка geo_info...
Объединение данных...
❌ Ошибка при обработке: You are trying to merge on int64 and object columns for key 'geo_id'. If you wish to proceed you should use pd.concat


Traceback (most recent call last):
  File "/tmp/ipython-input-3897611359.py", line 101, in <cell line: 0>
    train = train.merge(geo_info, on='geo_id', how='left')
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py", line 10832, in merge
    return merge(
           ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/reshape/merge.py", line 170, in merge
    op = _MergeOperation(
         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/reshape/merge.py", line 807, in __init__
    self._maybe_coerce_merge_keys()
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/reshape/merge.py", line 1508, in _maybe_coerce_merge_keys
    raise ValueError(msg)
ValueError: You are trying to merge on int64 and object columns for key 'geo_id'. If you wish to proceed you should use pd.concat


In [None]:
# Merge the geo and referer-vector lookup tables into train/test.
print("Объединение данных...")

# geo_info is loaded as strings while train/test carry an integer geo_id;
# cast the lookup key to a numeric dtype (unparseable values become NaN).
geo_info['geo_id'] = pd.to_numeric(geo_info['geo_id'], errors='coerce')


def _merge_once(frame, lookup, key):
    """Left-join ``lookup`` onto ``frame`` unless its columns are already there.

    Re-running the cell would otherwise merge a second time, produce
    ``_x``/``_y`` suffixed duplicates and break the column lookups below.
    """
    new_cols = [col for col in lookup.columns if col != key]
    if new_cols and set(new_cols).issubset(frame.columns):
        return frame
    return frame.merge(lookup, on=key, how='left')


# Attach the geo attributes
train = _merge_once(train, geo_info, 'geo_id')
test = _merge_once(test, geo_info, 'geo_id')

# Attach the vector representation of each referer URL
train = _merge_once(train, referer_vectors, 'referer')
test = _merge_once(test, referer_vectors, 'referer')

# Fill missing component values with the median of the respective frame
for i in range(10):
    comp_col = f'component{i}'
    train[comp_col] = train[comp_col].fillna(train[comp_col].median())
    test[comp_col] = test[comp_col].fillna(test[comp_col].median())

print("✅ Объединение данных выполнено успешно!")

Объединение данных...
✅ Объединение данных выполнено успешно!


In [None]:
def aggregate_user_features(df, n_components=10):
    """Collapse request-level rows into one feature row per ``user_id``.

    Named aggregation is used so every output column gets its final name
    directly. The previous lambda-based version relied on pandas' generated
    ``<lambda_0>`` labels, which are only produced when a column has several
    lambdas; ``main_os`` and ``main_country`` were therefore never created and
    surfaced as ``os_<lambda>`` / ``country_id_<lambda>`` instead.

    Args:
        df: Request-level frame containing ``user_id`` and the columns
            aggregated below.
        n_components: Number of ``component{i}`` columns to average
            (``component0`` .. ``component{n_components - 1}``).

    Returns:
        Frame with ``user_id`` as a regular column followed by the aggregated
        features and ``activity_variability``.
    """
    def most_frequent(values):
        """Most common value of a group, or 'unknown' when there is none."""
        modes = values.mode()  # computed once; mode() ignores missing values
        return modes.iloc[0] if not modes.empty else 'unknown'

    # output column -> (source column, aggregation); dict order = column order
    aggregations = {
        # Hypothesis: iOS users are more often women
        'ios_ratio': ('is_ios', 'mean'),
        'ios_count': ('is_ios', 'sum'),

        # user_agent features
        'browser_nunique': ('browser', 'nunique'),
        'main_browser': ('browser', most_frequent),
        'main_os': ('os', most_frequent),
        'os_version_major_mean': ('os_version_major', 'mean'),

        # time features
        'avg_hour': ('hour', 'mean'),
        'std_hour_activity': ('hour', 'std'),
        'is_night_mean': ('is_night', 'mean'),
        'is_morning_mean': ('is_morning', 'mean'),
        'is_day_mean': ('is_day', 'mean'),
        'is_evening_mean': ('is_evening', 'mean'),
        'weekend_ratio': ('is_weekend', 'mean'),

        # geo features
        'main_country': ('country_id', most_frequent),
        'region_id_nunique': ('region_id', 'nunique'),
        'timezone_id_nunique': ('timezone_id', 'nunique'),

        # referer features
        'referer_domain_nunique': ('referer_domain', 'nunique'),
        'url_complexity_mean': ('url_complexity', 'mean'),
    }
    # referer vector components
    aggregations.update({
        f'component{i}_mean': (f'component{i}', 'mean')
        for i in range(n_components)
    })

    agg_df = df.groupby('user_id').agg(**aggregations)

    # std is NaN for users with a single request; treat that as zero spread,
    # which maps to the maximum value 1.0.
    agg_df['activity_variability'] = 1 / (1 + agg_df['std_hour_activity'].fillna(0))

    return agg_df.reset_index()

# Build one feature row per user for the training requests.
print("Агрегация train...")
train_agg = aggregate_user_features(train)
# Same aggregation for the test requests.
print("Агрегация test...")
test_agg = aggregate_user_features(test)

# Report the shapes of both aggregated frames.
print(f"Размер train_agg: {train_agg.shape}, test_agg: {test_agg.shape}")

Агрегация train...
Агрегация test...
Размер train_agg: (291642, 30), test_agg: (131998, 30)


In [None]:
# Attach the target to the aggregated training features
# (inner join: only users present in both tables are kept).
train_final = train_agg.merge(train_labels, on='user_id')

# Keep only the categorical columns that actually exist in train_final
categorical_cols = [
    col for col in ['main_browser', 'main_os', 'main_country']
    if col in train_final.columns
]

# Encode the existing categorical columns as integer codes
for col in categorical_cols:
    # Factorize train and test together so both share one code table
    combined = pd.concat([train_final[col], test_agg[col]])
    encoded = pd.factorize(combined)[0]
    train_final[col] = encoded[:len(train_final)]
    test_agg[col] = encoded[len(train_final):]

# Fill the remaining gaps: median for numeric columns, 'unknown' for text.
# The result is assigned back to the column; `df[col].fillna(..., inplace=True)`
# operates on a temporary object, triggers a pandas FutureWarning and stops
# working in pandas 3.0.
for df in [train_final, test_agg]:
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        elif df[col].dtype == 'object':
            df[col] = df[col].fillna('unknown')

print("✅ Финальные датасеты готовы!")
print(f"Train final: {train_final.shape}, Test agg: {test_agg.shape}")
print(f"Колонки в train_final: {train_final.columns.tolist()}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


✅ Финальные датасеты готовы!
Train final: (291642, 31), Test agg: (131998, 30)
Колонки в train_final: ['user_id', 'ios_ratio', 'ios_count', 'browser_nunique', 'main_browser', 'os_<lambda>', 'os_version_major_mean', 'avg_hour', 'std_hour_activity', 'is_night_mean', 'is_morning_mean', 'is_day_mean', 'is_evening_mean', 'weekend_ratio', 'country_id_<lambda>', 'region_id_nunique', 'timezone_id_nunique', 'referer_domain_nunique', 'url_complexity_mean', 'component0_mean', 'component1_mean', 'component2_mean', 'component3_mean', 'component4_mean', 'component5_mean', 'component6_mean', 'component7_mean', 'component8_mean', 'component9_mean', 'activity_variability', 'target']


In [None]:
# Give the lambda-derived aggregate columns readable names
# (one shared mapping, applied to both frames).
lambda_column_names = {
    'os_<lambda>': 'main_os',
    'country_id_<lambda>': 'main_country'
}
train_final = train_final.rename(columns=lambda_column_names)
test_agg = test_agg.rename(columns=lambda_column_names)

# Columns to convert into integer codes
categorical_cols = ['main_os', 'main_country', 'main_browser']

n_train_rows = len(train_final)
for col in categorical_cols:
    # Factorize train and test as one sequence so the codes agree,
    # then cut the code array back into its train and test parts.
    codes = pd.factorize(pd.concat([train_final[col], test_agg[col]]))[0]
    train_final[col] = codes[:n_train_rows]
    test_agg[col] = codes[n_train_rows:]

# Show the dtype distribution after the conversion
print("\nПроверка типов данных после преобразования:")
print(train_final.dtypes.value_counts())


Проверка типов данных после преобразования:
float64    21
int64       9
object      1
Name: count, dtype: int64


In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Prepare the feature matrix and the target
X = train_final.drop(['user_id', 'target'], axis=1)
y = train_final['target']

# Stratified 80/20 split into training and validation parts
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Wrap both parts as LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Model parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42,
    'max_depth': 7,
    'min_data_in_leaf': 100
}

# Train with early stopping on the validation AUC
model = lgb.train(params,
                 train_data,
                 num_boost_round=1000,
                 valid_sets=[val_data],
                 callbacks=[
                     lgb.early_stopping(stopping_rounds=50, verbose=True),
                     lgb.log_evaluation(period=50)
                 ])

# Score the validation part
val_pred = model.predict(X_val)
auc_score = roc_auc_score(y_val, val_pred)
print(f"\n✅ AUC на валидации: {auc_score:.4f}")

# Feature importance as reported by the trained model
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importance()
}).sort_values('Importance', ascending=False)

print("\nТоп-15 важных признаков:")
print(feature_importance.head(15))

# Predict for the test users.
# Select exactly the training feature columns, in training order: dropping only
# 'user_id' would also pass 'predicted_target' to the model once this cell has
# been run before, because that column is added to test_agg just below.
test_pred = model.predict(test_agg[X.columns])
test_agg['predicted_target'] = test_pred

# Build the submission: probabilities above 0.5 become class 1
submission = test_agg[['user_id', 'predicted_target']].rename(columns={'predicted_target': 'target'})
submission['target'] = (submission['target'] > 0.5).astype(int)

# Save the result
submission.to_csv('gender_prediction_submission.csv', index=False)
print("\n✅ Результаты сохранены в gender_prediction_submission.csv")

Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.858438
[100]	valid_0's auc: 0.870129
[150]	valid_0's auc: 0.87553
[200]	valid_0's auc: 0.879076
[250]	valid_0's auc: 0.881123
[300]	valid_0's auc: 0.883008
[350]	valid_0's auc: 0.884346
[400]	valid_0's auc: 0.885168
[450]	valid_0's auc: 0.885978
[500]	valid_0's auc: 0.886645
[550]	valid_0's auc: 0.887383
[600]	valid_0's auc: 0.887841
[650]	valid_0's auc: 0.888217
[700]	valid_0's auc: 0.888589
[750]	valid_0's auc: 0.889034
[800]	valid_0's auc: 0.88919
[850]	valid_0's auc: 0.889566
[900]	valid_0's auc: 0.889875
[950]	valid_0's auc: 0.890115
[1000]	valid_0's auc: 0.890404
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.890404

✅ AUC на валидации: 0.8904

Топ-15 важных признаков:
                  Feature  Importance
20        component2_mean        4293
21        component3_mean        4011
26        component8_mean        3829
19        component1_mean        3823
24        componen

In [None]:
import joblib
from lightgbm import LGBMClassifier

# 1. Re-create the model on all labelled data
# (reuses the parameters stored on the trained booster)
# NOTE(review): confirm that model.params carries the number of boosting
# rounds; if not, LGBMClassifier falls back to its own default - TODO verify.
# NOTE(review): the submission file above was produced by `model`, while the
# estimator saved here is the separately fitted `final_model`.
best_params = model.params
final_model = LGBMClassifier(**best_params)
final_model.fit(X, y)  # fit on the full training set

# 2. Save the model in .joblib format
model_filename = 'gender_prediction_model.joblib'
joblib.dump(final_model, model_filename)
print(f"✅ Модель сохранена как {model_filename}")

# 3. Save the list of feature names (important for reproducibility)
import json
feature_names = list(X.columns)
with open('feature_names.json', 'w') as f:
    json.dump(feature_names, f)
print(f"✅ Список признаков сохранен как feature_names.json")

✅ Модель сохранена как gender_prediction_model.joblib
✅ Список признаков сохранен как feature_names.json


In [None]:
# Bundle the saved model, the feature list, the submission and the notebook
# into a single archive.
# NOTE(review): the recorded output lists only three added files - confirm
# that VK.ipynb is present in the working directory when this runs.
!zip -r vk_gender_prediction_solution.zip \
    gender_prediction_model.joblib \
    feature_names.json \
    gender_prediction_submission.csv \
    VK.ipynb

  adding: gender_prediction_model.joblib (deflated 59%)
  adding: feature_names.json (deflated 62%)
  adding: gender_prediction_submission.csv (deflated 46%)
