## 1 - 加载数据集并提取特征

In [8]:
import numpy as np
import pandas as pd
from src.data.load import load_dataset
from src.data.process import extract_targets
from src.data.features import FeatureExtraction

# Load the dataset; the result is indexed below by the keys
# 'all_train', 'all_uid', 'valid' and 'test'.
data = load_dataset()

# Must run before the extract_features() calls below; its return value is unused.
# NOTE(review): presumably this stores per-user statistics on FeatureExtraction
# that extract_features() reads afterwards — confirm in src.data.features.
# NOTE(review): if data['valid'] overlaps data['all_train'], the validation score
# further down is optimistic — confirm how load_dataset() builds the splits.
FeatureExtraction.extract_user_feature(data['all_train'], data['all_uid'])
features_train = FeatureExtraction.extract_features(data['all_train'])
features_valid = FeatureExtraction.extract_features(data['valid'])
features_test = FeatureExtraction.extract_features(data['test'])
print('feature dim:', features_train.shape[1])
# Training targets are requested in 'log' mode and validation targets in 'linear'
# mode; predictions are mapped back with exp_interactions() before scoring.
targets_train = extract_targets(data['all_train'], 'log')
targets_valid = extract_targets(data['valid'], 'linear')

Counting User Histories: 100%|██████████| 1229618/1229618 [00:31<00:00, 38639.14it/s]


feature dim: 67


## 2 - 超参数搜索 & 模型拟合

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from scipy.stats import randint, uniform

# Base estimator whose hyper-parameters are searched.
model = xgb.XGBRegressor()

# Sampling distributions for the random search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans
# [loc, loc + scale].
param_dist = {
    'max_depth': randint(3, 16),          # integers 3..15
    'learning_rate': uniform(0.01, 0.1),  # floats in [0.01, 0.11]
    'n_estimators': randint(80, 200),     # integers 80..199
    # 'min_child_weight': randint(1, 10)
}

# Run the randomized search. A fixed random_state makes the sampled candidates
# (and therefore the reported best parameters/score) repeatable across runs.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_result = random_search.fit(features_train, targets_train)

# Report the best parameter combination and its cross-validated score.
print("Best Parameters: ", random_result.best_params_)
print("Best Score: ", random_result.best_score_)

In [2]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so bootstrap sampling and feature sub-sampling are repeatable;
# otherwise the best parameters/score printed below change from run to run.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Parameter grid: 3 x 3 combinations of n_estimators and max_depth, with the
# remaining parameters held at a single value each.
param_grid = {
    'n_estimators': [50, 80, 120],
    'max_depth': [None, 10, 20],
    'min_samples_split': [10],
    'min_samples_leaf': [4],
    'max_features': ['sqrt']
}

# Run the exhaustive grid search with 3-fold cross-validation.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_result = grid_search.fit(features_train, targets_train)

# Report the best parameter combination and its cross-validated score.
print("Best Parameters: ", grid_result.best_params_)
print("Best Score: ", grid_result.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Parameters:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 120}
Best Score:  0.5917448449583423


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Final model: gradient-boosted trees with fixed hyper-parameters.
# Alternatives that were tried and left disabled:
#   LinearRegression()
#   RandomForestRegressor(n_estimators=20, n_jobs=-1)
#   RandomForestRegressor(n_estimators=100, max_depth=30, min_samples_leaf=5,
#                         min_samples_split=10, max_features='sqrt', n_jobs=-1)
model = xgb.XGBRegressor(
    learning_rate=0.1,
    max_depth=8,
    n_estimators=128,
    n_jobs=-1,
).fit(features_train, targets_train)  # fit() returns the fitted estimator itself

# Predict on the validation features; the bare expression displays the shape.
predicts = model.predict(features_valid)
predicts.shape

(184937, 3)

## 3 - 验证集 & 测试集 & 提交结果

In [13]:
from src.metric import compute_metrics

# Undo the log transform on model outputs before computing metrics.
def exp_interactions(x: np.ndarray) -> np.ndarray:
    """Map log-space predictions back to non-negative integer counts.

    Computes ``exp(x) - 1`` element-wise, rounds to the nearest integer and
    clamps the result at zero.

    Args:
        x: Array of predictions in log space (any shape).

    Returns:
        Integer array with the same shape as ``x``; every entry is >= 0.
    """
    # expm1 is the numerically accurate form of exp(x) - 1 for small |x|.
    rounded = np.rint(np.expm1(x))
    # A regressor can output values below log(0.5), which would round to -1;
    # a count cannot be negative, so floor the result at 0.
    return np.maximum(rounded, 0).astype(int)
# Score the validation predictions. Model outputs are converted back to integer
# counts first, because targets_valid was extracted in 'linear' mode.
compute_metrics(exp_interactions(predicts), targets_valid)

{'score': 0.3709332048892975}

In [14]:
# Convert the test-set predictions back to integer counts.
test_set_predicts = exp_interactions(model.predict(features_test))

# Pull the id columns out once as arrays. Rows are paired with predictions by
# position, so this does not rely on the frame having a 0..n-1 index and avoids
# a label lookup per row.
test_uids = data['test']['uid'].to_numpy()
test_mids = data['test']['mid'].to_numpy()
assert len(test_set_predicts) == len(test_uids), "prediction/test row count mismatch"

# One line per sample: uid <TAB> mid <TAB> columns 1, 2, 0 of the prediction row.
# NOTE(review): the 1,2,0 order presumably maps the model's target order onto the
# order the submission format expects — confirm against extract_targets.
results = [
    "{}\t{}\t{},{},{}\n".format(uid, mid, row[1], row[2], row[0])
    for uid, mid, row in zip(test_uids, test_mids, test_set_predicts)
]

# Explicit encoding so the file content does not depend on the platform locale.
with open("submission.txt", 'w', encoding='utf-8') as f:
    f.writelines(results)