In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# 读取数据
trade_df = pd.read_csv('./trade.csv')
train_df = pd.read_csv('./user_train.csv')
test_df = pd.read_csv('./user_test.csv')

from sklearn.preprocessing import MinMaxScaler, StandardScaler

'''
# 使用 Min-Max 标准化
min_max_scaler = MinMaxScaler()
trade_df['time_normalized'] = min_max_scaler.fit_transform(trade_df[['time']])
trade_df

'''
# 使用 Z-score 标准化
z_score_scaler = StandardScaler()
trade_df['time_normalized'] = z_score_scaler.fit_transform(trade_df[['time']])

def trade_data_processing(trade_df):
    trade_count = trade_df.groupby('bidder_id').size().reset_index(name='trade_count')
    trade_count = trade_count.reset_index(drop=True)

    device_counts = trade_df.groupby(['bidder_id', 'device']).size()
    device_counts = device_counts.reset_index(name='device_counts')
    most_used_devices = device_counts.loc[device_counts.groupby('bidder_id')['device_counts'].idxmax()]
    most_used_devices = most_used_devices.rename(columns={"device": "most_used_device"}).reset_index(drop=True)

    country_counts = trade_df.groupby(['bidder_id', 'country']).size()
    country_counts = country_counts.reset_index(name='country_counts')
    most_used_countries = country_counts.loc[country_counts.groupby('bidder_id')['country_counts'].idxmax()]
    most_used_countries = most_used_countries.rename(columns={"country": "most_used_country"}).reset_index(drop=True)

    merchandise_counts = trade_df.groupby(['bidder_id', 'merchandise']).size()
    merchandise_counts = merchandise_counts.reset_index(name='merchandise_counts')
    most_merchandise = merchandise_counts.loc[merchandise_counts.groupby('bidder_id')['merchandise_counts'].idxmax()]
    most_merchandise = most_merchandise.rename(columns={"merchandise":"most_merchandise"}).reset_index(drop=True)
    
    ip_counts = trade_df.groupby(['bidder_id', 'ip']).size()
    ip_counts = ip_counts.reset_index(name='ip_counts')
    most_used_ip = ip_counts.loc[ip_counts.groupby('bidder_id')['ip_counts'].idxmax()]
    most_used_ip = most_used_ip.rename(columns={"ip":"most_used_ip"}).reset_index(drop=True)
     
    url_counts = trade_df.groupby(['bidder_id', 'url']).size()
    url_counts = url_counts.reset_index(name='url_counts')
    most_used_url = url_counts.loc[url_counts.groupby('bidder_id')['url_counts'].idxmax()]
    most_used_url = most_used_url.rename(columns={"url":"most_used_url"}).reset_index(drop=True)

    average_times = trade_df.groupby('bidder_id')['time_normalized'].mean()
    average_times = average_times.reset_index(name='average_times')

    feature_df = trade_count.merge(most_used_devices, on='bidder_id', how='left').reset_index(drop=True)
    feature_df = feature_df.merge(most_used_countries, on='bidder_id', how='left').reset_index(drop=True)
    feature_df = feature_df.merge(average_times, on='bidder_id', how='left').reset_index(drop=True)
    feature_df = feature_df.merge(most_merchandise, on='bidder_id', how='left').reset_index(drop=True)
    feature_df = feature_df.merge(most_used_ip, on='bidder_id', how='left').reset_index(drop=True)
    feature_df = feature_df.merge(most_used_url, on='bidder_id', how='left').reset_index(drop=True)

    return feature_df

user_feature_df = trade_data_processing(trade_df)
train_df = train_df.merge(user_feature_df, on='bidder_id', how='left')
test_df = test_df.merge(user_feature_df, on='bidder_id', how='left')

def fillNAN(df):
    num_columns = df.select_dtypes(include=["number", "bool"]).columns
    cat_columns = df.select_dtypes(exclude=["number", "bool"]).columns

    for col in num_columns:
        mean_value = df[col].mean()
        df[col].fillna(mean_value, inplace=True)

    for col in cat_columns:
        mode_value = df[col].mode()[0]
        df[col].fillna(mode_value, inplace=True)

    return df

train_df = fillNAN(train_df)
test_df = fillNAN(test_df)

def remove_outliers_box(df):
    cleaned_df = df.copy()  

    # 处理连续型特征
    #continuous_features = df.drop(columns=['outcome']).select_dtypes(include=['number']).columns
    连续型特征 = df.drop(columns=['outcome']).select_dtypes(include=['number']).columns
    for column in 连续型特征:
        # 使用箱线图计算上下边缘
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 3 * iqr
        upper = q3 + 3 * iqr

        # 去除异常值
        df[column] = np.where(df[column] > upper, upper, df[column])
        df[column] = np.where(df[column] < lower, lower, df[column])

    # 处理标称特征（如果需要的话）

    return cleaned_df
train_df = remove_outliers_box(train_df)

# 存储卡方检验结果的字典
chi2_results = {}

# 选择标称属性列
categorical_columns = ['most_used_device', 'most_used_country', 'most_merchandise','most_used_ip','most_used_url']

# 对每个标称属性进行卡方检验
for column in categorical_columns:  # 忽略最后一列标签列
    contingency_table = pd.crosstab(train_df[column], train_df['outcome'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_results[column] = p

# 将结果转换为 DataFrame 并按 p 值排序
chi2_df = pd.DataFrame(list(chi2_results.items()), columns=['Feature', 'p_value'])
chi2_df = chi2_df.sort_values(by='p_value')

# 输出结果
print("卡方检验结果 (按 p 值排序):")
for index, row in chi2_df.iterrows():
    print(f"{row['Feature']} 与标签的p值: {row['p_value']}")


# 判断并输出有显著相关性的特征
alpha = 0.05
significant_features = [feature for feature, p_value in chi2_results.items() if p_value < alpha]

if significant_features:
    print("\n与标签有显著相关性的标称属性有：")
    for feature in significant_features:
        print(feature)
else:
    print("\n没有标称属性与标签之间有显著的相关性。")

import seaborn as sns
import matplotlib.pyplot as plt
# 选择数值型和布尔型数据
num_features = train_df.select_dtypes(include=["number", "bool"]).copy()
num_features = num_features.apply(lambda x: int(x) if isinstance(x, bool) else x)
# 计算相关系数矩阵
corrmat = num_features.corr()

# 选择与目标变量 'outcome' 相关性最大的5个属性
k = 8
cols = corrmat.nlargest(k + 1, 'outcome')['outcome'].index  # 加1是因为包括了目标变量本身
cm = np.corrcoef(train_df[cols].values.T)

# 画出heatmap
sns.set_context("notebook", font_scale=0.75)
plt.figure(figsize=(6, 4))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 8}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

# 打印相关性最大的5个属性
print("与标签 'outcome' 相关性最大的8个属性：")
print(cols[1:])

# 合并训练集和测试集
combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])
# 进行独热编码
# 'most_used_device', 'most_used_ip','most_used_country','most_merchandise','most_used_url'
categorical_features = ['most_used_country','most_used_url']
combined_encoded = pd.get_dummies(combined_df, columns=categorical_features)

# 拆分回训练集和测试集
train_encoded = combined_encoded.xs('train')
test_encoded = combined_encoded.xs('test')
train_encoded


for column in train_df.columns:
    if train_df[column].dtype == bool:
        train_df[column] = train_df[column].astype(np.float64)
for column in test_df.columns:
    if test_df[column].dtype == bool:
        test_df[column] = test_df[column].astype(np.float64)

#删去非数值属性的列用于训练
columns1 = train_encoded.select_dtypes(exclude=["number", "bool"]).columns
train = train_encoded.drop(columns=columns1)
test = test_encoded.drop(columns=columns1)
test = test.drop(columns=['outcome'])

from sklearn.model_selection import GridSearchCV

# 定义要调优的参数网格
param_grid1 = {
    'n_estimators': [50, 100, 150],
    'max_features': [2, 3, 4],
    'criterion': ['gini', 'entropy']
}

# 创建随机森林分类器
clf1 = RandomForestClassifier()

# 实例化 GridSearchCV 对象
grid_search1 = GridSearchCV(clf1, param_grid1, cv=5)

# 使用网格搜索对模型进行参数调优
grid_search1.fit(X, Y)

# 输出最佳参数组合和对应的评分
print("最佳参数组合: ", grid_search1.best_params_)
print("最佳准确率: ", grid_search1.best_score_)

# 使用最佳参数训练模型并进行评估
best_clf1 = grid_search1.best_estimator_
scores = cross_val_score(best_clf1, X, Y, cv=5)
print('调优后RF准确率：', scores.mean())

# 将train_df划分为数据集和训练集来训练模型
train_df_train, train_df_test = train_test_split(train, test_size=0.3, random_state=0)
y_real = train_df_test['outcome']
X_train_df_test = train_df_test.drop(columns=['outcome']).select_dtypes(include=["number", "bool"])
selected_columns_df = X_train_df_test.loc[:, ['trade_count','average_times']]
# 预测结果
y_pred1 = best_clf1.predict(X_train_df_test)
# 计算AUC
auc1 = roc_auc_score(y_real, y_pred1)
print("Valid aUC1: ", auc1)
# 预测test_df数据集
X_test = test.select_dtypes(include=["number", "bool"])
y_test1 = best_clf1.predict_proba(X_test)[:, 1]


# 保存预测结果
test_df['prediction'] = y_test1
test_df[['prediction']].to_csv('result.csv', index=False, header=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

ValueError: could not convert string to float: 'aee383561c0019dc01552bfa5263af8etbuc7'