In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from datetime import datetime
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
# Load both CSV files from the current working directory (train first, then test)
# and preview the first rows of the training frame.
train_df1, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
train_df1.head()

In [20]:
def calculate_age_at_trans(birthdate_str, trans_date_str):
    """Return the age in completed years on the transaction date.

    Args:
        birthdate_str: Date of birth formatted as ``dd/mm/YYYY``.
        trans_date_str: Transaction timestamp whose first whitespace-separated
            token is a ``dd/mm/YYYY`` date; anything after that token is ignored.

    Returns:
        int: Number of whole years between the birth date and the transaction date.

    Raises:
        ValueError: If either date does not match ``dd/mm/YYYY``.
    """
    date_format = "%d/%m/%Y"
    born = datetime.strptime(birthdate_str, date_format)
    date_token = trans_date_str.split()[0]
    transacted = datetime.strptime(date_token, date_format)

    years = transacted.year - born.year
    # One year less when the birthday has not yet been reached in the transaction year.
    if (transacted.month, transacted.day) < (born.month, born.day):
        years -= 1
    return years

# Add the age (in whole years) at the time of each transaction.
train_df1['age_at_trans'] = train_df1.apply(
    lambda row: calculate_age_at_trans(row['dob'], row['trans_date_trans_time']),
    axis=1,
)

# Mean transaction amount per category, kept as a small lookup table.
# NOTE(review): the mean is taken over every row of train_df1, so it also sees rows
# that are later held out for validation — confirm this is acceptable.
category_avg_amt = train_df1.groupby('category')['amt'].mean().reset_index(name='avg_amt')

# Attach the category mean with a lookup instead of DataFrame.merge. A merge adds
# suffixed duplicates ('avg_amt_x' / 'avg_amt_y') when this cell is run a second
# time, after which train_df1['avg_amt'] raises KeyError. The lookup simply
# overwrites the column and leaves the row order and index untouched, so the cell
# can be re-run safely.
train_df1['avg_amt'] = train_df1['category'].map(
    category_avg_amt.set_index('category')['avg_amt']
)

# Difference between each transaction amount and its category mean.
train_df1['amt_diff_from_category_avg'] = train_df1['amt'] - train_df1['avg_amt']

# Preview the first rows of the updated frame.
train_df1.head()


In [None]:
# Mean transaction amount per job, kept as a small lookup table.
job_avg_amt = train_df1.groupby('job')['amt'].mean().reset_index(name='job_avg_amt')

# Attach the job mean with a lookup instead of DataFrame.merge. A merge adds
# suffixed duplicates ('job_avg_amt_x' / 'job_avg_amt_y') when this cell is run a
# second time, after which train_df1['job_avg_amt'] raises KeyError. The lookup
# overwrites the column and keeps the row order and index, so re-running is safe.
train_df1['job_avg_amt'] = train_df1['job'].map(
    job_avg_amt.set_index('job')['job_avg_amt']
)

# Difference between each transaction amount and the mean amount for its job.
train_df1['amt_diff_from_job_avg'] = train_df1['amt'] - train_df1['job_avg_amt']
train_df1.head()

In [26]:
# Straight-line separation between the ('lat', 'long') and ('merch_lat', 'merch_long')
# points, computed directly on the raw coordinate values (so it is expressed in the
# same units as those columns, not in kilometres).
train_df1['distance'] = np.sqrt(
    (train_df1['merch_lat'] - train_df1['lat'])**2
    + (train_df1['merch_long'] - train_df1['long'])**2
)

# Integer-encode the text columns. pd.factorize numbers values in order of first
# appearance and uses -1 for missing values.
train_df1['category_code'] = pd.factorize(train_df1['category'])[0]

# Integer codes for the 'merchant' column.
train_df1['merchant_code'] = pd.factorize(train_df1['merchant'])[0]

# Integer codes for the 'job' and 'gender' columns.
train_df1['job_code'] = pd.factorize(train_df1['job'])[0]
train_df1['gender_code'] = pd.factorize(train_df1['gender'])[0]

# Bucket the city population into five quantile bins labelled 1-5.
# The source frame must be train_df1: no variable called `train_df` is defined in
# this notebook, so referencing it raises NameError on a fresh kernel.
train_df1['city_category'] = pd.qcut(train_df1['city_pop'], 5, labels=[1, 2, 3, 4, 5])


In [None]:

# Keep only the rows whose 'is_fraud' label is present; these rows supply both the
# training and the validation split.
df_train = train_df1[train_df1['is_fraud'].notna()]

# All numeric columns except the target 'is_fraud'.
features = df_train.select_dtypes(include=[np.number]).columns.drop('is_fraud')

# Columns actually fed to the models.
specified_columns = [
    'unix_time',
    'amt',
    'zip',
    'age_at_trans',
    'lat',
    'long',
    'merch_lat',
    'merch_long',
    'city_category',
    'category_code',
    'merchant_code',
    'job_code',
    'cc_num',
    'job_avg_amt',
    'amt_diff_from_category_avg',
]

# Feature matrix and target vector.
X = df_train[specified_columns]
y = df_train['is_fraud']

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the statistics are learned from the training split only
# and then applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)



In [None]:
pip install xgboost

In [None]:

# Decision-tree model.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_val_scaled)
# The value printed is the F1 score on the validation split, so label it as F1
# (it was previously labelled "Accuracy").
print("Decision Tree F1:", f1_score(y_val, y_pred_dt))



# Note: this is only an example; real model parameters should be tuned to the problem.


In [None]:
import xgboost as xgb
# XGBoost classifier (XGBRegressor would be the choice for a regression problem).
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm against the installed version.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_scaled, y_train)



# Evaluate on the validation split. The predictions get their own name so that
# `y_pred_dt` keeps holding the decision-tree predictions instead of being
# silently overwritten by the XGBoost ones.
y_pred_xgb = model.predict(X_val_scaled)
print("XGBC:", f1_score(y_val, y_pred_xgb))


In [None]:
from sklearn.model_selection import GridSearchCV
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Parameter grid to search: 3 * 3 * 3 * 2 * 2 = 108 combinations, each fitted once
# per cross-validation fold.
param_grid = {
    'n_estimators': [100, 200, 300],  # number of trees
    'learning_rate': [0.01, 0.05, 0.1],  # learning rate
    'max_depth': [3, 4, 5],  # tree depth
    'colsample_bytree': [0.7, 0.8],  # fraction of columns sampled per tree
    'subsample': [0.7, 0.8]  # fraction of rows sampled when building each tree
}

# Build the GridSearchCV object.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           scoring='f1',  # evaluation metric
                           cv=5,  # number of cross-validation folds
                           verbose=1,  # report progress
                           n_jobs=-1)  # use all CPU cores

# Run the grid search.
grid_search.fit(X_train_scaled, y_train)

# Report the best parameters and their mean cross-validated score. The search is
# scored with 'f1', so best_score_ is an F1 score, not an accuracy.
print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score found: ", grid_search.best_score_)

In [None]:
# Rows of the engineered frame whose Id appears in the test file. The frame is
# train_df1; no variable called `train_df` exists, so using it raises NameError.
test_features = train_df1.loc[train_df1['Id'].isin(test_df['Id'])]

# Use exactly the columns (and column order) that the scaler and the models were
# fitted on. Selecting "all numeric columns" yields a different feature set, which
# makes scaler.transform fail.
X_test = test_features[specified_columns]

# Standardise the test features with the statistics learned on the training split.
X_test_scaled = scaler.transform(X_test)

# Predict with the decision-tree model.
test_predictions = dt.predict(X_test_scaled)

# Attach the predictions to test_df by Id rather than by position: the selected rows
# follow train_df1's order, which need not match the row order (or length) of test_df.
pred_by_id = pd.Series(test_predictions, index=test_features['Id'].to_numpy())
# Keep the first prediction if an Id occurs more than once so the lookup is well defined.
pred_by_id = pred_by_id[~pred_by_id.index.duplicated()]
# Ids that are absent from train_df1 end up as NaN in the new column.
test_df['is_fraud_pred'] = test_df['Id'].map(pred_by_id)

# Save the result to a CSV file.
test_df[['Id', 'is_fraud_pred']].to_csv('prediction_results.csv', index=False)