In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from datetime import datetime
import numpy as np
# Load the training transactions and the test-set file from the working directory.
train_df1 = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [20]:
def calculate_age_at_trans(birthdate_str, trans_date_str, date_format="%d/%m/%Y"):
    """Return the age in whole years on the transaction date.

    Parameters
    ----------
    birthdate_str : str
        Date of birth, formatted according to ``date_format``.
    trans_date_str : str
        Transaction timestamp. Only the first whitespace-separated token
        (the date part) is parsed, so a trailing time such as
        ``"21/06/2020 12:14"`` is ignored.
    date_format : str, default "%d/%m/%Y"
        ``datetime.strptime`` format shared by both date strings. It must not
        contain whitespace, because the transaction string is split on
        whitespace before parsing.

    Returns
    -------
    int
        Completed years between the birth date and the transaction date.

    Raises
    ------
    ValueError
        If either date string does not match ``date_format``.
    """
    birthdate = datetime.strptime(birthdate_str, date_format)
    trans_date = datetime.strptime(trans_date_str.split()[0], date_format)

    # The year difference overcounts by one when the birthday has not yet
    # been reached in the transaction year.
    birthday_reached = (trans_date.month, trans_date.day) >= (birthdate.month, birthdate.day)
    return trans_date.year - birthdate.year - (0 if birthday_reached else 1)

# Add an age-at-transaction column to train_df1.
# Walking the two source columns in lockstep avoids the per-row Series that
# DataFrame.apply(axis=1) constructs, which is slow on large frames.
train_df1['age_at_trans'] = [
    calculate_age_at_trans(dob, trans_time)
    for dob, trans_time in zip(train_df1['dob'], train_df1['trans_date_trans_time'])
]


In [None]:

from sklearn.tree import DecisionTreeClassifier

# Bucket city population into five quantile bins labelled 1 (lowest) to 5 (highest).
# The source frame is train_df1; the previous reference to `train_df` was an
# undefined name and failed on a fresh kernel.
# NOTE(review): pd.qcut with labels returns a categorical column, so the
# numeric-only feature selection below does not include city_category —
# confirm whether it was meant to be a model feature.
train_df1['city_category'] = pd.qcut(train_df1['city_pop'], 5, labels=[1, 2, 3, 4, 5])

# Keep only the rows with a known is_fraud label for training and validation.
df_train = train_df1.dropna(subset=['is_fraud'])

# Use every numeric column except the target 'is_fraud' as a feature.
# NOTE(review): this also picks up identifier-like numeric columns (e.g. 'Id'
# if it is numeric) — verify that is intended.
features = df_train.select_dtypes(include=[np.number]).columns.drop('is_fraud')

# Feature matrix and target vector.
X = df_train[features]
y = df_train['is_fraud']

# Hold out 20% of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features; the scaler is fitted on the training split only so
# that no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# KNN model with default hyper-parameters.
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_val_scaled)
print("KNN Accuracy:", accuracy_score(y_val, y_pred_knn))

# Decision tree model.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_val_scaled)
print("Decision Tree Accuracy:", accuracy_score(y_val, y_pred_dt))

# Note: this is an example; real model parameters should be tuned to the problem.
# NOTE(review): if the fraud class is rare, accuracy alone can look high for a
# trivial model — consider reporting precision/recall as well.


In [26]:
# Select the rows of train_df1 whose Id appears in test_df. The frame is
# train_df1; the previous `train_df` was an undefined name.
test_features = train_df1.loc[train_df1['Id'].isin(test_df['Id'])]
X_test = test_features.select_dtypes(include=[np.number]).drop(columns=['is_fraud'])

# Standardise the test features with the scaler fitted on the training split.
X_test_scaled = scaler.transform(X_test)

# Predict with the decision tree.
test_predictions = dt.predict(X_test_scaled)

# Attach predictions to test_df by Id rather than by position: the predictions
# follow train_df1's row order, which is not guaranteed to match test_df's.
predictions_by_id = pd.Series(test_predictions, index=test_features['Id'].to_numpy())
test_df['is_fraud_pred'] = test_df['Id'].map(predictions_by_id)

# Fail loudly instead of exporting blank predictions.
unmatched = test_df['is_fraud_pred'].isna()
if unmatched.any():
    raise ValueError(
        f"{int(unmatched.sum())} Id(s) in test_df have no matching row in train_df1"
    )

# Save the results to a CSV file.
test_df[['Id', 'is_fraud_pred']].to_csv('prediction_results.csv', index=False)