In [4]:
import pandas as pd

# Load the uploaded CSV file
file_path = 'C:/Users/86158/Desktop/GSE139061_Eadon_processed_QN_101419.csv'
data = pd.read_csv(file_path)

# Display the first few rows and the shape of the dataset
data_shape = data.shape
data_head = data.head()
data_shape, data_head


((20139, 49),
       Geneid          REF1          REF2           REF3          REF4  \
 0     MALAT1  1.019521e+06  1.019521e+06  259236.666700  1.019521e+06   
 1  LINC01000  2.237625e+03  7.162917e+02    2258.291667  7.108125e+02   
 2    SNORD3D  5.742500e+02  1.479167e+01     929.916667  2.975000e+01   
 3     ABLIM1  2.117812e+03  2.055292e+03    1648.541667  2.245354e+03   
 4    SNORD22  2.872917e+02  1.347917e+01     124.104167  2.845833e+01   
 
            REF5          REF6          REF7          REF8          REF9  ...  \
 0  1.019521e+06  1.019521e+06  1.019521e+06  1.019521e+06  1.019521e+06  ...   
 1  3.303021e+03  3.683333e+02  3.298750e+02  2.118750e+02  4.398333e+02  ...   
 2  5.217917e+02  5.020833e+01  4.343750e+01  2.618750e+01  1.830417e+02  ...   
 3  1.885333e+03  2.876208e+03  2.187729e+03  2.034083e+03  1.814542e+03  ...   
 4  5.050417e+02  1.854167e+01  7.133333e+01  1.098750e+02  2.438542e+02  ...   
 
           AKI30         AKI31         AKI32        

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 加载数据
file_path = 'C:/Users/86158/Desktop/GSE139061_Eadon_processed_QN_101419.csv'
data = pd.read_csv(file_path)

# 定义标签
ref_columns = [col for col in data.columns if col.startswith('REF')]
aki_columns = [col for col in data.columns if col.startswith('AKI')]

# 数据平衡和标准化设置
smote = SMOTE(sampling_strategy='auto', random_state=42)
scaler = StandardScaler()

# 定义一个函数来进行数据处理、训练模型、计算最优阈值并绘制混淆矩阵
def process_gene_data(gene):
    ref_expression = data[data['Geneid'] == gene][ref_columns].values.flatten()
    aki_expression = data[data['Geneid'] == gene][aki_columns].values.flatten()
    gene_expression = np.concatenate([ref_expression, aki_expression]).reshape(-1, 1)
    labels = np.array([0] * len(ref_expression) + [1] * len(aki_expression))

    # 数据标准化
    gene_expression_scaled = scaler.fit_transform(gene_expression)

    # 使用SMOTE进行数据平衡
    X_res, y_res = smote.fit_resample(gene_expression_scaled, labels)

    # 分割数据集
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

    # 训练逻辑回归模型
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # 预测概率
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # 计算最优阈值（选择F1分数最高的阈值）
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores = np.zeros_like(precision)  # 初始化F1分数数组
    for i in range(len(precision)):
        if precision[i] + recall[i] > 0:  # 避免除以零
            f1_scores[i] = 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])
        else:
            f1_scores[i] = 0  # 如果precision+recall为0，设置F1分数为0

    optimal_threshold = thresholds[np.argmax(f1_scores)]

    # 使用最优阈值计算最终的预测结果
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)

    # 计算F1分数
    f1 = f1_score(y_test, y_pred)

    # 绘制混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Control', 'AKI'])
    disp.plot(cmap='Blues')
    plt.title(f"Confusion Matrix for Gene: {gene}")
    plt.close()  # 关闭图形，避免过多图形积累

    return f1

# 初始化一个字典来保存每个基因的F1值
gene_f1_scores = {}

# 遍历每个基因
for gene in data['Geneid'].unique():
    f1 = process_gene_data(gene)
    gene_f1_scores[gene] = f1

# 按F1值降序排序基因
sorted_genes = sorted(gene_f1_scores.items(), key=lambda x: x[1], reverse=True)

# 打印前10个F1值最高的基因
top_10_genes = sorted_genes[:10]
print("Top 10 genes with highest F1 scores:")
for gene, f1 in top_10_genes:
    print(f"Gene: {gene}, F1: {f1:.4f}")

# 显示混淆矩阵（所有图形已经关闭）
plt.show()


Top 10 genes with highest F1 scores:
Gene: MDM4, F1: 1.0000
Gene: SNORD3C, F1: 1.0000
Gene: SCARNA22, F1: 1.0000
Gene: PLEKHN1, F1: 1.0000
Gene: UPF2, F1: 1.0000
Gene: TEK, F1: 1.0000
Gene: FLI1, F1: 1.0000
Gene: PHF14, F1: 1.0000
Gene: MAP3K7, F1: 1.0000
Gene: FARP1, F1: 1.0000
