In [1]:
import os
import pandas as pd

In [6]:
import pandas as pd


# Descriptive statistics for the test split: how many distinct drugs and
# diseases it contains, and the ten most frequent / most connected of each.
test_data_file = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_data_new.csv"
test_data = pd.read_csv(test_data_file)

# Column handles reused by every statistic below.
drug_col = test_data['drug_index']
disease_col = test_data['disease_index']

# Cardinality of each ID column.
num_unique_drugs = drug_col.nunique()
num_unique_diseases = disease_col.nunique()
print(f"Unique drugs: {num_unique_drugs}")
print(f"Unique diseases: {num_unique_diseases}")

# Row counts per ID (value_counts sorts by descending frequency).
drug_counts = drug_col.value_counts()
disease_counts = disease_col.value_counts()

# Number of distinct partner IDs per drug / per disease, largest first.
diseases_per_drug = test_data.groupby('drug_index')['disease_index'].nunique()
drug_to_diseases = diseases_per_drug.sort_values(ascending=False)

drugs_per_disease = test_data.groupby('disease_index')['drug_index'].nunique()
disease_to_drugs = drugs_per_disease.sort_values(ascending=False)

# Print the top ten of each ranking, in the same order as the statistics above.
for heading, ranking in (
    ("\nTop 10 most frequent drugs:", drug_counts),
    ("\nTop 10 most frequent diseases:", disease_counts),
    ("\nTop 10 drugs by number of distinct diseases:", drug_to_diseases),
    ("\nTop 10 diseases by number of distinct drugs:", disease_to_drugs),
):
    print(heading)
    print(ranking.head(10))


Unique drugs: 627
Unique diseases: 297

Top 10 most frequent drugs:
drug_index
14023    14
14019    12
14028    11
14024     9
15003     9
14269     9
16490     7
14275     6
14898     6
14030     5
Name: count, dtype: int64

Top 10 most frequent diseases:
disease_index
33577    31
33575    29
29113    22
28158    16
28208    15
33675    14
29078    14
37888    13
27933    12
30382    12
Name: count, dtype: int64

Top 10 drugs by number of distinct diseases:
drug_index
14023    14
14019    12
14028    11
14269     9
15003     9
14024     9
16490     7
14275     6
14898     6
14849     5
Name: disease_index, dtype: int64

Top 10 diseases by number of distinct drugs:
disease_index
33577    31
33575    29
29113    22
28158    16
28208    15
33675    14
29078    14
37888    13
30382    12
27933    12
Name: drug_index, dtype: int64


In [None]:
import pandas as pd


# Indication ("Yes") rate in the training CSV for every drug that appears in
# the test CSV. Test drugs with no training rows are kept in the table with
# n_samples == 0, n_indications == 0 and a NaN yes_rate.
test_data_file = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_data_new.csv"
train_data_file = "/playpen/jesse/drug_repurpose/grpo_part_path/page_rank/train_grpo_baseline.csv"

test_df = pd.read_csv(test_data_file)
test_drug_ids = test_df['drug_index'].unique()

train_df = pd.read_csv(train_data_file)

# A row counts as a positive when its relation equals "indication", ignoring
# case and surrounding whitespace. Casting to str first makes a missing
# relation (NaN) a non-indication instead of raising AttributeError on .strip().
train_df['is_indication'] = (
    train_df['original_relation'].astype(str).str.strip().str.lower().eq('indication')
)

# Training rows whose drug also occurs in the test set; used for both the
# per-drug table and the overall totals.
filtered_all = train_df[train_df['drug_index'].isin(test_drug_ids)]

# One grouped pass replaces a per-drug rescan of the whole training frame.
# reindex() restores test drugs that have no training rows, filled with zeros.
per_drug = (
    filtered_all.groupby('drug_index')['is_indication']
    .agg(n_samples='size', n_indications='sum')
    .reindex(pd.Index(test_drug_ids, name='drug_index'), fill_value=0)
    .astype({'n_samples': 'int64', 'n_indications': 'int64'})
)
# 0 / 0 evaluates to NaN here, which marks drugs absent from training.
per_drug['yes_rate'] = per_drug['n_indications'] / per_drug['n_samples']
results_df = per_drug.reset_index()

# Pooled statistics over all training rows that involve a test-set drug.
overall_n = len(filtered_all)
overall_yes = filtered_all['is_indication'].sum()
overall_rate = overall_yes / overall_n if overall_n > 0 else float('nan')

print("Per‐drug indication rates:")
print(results_df.sort_values('drug_index').reset_index(drop=True))

print("\nOverall across all test‐set drug_ids:")
print(f"  Total training samples involving those drugs: {overall_n}")
print(f"  Number of indications (Yes): {overall_yes}")
print(f"  Overall Yes‐rate: {overall_rate:.4f}")


Per‐drug indication rates:
     drug_index  n_samples  n_indications  yes_rate
0         14017          0              0       NaN
1         14019          7              7       1.0
2         14020          0              0       NaN
3         14021          0              0       NaN
4         14023         11             11       1.0
..          ...        ...            ...       ...
622       20574          0              0       NaN
623       20596          0              0       NaN
624       20597          0              0       NaN
625       20619          0              0       NaN
626       20622          0              0       NaN

[627 rows x 4 columns]

Overall across all test‐set drug_ids:
  Total training samples involving those drugs: 1013
  Number of indications (Yes): 507
  Overall Yes‐rate: 0.5005


In [8]:
import pandas as pd


# Indication ("Yes") rate in the training CSV for every disease that appears
# in the test CSV. Test diseases with no training rows are kept in the table
# with n_samples == 0, n_indications == 0 and a NaN yes_rate.
test_data_file = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_data_new.csv"
train_data_file = "/playpen/jesse/drug_repurpose/grpo_part_path/page_rank/train_grpo_baseline.csv"

test_df = pd.read_csv(test_data_file)
# Named for what it holds (disease indices), so running this cell does not
# overwrite a drug-ID array of a different meaning in the kernel namespace.
test_disease_ids = test_df['disease_index'].unique()

train_df = pd.read_csv(train_data_file)

# A row counts as a positive when its relation equals "indication", ignoring
# case and surrounding whitespace. Casting to str first makes a missing
# relation (NaN) a non-indication instead of raising AttributeError on .strip().
train_df['is_indication'] = (
    train_df['original_relation'].astype(str).str.strip().str.lower().eq('indication')
)

# Training rows whose disease also occurs in the test set; used for both the
# per-disease table and the overall totals.
filtered_all = train_df[train_df['disease_index'].isin(test_disease_ids)]

# One grouped pass replaces a per-disease rescan of the whole training frame.
# reindex() restores test diseases that have no training rows, filled with zeros.
per_disease = (
    filtered_all.groupby('disease_index')['is_indication']
    .agg(n_samples='size', n_indications='sum')
    .reindex(pd.Index(test_disease_ids, name='disease_index'), fill_value=0)
    .astype({'n_samples': 'int64', 'n_indications': 'int64'})
)
# 0 / 0 evaluates to NaN here, which marks diseases absent from training.
per_disease['yes_rate'] = per_disease['n_indications'] / per_disease['n_samples']
results_df = per_disease.reset_index()

# Pooled statistics over all training rows that involve a test-set disease.
overall_n = len(filtered_all)
overall_yes = filtered_all['is_indication'].sum()
overall_rate = overall_yes / overall_n if overall_n > 0 else float('nan')

print("Per‐disease indication rates:")
print(results_df.sort_values('disease_index').reset_index(drop=True))

print("\nOverall across all test‐set disease_ids:")
print(f"  Total training samples involving those disease: {overall_n}")
print(f"  Number of indications (Yes): {overall_yes}")
print(f"  Overall Yes‐rate: {overall_rate:.4f}")


Per‐disease indication rates:
     disease_index  n_samples  n_indications  yes_rate
0            27285          0              0       NaN
1            27286         73              0       0.0
2            27292          0              0       NaN
3            27326          0              0       NaN
4            27361          3              3       1.0
..             ...        ...            ...       ...
292          83961          0              0       NaN
293          83996          1              1       1.0
294          84050          0              0       NaN
295          84203          0              0       NaN
296          84211          0              0       NaN

[297 rows x 4 columns]

Overall across all test‐set disease_ids:
  Total training samples involving those disease: 968
  Number of indications (Yes): 460
  Overall Yes‐rate: 0.4752


In [None]:
# Pasted dump of label-count dicts ($YES$ / $NO$ -> count), one per line, kept
# inside a string literal so it is never parsed as Python data.
# NOTE(review): there are 297 entries, the same number as the unique diseases
# printed for the test set earlier in this notebook -- presumably one entry per
# test disease; the source and ordering are not recorded here, TODO confirm.
"""
{'$YES$': 2}
{}
{}
{}
{'$YES$': 3}
{}
{}
{'$YES$': 2}
{}
{}
{'$YES$': 11}
{}
{}
{}
{'$YES$': 3}
{'$YES$': 2}
{'$YES$': 1}
{'$YES$': 9}
{}
{'$YES$': 8}
{'$YES$': 1}
{}
{}
{}
{}
{'$YES$': 2}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'$YES$': 3}
{}
{'$YES$': 4}
{'$YES$': 3}
{'$YES$': 2}
{}
{'$YES$': 3}
{}
{'$YES$': 3}
{'$YES$': 1}
{'$YES$': 2}
{}
{}
{}
{'$YES$': 6}
{'$YES$': 1}
{}
{}
{}
{}
{}
{}
{}
{'$YES$': 13}
{'$YES$': 37}
{'$YES$': 1}
{}
{}
{'$YES$': 1}
{'$YES$': 4}
{}
{}
{}
{'$YES$': 1}
{}
{}
{'$YES$': 1}
{}
{}
{}
{}
{}
{}
{'$YES$': 2}
{}
{'$NO$': 73}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'$YES$': 4}
{'$YES$': 2}
{'$YES$': 2}
{}
{}
{'$YES$': 6}
{}
{}
{}
{'$YES$': 1}
{'$YES$': 1}
{}
{'$YES$': 1}
{'$YES$': 2}
{'$YES$': 3}
{}
{}
{}
{'$YES$': 1}
{}
{}
{'$YES$': 1}
{}
{}
{}
{}
{'$NO$': 16, '$YES$': 2}
{'$YES$': 1}
{}
{}
{}
{'$YES$': 2}
{}
{'$YES$': 2}
{'$YES$': 16}
{}
{}
{}
{}
{}
{'$YES$': 5, '$NO$': 3}
{'$YES$': 7}
{}
{'$YES$': 8}
{'$YES$': 1}
{'$YES$': 6}
{}
{'$YES$': 1}
{'$YES$': 7}
{}
{}
{'$YES$': 1}
{'$YES$': 1}
{'$YES$': 1}
{}
{'$NO$': 2, '$YES$': 1}
{}
{}
{}
{}
{'$YES$': 24}
{'$YES$': 1}
{}
{}
{}
{}
{}
{'$YES$': 3}
{}
{}
{}
{'$YES$': 3}
{}
{'$NO$': 71}
{'$YES$': 6}
{'$YES$': 1}
{'$NO$': 5}
{}
{'$YES$': 2}
{'$YES$': 2}
{}
{}
{}
{'$YES$': 7}
{}
{}
{}
{}
{}
{'$YES$': 1}
{}
{}
{}
{}
{'$YES$': 2}
{}
{}
{}
{'$YES$': 1}
{}
{}
{'$YES$': 14}
{}
{}
{}
{}
{'$YES$': 5}
{}
{}
{}
{}
{}
{}
{}
{'$YES$': 2}
{}
{}
{'$YES$': 5}
{'$NO$': 13}
{'$YES$': 1}
{'$YES$': 60}
{'$YES$': 7}
{}
{'$NO$': 72}
{}
{}
{'$YES$': 5}
{'$YES$': 1}
{'$YES$': 21}
{}
{}
{'$NO$': 83}
{'$YES$': 1}
{}
{'$YES$': 3}
{}
{}
{'$YES$': 24}
{'$NO$': 61}
{}
{}
{'$YES$': 2}
{}
{}
{'$NO$': 69, '$YES$': 2}
{}
{}
{}
{'$YES$': 2}
{}
{}
{'$YES$': 1}
{}
{}
{}
{}
{}
{'$YES$': 10}
{}
{}
{}
{'$YES$': 1}
{'$YES$': 1}
{}
{'$YES$': 2}
{}
{}
{}
{}
{}
{'$YES$': 1}
{}
{}
{}
{}
{}
{}
{}
{'$YES$': 1}
{}
{}
{'$YES$': 1}
{'$NO$': 16}
{'$YES$': 1}
{}
{}
{}
{}
{'$YES$': 5}
{}
{}
{'$NO$': 7}
{}
{}
{}
{}
{}
{'$NO$': 5}
{'$YES$': 14}
{}
{'$YES$': 4}
{}
{'$YES$': 1}
{'$YES$': 2}
{'$YES$': 1}
{}
"""

SyntaxError: incomplete input (969763094.py, line 1)

In [None]:
# Direct RL, 1 epoch