<a href="https://colab.research.google.com/github/kalyani234/drug_dissertation/blob/main/Balanced_ddi_labels_parquet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random

# Make Google Drive visible inside the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Source CSV and destination for the balanced dataset (both on Drive)
input_path = '/content/drive/MyDrive/ColabNotebooks/drug/ddi_labels.csv'
output_path = '/content/drive/MyDrive/ColabNotebooks/drug/balanced_ddi_labels.csv'

# Read the original interaction labels
# (presumably every row is a positive pair with label = 1 -- verify against the file)
ddi_labels = pd.read_csv(input_path)

# Quick look at the first rows of what was loaded
print("Original ddi_labels head:\n", ddi_labels.head())

# The rest of the cell relies on these three columns being present
required_columns = {'drug1', 'drug2', 'label'}
missing_columns = required_columns - set(ddi_labels.columns)
assert not missing_columns, "ddi_labels.csv must have columns: 'drug1', 'drug2', and 'label'"

# Get all unique drug IDs from the interacting pairs
all_drugs = pd.unique(ddi_labels[['drug1', 'drug2']].values.ravel())

# Order-independent set of the known interacting pairs. Zipping the two columns
# avoids DataFrame.iterrows(), which builds a Series per row and is very slow
# on millions of rows.
positive_pairs = {
    tuple(sorted((first, second)))
    for first, second in zip(ddi_labels['drug1'], ddi_labels['drug2'])
}

# Number of negative samples to generate (equal to positive samples)
num_negative_samples = len(ddi_labels)

# Fixed seed so the same negative pairs are drawn on every run.
NEGATIVE_SAMPLING_SEED = 42
rng = random.Random(NEGATIVE_SAMPLING_SEED)

# Materialise the candidate pool once; converting inside the loop would repeat
# an O(number of drugs) copy for every single draw.
drug_pool = list(all_drugs)
n_drugs = len(drug_pool)

# Rejection sampling below never terminates if there are not enough
# non-interacting pairs to draw from, so fail fast with a clear message instead.
# Self-pairs (a == b) can never be drawn, so they do not reduce the supply.
max_negative_pairs = n_drugs * (n_drugs - 1) // 2 - sum(
    1 for first, second in positive_pairs if first != second
)
if num_negative_samples > max_negative_pairs:
    raise ValueError(
        f"Cannot draw {num_negative_samples} unique negative pairs: only "
        f"{max_negative_pairs} non-interacting pairs exist among {n_drugs} drugs."
    )

# Draw random pairs, keeping only those that are neither a known interaction
# nor an already-sampled negative. Sampled pairs are tracked in their own set so
# that `positive_pairs` keeps containing positives only.
negative_samples = []
sampled_negative_pairs = set()
while len(negative_samples) < num_negative_samples:
    # Randomly sample two distinct drugs to create a pair
    drug1, drug2 = rng.sample(drug_pool, 2)
    pair = tuple(sorted((drug1, drug2)))  # Sort so (a, b) and (b, a) compare equal

    if pair in positive_pairs or pair in sampled_negative_pairs:
        continue

    sampled_negative_pairs.add(pair)  # Prevent duplicate negative pairs
    negative_samples.append({'drug1': drug1, 'drug2': drug2, 'label': 0})

# Turn the sampled negative records into a frame and append them to the positives
negative_df = pd.DataFrame(negative_samples)
balanced_ddi_labels = pd.concat([ddi_labels, negative_df], ignore_index=True)

# Class balance of the combined frame while it is still in memory
label_counts = balanced_ddi_labels['label'].value_counts()
print("Label distribution before saving:\n", label_counts)

# Write the balanced dataset to the output CSV (row index omitted)
balanced_ddi_labels.to_csv(output_path, index=False)
print(f"Balanced DDI labels saved to {output_path}")

# Read the file back and recount the labels to confirm what was written
balanced_ddi_labels_check = pd.read_csv(output_path)
reloaded_label_counts = balanced_ddi_labels_check['label'].value_counts()
print("Label distribution after reloading from file:\n", reloaded_label_counts)


Mounted at /content/drive
Original ddi_labels head:
      drug1    drug2  label
0  DB00001  DB06605      1
1  DB00001  DB06695      1
2  DB00001  DB01254      1
3  DB00001  DB01609      1
4  DB00001  DB01586      1
Label distribution before saving:
 label
1    2839610
0    2839610
Name: count, dtype: int64
Balanced DDI labels saved to /content/drive/MyDrive/ColabNotebooks/drug/balanced_ddi_labels.csv
Label distribution after reloading from file:
 label
1    2839610
0    2839610
Name: count, dtype: int64


In [2]:
import pandas as pd

# Where the balanced CSV lives and where its Parquet copy should be written
file_path = '/content/drive/MyDrive/ColabNotebooks/drug/balanced_ddi_labels.csv'
parquet_path = '/content/drive/MyDrive/ColabNotebooks/drug/balanced_ddi_labels.parquet'

# Read the whole CSV into memory
balanced_ddi_labels = pd.read_csv(file_path)

# Re-serialise the same frame as Parquet, without the row index
balanced_ddi_labels.to_parquet(parquet_path, index=False)

print(f"Data saved to {parquet_path} in Parquet format.")


Data saved to /content/drive/MyDrive/ColabNotebooks/drug/balanced_ddi_labels.parquet in Parquet format.


In [3]:
import pandas as pd

# Read the Parquet copy of the balanced dataset back in
parquet_path = '/content/drive/MyDrive/ColabNotebooks/drug/balanced_ddi_labels.parquet'
data = pd.read_parquet(parquet_path)

# Show the first few rows
preview = data.head()
print(preview)

# Summary statistics of the numeric columns
summary = data.describe()
print(summary)


     drug1    drug2  label
0  DB00001  DB06605      1
1  DB00001  DB06695      1
2  DB00001  DB01254      1
3  DB00001  DB01609      1
4  DB00001  DB01586      1
           label
count  5679220.0
mean         0.5
std          0.5
min          0.0
25%          0.0
50%          0.5
75%          1.0
max          1.0
