In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Directory that holds the project's input files. The default is the location
# the notebook was originally written against; set the BENG_182_PROJECT_DIR
# environment variable to point the notebook at another copy of the data
# without editing any cell.
DATA_DIR = Path(os.environ.get("BENG_182_PROJECT_DIR", "/Users/kairi/Desktop/BENG_182_Project"))

# Load the CRISPR gene effect data
crispr_df = pd.read_csv(DATA_DIR / "FINAL_FILTERED_CRISPR.csv")
# Load the mapping file (tab-separated)
mapping_df = pd.read_csv(DATA_DIR / "mapping.tsv", sep='\t')

In [None]:
# Give both frames a shared 'ACH_ID' key column so they can be joined on it.
# 'Unnamed: 0' is the name pandas assigns to a CSV column with a blank header.
crispr_df = crispr_df.rename({'Unnamed: 0': 'ACH_ID'}, axis='columns')
mapping_df = mapping_df.rename({'Model ID': 'ACH_ID'}, axis='columns')

In [None]:
# Attach the classification and sample name from the mapping table to each
# CRISPR row. how='inner' is the pandas default, spelled out here: rows whose
# ACH_ID appears in only one of the two tables are dropped.
mapping_subset = mapping_df[['ACH_ID', 'Classification', 'Sample Name']]
merged_df = crispr_df.merge(mapping_subset, how='inner', on='ACH_ID')

In [None]:
# Split the merged table in two: rows classified exactly as 'ecDNA', and
# everything else.
is_ecdna = merged_df['Classification'] == 'ecDNA'
ecDNA_present = merged_df[is_ecdna]
ecDNA_non_present = merged_df[~is_ecdna]

In [None]:
# Pick the columns to melt by dtype. Only float64/int64 columns are treated as
# score columns, so non-numeric identifier/label columns (e.g. ACH_ID and
# Sample Name) stay out of value_vars and can be used as id_vars instead.
score_dtypes = ['float64', 'int64']
numeric_columns = ecDNA_present.select_dtypes(include=score_dtypes).columns

In [None]:
# Reshape each group from wide (one column per numeric variable) to long
# (one row per ACH_ID / Gene pair) and tag it with a readable group label.
# Both groups are melted with identical arguments so their columns line up.
long_format_kwargs = dict(
    id_vars=['ACH_ID', 'Sample Name'],
    value_vars=numeric_columns,
    var_name='Gene',
    value_name='Score',
)

ecDNA_present_melted = (
    ecDNA_present
    .melt(**long_format_kwargs)
    .assign(Classification='ecDNA Present')
)
ecDNA_non_present_melted = (
    ecDNA_non_present
    .melt(**long_format_kwargs)
    .assign(Classification='ecDNA Not Present')
)

# Write each long-format group to its own CSV (row index omitted).
ecDNA_non_present_melted.to_csv(
    "/Users/kairi/Desktop/BENG_182_Project/nonPresent.csv",
    index=False,
)
ecDNA_present_melted.to_csv(
    "/Users/kairi/Desktop/BENG_182_Project/Present.csv",
    index=False,
)


In [None]:
# Stack the two long-format groups into a single frame for plotting.
# ignore_index=True renumbers the rows 0..n-1; without it each half keeps its
# own row labels, so the combined index contains duplicate labels, which
# label-aligned operations downstream may reject.
combined_df = pd.concat(
    [ecDNA_present_melted, ecDNA_non_present_melted],
    ignore_index=True,
)

In [None]:

def plot_gene_batches(df, batch_size=50):
    """Draw grouped box plots of scores, one figure per batch of genes.

    The unique values of ``df['Gene']`` are taken in order of first appearance
    and split into consecutive batches of at most ``batch_size`` genes. Each
    batch gets its own figure with 'Gene' on the x axis, 'Score' on the y axis
    and one box per 'Classification' value.

    Args:
        df: Long-format DataFrame with 'Gene', 'Score' and 'Classification'
            columns. An empty frame produces no figures.
        batch_size: Maximum number of genes per figure. Must be at least 1.

    Returns:
        None. Figures are displayed with ``plt.show()`` as a side effect.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Fail loudly up front: 0 used to surface as a ZeroDivisionError and
    # negative sizes silently produced no plots at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    genes = df['Gene'].unique()

    for batch_number, start in enumerate(range(0, len(genes), batch_size), start=1):
        batch_genes = genes[start:start + batch_size]
        batch_df = df[df['Gene'].isin(batch_genes)]

        fig, ax = plt.subplots(figsize=(15, 10))
        sns.boxplot(x='Gene', y='Score', hue='Classification', data=batch_df, ax=ax)
        # Gene names are long relative to the box width; rotate so they do not overlap.
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(f'Gene Scores Based on ecDNA Presence (Batch {batch_number})')
        plt.show()
        # Release the figure so many batches do not accumulate in memory.
        plt.close(fig)

# Render the combined data, 50 genes per figure.
plot_gene_batches(df=combined_df, batch_size=50)