In [2]:
!pip install pandas numpy scipy plotly



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px

In [6]:

# --- 1. Load the Dataset ---
# Load the CSV file into a pandas DataFrame.
try:
    df = pd.read_csv('fpkm_counts_with_annotations.csv')
    print("File loaded successfully!")
except FileNotFoundError:
    print("Error: 'fpkm_counts_with_annotations.csv' not found.")
    print("Please make sure the CSV file is in the same directory as your notebook.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas re-raising stops this cell (and "Run All")
    # with a visible traceback and leaves the kernel usable.
    raise


# --- 2. Define Sample Groups ---
# Map each condition name to the replicate columns that belong to it.
groups = {}
groups['EBY'] = ['EBY_1', 'EBY_2', 'EBY_3']
groups['GNSR'] = ['EBY-GNSR_1', 'EBY-GNSR_2']  # only two replicate columns listed
groups['GNHR'] = ['EBY-GNHR_1', 'EBY-GNHR_2', 'EBY-GNHR_3']
groups['GNUR'] = ['EBY-GNUR_1', 'EBY-GNUR_2', 'EBY-GNUR_3']


# --- 3. Define a Function for Differential Expression Analysis ---
def calculate_differential_expression(df, group1_name, group2_name,
                                      sample_groups=None, pseudo_count=1e-4):
    """
    Calculates log2 fold change and p-value for a comparison between two groups.

    The fold change is mean(group1) / mean(group2), computed per row after
    adding `pseudo_count` to each group mean. The p-value comes from a
    per-row independent t-test with unequal variances (Welch's t-test).
    The p-values are NOT adjusted for multiple testing.

    Args:
        df (pd.DataFrame): The input DataFrame with gene expression data. Must
            contain a 'Geneid' column and every sample column of both groups.
        group1_name (str): The name of the first group (e.g., 'GNSR').
        group2_name (str): The name of the second group (e.g., 'EBY').
        sample_groups (dict | None): Mapping of group name -> list of sample
            column names. Defaults to the notebook-level `groups` mapping.
        pseudo_count (float): Value added to each group mean so the ratio and
            its logarithm stay finite when a mean is zero.

    Returns:
        pd.DataFrame: One row per input row with the columns 'Geneid',
        'log2FoldChange', 'pvalue', '-log10(pvalue)' and 'comparison'.

    Raises:
        KeyError: If a group name is unknown or a required column is missing.
    """
    # Fall back to the notebook-level mapping when none is passed explicitly.
    if sample_groups is None:
        sample_groups = groups

    for name in (group1_name, group2_name):
        if name not in sample_groups:
            raise KeyError(
                f"Unknown group {name!r}; available groups: {sorted(sample_groups)}"
            )
    group1_cols = list(sample_groups[group1_name])
    group2_cols = list(sample_groups[group2_name])

    # Fail early with one readable message instead of a KeyError deep inside pandas.
    missing = [col for col in ['Geneid', *group1_cols, *group2_cols] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from the input DataFrame: {missing}")

    # Per-row group means, shifted by the pseudo-count.
    mean_group1 = df[group1_cols].mean(axis=1) + pseudo_count
    mean_group2 = df[group2_cols].mean(axis=1) + pseudo_count

    # Calculate log2 fold change
    log2_fold_change = np.log2(mean_group1 / mean_group2)

    # Row-wise Welch t-test; nan_policy='omit' ignores missing values.
    # Rows whose replicates are (nearly) identical make SciPy emit a
    # precision-loss warning and can yield NaN p-values.
    _, p_value = stats.ttest_ind(
        df[group1_cols].to_numpy(dtype=float),
        df[group2_cols].to_numpy(dtype=float),
        axis=1, equal_var=False, nan_policy='omit'
    )

    # Create a results DataFrame
    result_df = pd.DataFrame({
        'Geneid': df['Geneid'],
        'log2FoldChange': log2_fold_change,
        'pvalue': p_value
    })

    # Calculate -log10(pvalue); p-values of exactly 0 are clamped to 1e-300
    # so the logarithm stays finite.
    result_df['-log10(pvalue)'] = -np.log10(result_df['pvalue'].replace(0, 1e-300))
    result_df['comparison'] = f'{group1_name} / {group2_name}'

    return result_df


# --- 4. Prepare Data for the Plots ---

# Each pair is (group1, group2); the fold change is reported as group1 / group2.
plot1_comparisons = [('GNSR', 'EBY'), ('GNUR', 'EBY'), ('GNHR', 'EBY')]
plot2_comparisons = [('EBY', 'GNHR'), ('GNSR', 'GNHR'), ('GNUR', 'GNHR')]


def _stack_comparisons(pairs):
    """Run every (group1, group2) comparison on `df` and stack the result tables."""
    tables = [calculate_differential_expression(df, *pair) for pair in pairs]
    return pd.concat(tables)


# One long-format table per plot, with a 'comparison' column telling the rows apart.
plot1_df = _stack_comparisons(plot1_comparisons)
plot2_df = _stack_comparisons(plot2_comparisons)


# --- 5. Generate and Display the Interactive Plots ---

def make_volcano_plot(results, title):
    """
    Build an interactive volcano plot from a stacked comparison table.

    Args:
        results (pd.DataFrame): Table with the columns 'log2FoldChange',
            '-log10(pvalue)', 'comparison' and 'Geneid'.
        title (str): Figure title.

    Returns:
        The plotly figure, with dashed guide lines at p=0.05 and at
        log2 fold change of +1 / -1 (i.e. a two-fold change).
    """
    fig = px.scatter(
        results,
        x='log2FoldChange',
        y='-log10(pvalue)',
        color='comparison',  # Color points by the comparison group
        hover_name='Geneid', # Display Geneid on hover
        title=title,
        labels={
            "log2FoldChange": "log2 Fold Change",
            "-log10(pvalue)": "-log10(p-value)"
        }
    )

    # Guide lines for the usual significance / effect-size thresholds.
    fig.add_hline(y=-np.log10(0.05), line_dash="dash", line_color="grey", annotation_text="p=0.05")
    fig.add_vline(x=1, line_dash="dash", line_color="grey", annotation_text="FC=2")
    fig.add_vline(x=-1, line_dash="dash", line_color="grey")
    return fig


# Plot 1: GNSR/EBY, GNUR/EBY, GNHR/EBY
print("\n--- Generating Plot 1: Comparisons vs. EBY ---")
fig1 = make_volcano_plot(plot1_df, 'Volcano Plot: GNSR, GNUR, GNHR vs. EBY')
fig1.show()


# Plot 2: EBY/GNHR, GNSR/GNHR, GNUR/GNHR
print("\n--- Generating Plot 2: Comparisons vs. GNHR ---")
fig2 = make_volcano_plot(plot2_df, 'Volcano Plot: EBY, GNSR, GNUR vs. GNHR')
fig2.show()

File loaded successfully!

--- Generating Plot 1: Comparisons vs. EBY ---



Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.




--- Generating Plot 2: Comparisons vs. GNHR ---


In [12]:
# --- 1. Load and Clean the Dataset ---
# Keep the file name in one place so the read call and the error message
# cannot drift apart.
input_filename = 'fpkm_counts_with_annotations.csv'
try:
    df = pd.read_csv(input_filename)

    # Strip stray whitespace from the header so later column lookups do not
    # fail with a KeyError.
    df.columns = df.columns.str.strip()

    print("File loaded successfully!")
    print("Columns found:", df.columns.tolist()) # This helps confirm columns are read correctly

except FileNotFoundError:
    print(f"Error: '{input_filename}' not found.")
    print("Please make sure the CSV file is in the same directory as your notebook.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas re-raising stops this cell with a traceback.
    raise


# --- 2. Define Sample Groups ---
# Condition name -> replicate columns, built from (name, columns) pairs.
groups = dict([
    ('EBY', ['EBY_1', 'EBY_2', 'EBY_3']),
    ('GNSR', ['EBY-GNSR_1', 'EBY-GNSR_2']),
    ('GNHR', ['EBY-GNHR_1', 'EBY-GNHR_2', 'EBY-GNHR_3']),
    ('GNUR', ['EBY-GNUR_1', 'EBY-GNUR_2', 'EBY-GNUR_3']),
])


# --- 3. Define a Function for Differential Expression Analysis ---
def calculate_differential_expression(df, group1_name, group2_name,
                                      sample_groups=None, pseudo_count=1e-4):
    """
    Calculates log2 fold change and p-value for a comparison between two groups.

    The fold change is mean(group1) / mean(group2) per row, with
    `pseudo_count` added to each group mean. The p-value is a per-row
    independent t-test with unequal variances (Welch's t-test) and is NOT
    adjusted for multiple testing.

    Args:
        df (pd.DataFrame): Expression table containing 'Geneid' and every
            sample column of both groups.
        group1_name (str): Name of the numerator group.
        group2_name (str): Name of the denominator group.
        sample_groups (dict | None): Mapping of group name -> sample column
            names. Defaults to the notebook-level `groups` mapping.
        pseudo_count (float): Value added to each group mean to keep the
            ratio and its logarithm finite when a mean is zero.

    Returns:
        pd.DataFrame: Columns 'Geneid', 'log2FoldChange', 'pvalue',
        '-log10(pvalue)' and 'comparison', one row per input row.

    Raises:
        KeyError: If a group name is unknown or a required column is missing.
    """
    # Use the notebook-level mapping unless the caller supplies one.
    if sample_groups is None:
        sample_groups = groups

    for name in (group1_name, group2_name):
        if name not in sample_groups:
            raise KeyError(
                f"Unknown group {name!r}; available groups: {sorted(sample_groups)}"
            )
    group1_cols = list(sample_groups[group1_name])
    group2_cols = list(sample_groups[group2_name])

    # Report every absent column at once rather than failing inside pandas.
    missing = [col for col in ['Geneid', *group1_cols, *group2_cols] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from the input DataFrame: {missing}")

    mean_group1 = df[group1_cols].mean(axis=1) + pseudo_count
    mean_group2 = df[group2_cols].mean(axis=1) + pseudo_count
    log2_fold_change = np.log2(mean_group1 / mean_group2)

    # Row-wise Welch t-test; rows with (nearly) identical replicates trigger
    # SciPy's precision-loss warning and can produce NaN p-values.
    _, p_value = stats.ttest_ind(
        df[group1_cols].to_numpy(dtype=float),
        df[group2_cols].to_numpy(dtype=float),
        axis=1, equal_var=False, nan_policy='omit'
    )

    result_df = pd.DataFrame({
        'Geneid': df['Geneid'],
        'log2FoldChange': log2_fold_change,
        'pvalue': p_value
    })
    # Same extra column as the plotting version of this function, so plotting
    # code keeps working after this definition replaces it in the kernel.
    result_df['-log10(pvalue)'] = -np.log10(result_df['pvalue'].replace(0, 1e-300))
    result_df['comparison'] = f'{group1_name} / {group2_name}'
    return result_df


# --- 4. Prepare Data ---
# Each pair is (group1, group2); results are labelled "group1 / group2".
plot1_comparisons = [('GNSR', 'EBY'), ('GNUR', 'EBY'), ('GNHR', 'EBY')]
plot2_comparisons = [('EBY', 'GNHR'), ('GNSR', 'GNHR'), ('GNUR', 'GNHR')]


def _run_comparisons(pairs):
    """Compute each comparison on `df` and concatenate the per-comparison tables."""
    per_comparison = []
    for first_group, second_group in pairs:
        per_comparison.append(calculate_differential_expression(df, first_group, second_group))
    return pd.concat(per_comparison)


plot1_df = _run_comparisons(plot1_comparisons)
plot2_df = _run_comparisons(plot2_comparisons)


# --- 5. Filter, Format, and Save Top 5 Genes to a File ---

# Attach the JGI / KEGG annotation columns to every result row, keyed on Geneid.
annotation_columns = ['Geneid', 'JGI_annotation', 'KEGG_annotation']
annotations_df = df.loc[:, annotation_columns].copy()
plot1_annotated_df = plot1_df.merge(annotations_df, on='Geneid', how='left')
plot2_annotated_df = plot2_df.merge(annotations_df, on='Geneid', how='left')

# Destination of the plain-text report written below.
output_filename = "top5_gene_report_corrected.txt"

def _write_gene_list(file_handle, heading, genes, empty_message):
    """Write one ranked gene list to the report, or `empty_message` when the list is empty."""
    file_handle.write(heading)
    if genes.empty:
        file_handle.write(empty_message)
        return
    for _, row in genes.iterrows():
        file_handle.write(f"    Gene: {row['Geneid']} (log2FC: {row['log2FoldChange']:.2f})\n")
        file_handle.write(f"      - p-value: {row['pvalue']:.4f}\n")
        file_handle.write(f"      - JGI Annotation: {row['JGI_annotation']}\n")
        file_handle.write(f"      - KEGG Annotation: {row['KEGG_annotation']}\n")


def analyze_and_write_results(result_df, title, file_handle):
    """
    Write the top-5 up- and down-regulated genes of every comparison to a report.

    Args:
        result_df (pd.DataFrame): Annotated results with the columns
            'comparison', 'log2FoldChange', 'pvalue', 'Geneid',
            'JGI_annotation' and 'KEGG_annotation'.
        title (str): Heading printed at the top of this report section.
        file_handle: Writable text stream; all output goes through this
            argument (never through a variable from the enclosing scope).
    """
    file_handle.write("="*80 + "\n")
    file_handle.write(f"Analysis: {title}\n")
    file_handle.write("Filtering for TOP 5 Upregulated (log2FC > 5) and Downregulated (log2FC < -5) Genes\n")
    file_handle.write("Condition: p-value > 0.05\n")
    file_handle.write("="*80 + "\n\n")

    for comp in result_df['comparison'].unique():
        file_handle.write(f"--- Comparison Group: {comp} ---\n")

        # Isolate the data for the current comparison group
        comparison_df = result_df[result_df['comparison'] == comp]

        # NOTE(review): this keeps genes whose p-value is ABOVE 0.05, i.e. those
        # that are not significant at that level. The report header states the
        # same condition, so the behavior is preserved here -- confirm that
        # `< 0.05` was not the intended filter. Rows with a NaN p-value never
        # satisfy the comparison and are therefore excluded.
        high_pvalue = comparison_df['pvalue'] > 0.05

        # Strongest increases first.
        top_5_upregulated = (
            comparison_df[(comparison_df['log2FoldChange'] > 5) & high_pvalue]
            .sort_values(by='log2FoldChange', ascending=False)
            .head(5)
        )
        # Strongest decreases first.
        top_5_downregulated = (
            comparison_df[(comparison_df['log2FoldChange'] < -5) & high_pvalue]
            .sort_values(by='log2FoldChange', ascending=True)
            .head(5)
        )

        _write_gene_list(
            file_handle,
            "\n  Top 5 Most Upregulated:\n",
            top_5_upregulated,
            "    - None found meeting the criteria.\n",
        )
        # The placeholder for the downregulated list carries an extra newline,
        # matching the layout of the existing report.
        _write_gene_list(
            file_handle,
            "\n  Top 5 Most Downregulated:\n",
            top_5_downregulated,
            "    - None found meeting the criteria.\n\n",
        )

        file_handle.write("\n")


# Explicit UTF-8 so annotation text is written the same way on every platform
# instead of depending on the locale's default encoding.
with open(output_filename, 'w', encoding='utf-8') as f:
    # Run the analysis for both sets of comparisons and write to the file
    analyze_and_write_results(plot1_annotated_df, "Comparisons vs. EBY", f)
    f.write("\n\n")
    analyze_and_write_results(plot2_annotated_df, "Comparisons vs. GNHR", f)

# --- 6. Final Confirmation ---
print(f"✅ Report successfully generated and saved to '{output_filename}'")

File loaded successfully!
Columns found: ['Geneid', 'EBY_1', 'EBY_2', 'EBY_3', 'EBY-U_1', 'EBY-GNSR_1', 'EBY-GNSR_2', 'EBY-GNHR_1', 'EBY-GNHR_2', 'EBY-GNHR_3', 'EBY-GNUR_1', 'EBY-GNUR_2', 'EBY-GNUR_3', 'JGI_annotation', 'KEGG_annotation']
✅ Report successfully generated and saved to 'top5_gene_report_corrected.txt'



Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.

