In [1]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns

In [2]:
# Expected column layout: a gene identifier followed by five count columns
# (m1-m5, presumably one per mouse sample — confirm).
# NOTE(review): `columns` is not referenced anywhere else in the visible notebook;
# presumably it was meant to be handed to read_csv (e.g. via `names=`) — confirm.
columns = ['gene_id', 'm1', 'm2','m3','m4','m5']
# NOTE(review): read_csv is called without a file path, so this cell raises a
# TypeError on a fresh kernel. The path to the count table must be restored here.
DNMT = pd.read_csv()

In [None]:
# Preview only the first rows instead of dumping the whole count table.
print(DNMT.head())

In [None]:
# Keep only the rows whose gene_id starts with 'Mmus'
# (presumably the ERV entries of the count table — confirm).
DNMT_ERV = DNMT.loc[DNMT['gene_id'].str.startswith('Mmus')]

In [None]:
def filter_non_expressed_ERV(df):
    """Return the rows of ``df`` whose summed counts are greater than zero.

    Count columns are all columns whose name starts with 'm'. Their values
    are cast to int before being summed per row. The input frame is not
    modified; the result has a fresh 0-based index.
    """
    sample_cols = [name for name in df.columns if name.startswith('m')]

    # Work on a copy so the helper column never leaks into the caller's frame.
    annotated = df.copy()
    annotated['sum_counts'] = annotated[sample_cols].astype(int).sum(axis=1)

    # Keep expressed rows only, renumber them, and drop the helper column.
    expressed = annotated[annotated['sum_counts'] > 0].reset_index(drop=True)
    return expressed.drop(columns='sum_counts')


In [None]:
# Drop ERVs with no reads in any sample from the ERV subset selected above.
expr_ERV = filter_non_expressed_ERV(DNMT_ERV)

In [None]:
def reshape_dataframe(df):
    """Convert a wide count table into long format.

    Every column whose name starts with 'm' is unpivoted. The result has one
    row per (gene_id, column) pair with the columns 'gene_id', 'patient_id'
    (the original column name) and 'count' (the value).
    """
    sample_columns = [name for name in df.columns if name.startswith('m')]
    return df.melt(
        id_vars=['gene_id'],
        value_vars=sample_columns,
        var_name='patient_id',
        value_name='count',
    )

In [None]:
# Long-format view of the expressed ERVs: one row per (gene_id, sample column).
reshaped = reshape_dataframe(df=expr_ERV)
print(reshaped.head(5))

In [None]:
def visualise_ERV_expression(df, plot_type, threshold = False):
    """Plot counts per gene, coloured by sample, as a scatter or line plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with 'gene_id', 'count' and 'patient_id' columns
        (the layout returned by ``reshape_dataframe``).
    plot_type : str
        'scatter' or 'line'.
    threshold : bool, optional
        When truthy, the x tick labels are rotated by 90 degrees.

    Raises
    ------
    ValueError
        If ``plot_type`` is neither 'scatter' nor 'line'. Previously an
        unknown value silently produced an empty figure.
    """
    plotters = {'scatter': sns.scatterplot, 'line': sns.lineplot}
    if plot_type not in plotters:
        raise ValueError(
            f"plot_type must be one of {sorted(plotters)}, got {plot_type!r}"
        )

    # Validate before creating the (large) figure so a bad call leaves nothing behind.
    plt.figure(figsize=(30, 30))

    plotters[plot_type](data=df, x='gene_id', y='count', hue='patient_id', palette='tab10')

    plt.xlabel('ERV id')
    plt.ylabel('count')

    plt.title('ERV expression levels measured with TEtranscripts')

    # Adding legend
    plt.legend(title='Mouse ID', loc='upper right')
    if threshold:
        plt.xticks(rotation=90)

    plt.show()

In [None]:
# Line plot of all expressed ERVs, one line per sample.
visualise_ERV_expression(reshaped, plot_type='line')

In [None]:
def visualise_ERV_expression_gropus(df, plot_type, threshold = False):
    """Plot counts per gene with samples coloured by group.

    Samples m1 and m2 are drawn in red and m3, m4 and m5 in blue
    (presumably the two experimental groups — confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with 'gene_id', 'count' and 'patient_id' columns
        (the layout returned by ``reshape_dataframe``).
    plot_type : str
        'scatter' or 'line'.
    threshold : bool, optional
        When truthy, the x tick labels are rotated by 90 degrees.

    Raises
    ------
    ValueError
        If ``plot_type`` is neither 'scatter' nor 'line'.
    """
    plotters = {'scatter': sns.scatterplot, 'line': sns.lineplot}
    if plot_type not in plotters:
        raise ValueError(
            f"plot_type must be one of {sorted(plotters)}, got {plot_type!r}"
        )

    # The palette must have an entry for every hue level. The samples are
    # m1-m5, so the key is 'm5' (it was 'm6', which seaborn rejects as a
    # palette missing 'm5').
    palette = {'m1': 'red', 'm2': 'red', 'm3': 'blue', 'm4': 'blue', 'm5': 'blue'}

    plt.figure(figsize=(30, 30))

    plotters[plot_type](data=df, x='gene_id', y='count', hue='patient_id', palette=palette)

    plt.xlabel('ERV id')
    plt.ylabel('count')

    plt.title('ERV expression levels measured with TEtranscripts')

    # Adding legend
    plt.legend(title='Mouse ID', loc='upper right')

    if threshold:
        plt.xticks(rotation=90)

    plt.show()

In [None]:
# Same line plot, with samples coloured by group.
visualise_ERV_expression_gropus(reshaped, plot_type='line')

In [None]:
def remove_lowly_expressed(df, threshold, max_count=500):
    """Keep rows whose counts fall inside an expression window.

    Count columns are all columns whose name starts with 'm'. A row is kept
    when at least one count is >= ``threshold`` and, unless ``max_count`` is
    None, every count is <= ``max_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide count table.
    threshold : number
        Minimum count that at least one sample must reach.
    max_count : number or None, optional
        Upper bound that all samples must respect. Defaults to 500, the
        ceiling that used to be hard-coded; pass None to disable it.

    Returns
    -------
    pandas.DataFrame
        A new frame with the kept rows and a fresh 0-based index. ``df`` is
        not modified.
    """
    counts_cols = [col for col in df.columns if col.startswith('m')]
    counts = df[counts_cols]

    mask = (counts >= threshold).any(axis=1)
    if max_count is not None:
        mask &= (counts <= max_count).all(axis=1)

    # Return a new, renumbered frame rather than resetting a slice in place.
    return df[mask].reset_index(drop=True)


In [None]:
# Keep ERVs with at least one sample reaching 20 counts (and none above the
# function's upper bound), then convert the result to long format for plotting.
filtered = remove_lowly_expressed(expr_ERV, threshold=20)
print(filtered.head(5))

reshaped_filtered = reshape_dataframe(df=filtered)
print(reshaped_filtered.head(5))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def visualise_barplot(df,plot_name, threshold=False):
    """Draw a grouped bar plot of counts per gene and save it to ``plot_name``.

    ``df`` is expected in long format with 'gene_id', 'count' and
    'patient_id' columns. Samples m1 and m2 are drawn in red, m3 to m5 in
    blue. When ``threshold`` is truthy the x tick labels are rotated by 90
    degrees. The figure is written to ``plot_name`` before being shown.
    """
    # Build the sample -> colour mapping group by group.
    group_colours = dict.fromkeys(('m1', 'm2'), 'red')
    group_colours.update(dict.fromkeys(('m3', 'm4', 'm5'), 'blue'))

    fig = plt.figure(figsize=(30, 30))
    axes = sns.barplot(
        data=df,
        x='gene_id',
        y='count',
        hue='patient_id',
        palette=group_colours,
    )

    axes.set_xlabel('ERV id')
    axes.set_ylabel('count')
    axes.set_title('ERV expression levels measured with TEtranscripts')
    axes.legend(title='Mouse ID', loc='upper right')

    if threshold:
        plt.xticks(rotation=90)

    # Write the file first, then display the figure.
    fig.savefig(plot_name)
    plt.show()

# Example usage:
# `df` must be a long-format frame with 'gene_id', 'patient_id' and 'count'
# columns (e.g. the output of reshape_dataframe); `plot_name` is the output file.
# visualise_barplot(df, 'my_plot.jpg', threshold=True)


In [None]:
# Bar plot of the filtered ERVs, saved as 'full_DNMT3A.jpg'.
visualise_barplot(reshaped_filtered, plot_name='full_DNMT3A.jpg')