
# How to create an abundance matrix (est_counts) for all the libraries
This Python script reads every `.tsv` file in the current working directory (one expression table per biological sample) and writes a single TSV file containing the est_counts values for each sample.

Overall, the script takes in expression data for multiple biological samples and outputs a single TSV file with one row per transcript (`target_id`) and one column per input file. The est_counts values from all input files are placed side by side (they are not summed), which makes it easier to compare expression levels between samples.

In [5]:
import os
import pandas as pd

# Define the directory path where the input files are located
input_dir = os.getcwd()  # Current working directory

# Name of the combined est_counts matrix written by this cell
output_file_path = "est_counts_matrix_Trinity_Ticks_2024_AUG.tsv"


def build_est_counts_matrix(input_dir, output_file_path):
    """Combine the est_counts column of every .tsv file in a directory.

    Each ``.tsv`` file in ``input_dir`` becomes one column of the output
    matrix and each ``target_id`` becomes one row. A cell is left empty when
    a target_id does not occur in the corresponding file.

    Args:
        input_dir: Directory that is scanned for ``.tsv`` input files.
        output_file_path: Path of the tab-separated matrix to write.

    Returns:
        A tuple ``(input_files, est_counts_dict)``: the sorted input file
        names (the matrix columns) and a dict mapping each target_id to a
        list holding one est_counts string (or ``None``) per input file.
    """
    # The output also ends in ".tsv" and may live in the scanned directory;
    # leave it out so that a re-run does not treat old results as a sample.
    output_abs = os.path.abspath(output_file_path)
    # os.listdir returns names in arbitrary order; sort for stable columns.
    input_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.endswith(".tsv")
        and os.path.abspath(os.path.join(input_dir, name)) != output_abs
    )

    est_counts_dict = {}
    for file_index, file_name in enumerate(input_files):
        # Join with input_dir so the files are found even when it is not the cwd
        with open(os.path.join(input_dir, file_name), "r") as input_file:
            header_line = next(input_file, None)
            if header_line is None:
                continue  # Empty file: its column simply stays blank
            header = header_line.strip().split("\t")
            # Locate est_counts by name; fall back to the fourth column when
            # the header does not name it.
            if "est_counts" in header:
                est_counts_index = header.index("est_counts")
            else:
                est_counts_index = 3
            min_columns = max(5, est_counts_index + 1)

            for line in input_file:
                values = line.strip().split("\t")
                if len(values) < min_columns:
                    continue  # Skip lines that don't have enough columns
                target_id = values[0]
                row = est_counts_dict.get(target_id)
                if row is None:
                    # One slot per input file, filled in as files are read
                    row = [None] * len(input_files)
                    est_counts_dict[target_id] = row
                row[file_index] = values[est_counts_index]

    # Write the est_counts dictionary to a TSV file
    with open(output_file_path, "w") as output_file:
        output_file.write("target_id\t" + "\t".join(input_files) + "\n")
        for target_id, est_counts_list in est_counts_dict.items():
            # Missing values are written as empty cells
            cells = ["" if ec is None else str(ec) for ec in est_counts_list]
            output_file.write(target_id + "\t" + "\t".join(cells) + "\n")

    return input_files, est_counts_dict


input_files, est_counts_dict = build_est_counts_matrix(input_dir, output_file_path)


### Sort the matrix columns by name

In [6]:
import pandas as pd

# Paths of the tab-separated input matrix and of the reordered CSV output
matrix_path = 'est_counts_matrix_Trinity_Ticks_2024_AUG.tsv'
reordered_path = 'your_reorganized_est_counts_matrix_Trinity_2024_Ticks.csv'

# Read the tab-separated matrix into a DataFrame
data = pd.read_csv(matrix_path, sep='\t')

# Take every column name except the first and order the names alphabetically
sorted_columns = data.columns[1:].tolist()
sorted_columns.sort()

# Select 'target_id' followed by the alphabetically ordered columns
sorted_data = data.loc[:, ['target_id'] + sorted_columns]

# Write the reordered table as comma-separated text, without the row index
sorted_data.to_csv(reordered_path, index=False)


In [3]:
import pandas as pd
import glob
import os
from functools import reduce

def aggregate_est_counts(input_directory, output_file, value_column='tpm'):
    """Merge one column from every ``abundance_*.tsv`` file into one table.

    Each matching file contributes one column, named after the part of the
    file name that follows the ``abundance_`` prefix. Rows are joined on
    ``target_id`` with an outer merge, and values missing from a file are
    written as 0.

    Args:
        input_directory: Directory searched for ``abundance_*.tsv`` files.
        output_file: Path of the tab-separated table to write.
        value_column: Column to extract from each file. Defaults to
            ``'tpm'``; pass ``'est_counts'`` to collect estimated counts.

    Raises:
        FileNotFoundError: If no ``abundance_*.tsv`` file is found.
        KeyError: If a file lacks ``target_id`` or ``value_column``.
    """
    # glob returns matches in arbitrary order; sort for stable column order
    filenames = sorted(glob.glob(os.path.join(input_directory, "abundance_*.tsv")))
    if not filenames:
        raise FileNotFoundError(
            "No 'abundance_*.tsv' files found in {!r}".format(input_directory)
        )

    prefix = "abundance_"
    data_frames = []
    for file in filenames:
        df = pd.read_csv(file, sep='\t')

        # Strip only the leading prefix (guaranteed by the glob pattern) so
        # that distinct file names always give distinct sample names.
        stem = os.path.splitext(os.path.basename(file))[0]
        sample_name = stem[len(prefix):]

        # rename() returns a new frame, so the slice is never modified in place
        data_frames.append(
            df[['target_id', value_column]].rename(columns={value_column: sample_name})
        )

    # Sample names are unique, so no merge suffixes are needed and no column
    # has to be dropped afterwards (a sample ending in "_y" is kept).
    merged_df = data_frames[0]
    for df in data_frames[1:]:
        merged_df = pd.merge(merged_df, df, on='target_id', how='outer')

    # Targets absent from a file get 0 for that sample
    merged_df = merged_df.fillna(0)

    merged_df.to_csv(output_file, sep='\t', index=False)

# Example usage
#input_directory = 'path/to/directory/with/tsv/files'  # Set to your directory path
#output_file = 'aggregated_est_counts.tsv'  # Set to your desired output file name
#aggregate_est_counts(input_directory, output_file)


In [4]:
# Run the aggregation on a concrete results directory

# Folder that holds the input files (edit to match your machine)
input_directory = '/Users/zaidemontesortiz/Documents/1_PhD_project_Spiders_Ticks_Beetles/Ticks/Results/Diff_Expression_Analysis/Kallisto_2024'

# Name of the merged table to write (edit as needed)
output_file = 'aggregated_tpm_counts.tsv'

aggregate_est_counts(
    input_directory=input_directory,
    output_file=output_file,
)