# PFA analysis

In [None]:
# Reference implementation of Principal Feature Analysis (PFA): https://github.com/LauritzR/Principal-Feature-Analysis

In [None]:
# Seurat + PFA pipeline used in this notebook: https://github.com/AC-PHD/Seurat_PFA_pipeline

# Prepare environment


In [None]:
# Create a dedicated conda environment named PFA with Python 3.8, without prompting.
!conda create --yes --name PFA python=3.8

In [None]:
# `!conda activate` runs in a throw-away subshell, so the activation never
# reaches this kernel. Address the environment explicitly with `-n PFA` or
# `conda run -n PFA` instead; this command only confirms the environment works.
!conda run -n PFA python --version

In [None]:
# csv, random and os ship with the Python standard library and are not conda
# packages (requesting them makes the install fail), so only ipython is installed.
!conda install -n PFA ipython -y

In [None]:
# Install ipykernel into the PFA environment so it can be exposed as a Jupyter kernel.
!conda install --yes --name PFA ipykernel

In [None]:
# Register the kernel with the PFA environment's own interpreter. A bare
# `!python` would use the Python of the currently running kernel, and the
# resulting kernel spec would then point at that interpreter instead of PFA.
!conda run -n PFA python -m ipykernel install --user --name PFA --display-name "Python (PFA)"

In [None]:
# Install into the PFA environment (-n) and skip the confirmation prompt (-y),
# which cannot be answered from a notebook cell.
!conda install -n PFA -c anaconda pandas -y

In [None]:
# Install scanpy into the PFA environment without prompting. bioconda packages
# expect the conda-forge channel to be available for their dependencies, so it
# is listed first.
!conda install -n PFA -c conda-forge -c bioconda scanpy -y

In [None]:
# Install into the PFA environment (-n) and skip the confirmation prompt (-y),
# which cannot be answered from a notebook cell.
!conda install -n PFA -c anaconda requests -y

In [None]:
# Environment setup is complete: switch the notebook kernel to "Python (PFA)"
# before running the cells below.

# Prepare the PFA input matrix

In [29]:
import scanpy as sc
import pandas as pd
import anndata as ad
import os


## Load and subset the adata object

In [21]:
# Location of the AnnData (.h5ad) file to analyse.
adata_path = "/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/time_adata.h5ad"

# Load the file with scanpy.
adata = sc.read(adata_path)

In [23]:
# Name fragments to look for in the variable (feature) names.
substrings = ['CTRL', 'TGFB1', 'GEM']

# Keep every entry of adata.var_names that contains at least one fragment.
filtered_genes = [
    var_name
    for var_name in adata.var_names
    if any(fragment in var_name for fragment in substrings)
]

In [24]:
# Filter the AnnData object to keep only the selected genes
adata_filtered = adata[:, filtered_genes]

In [25]:
# Display the summary of `adata`, the object as loaded (not `adata_filtered`).
adata

AnnData object with n_obs × n_vars = 10594 × 580
    obs: 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'pct_cmo', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_combined', 'louvain', 'louvain_combined', 'index', 'IDs', 't', 'seg', 'edge', 't_sd', 'milestones', '43->30_lindev_sel', '43->94_lindev_sel'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'CMO', 'gene_symbol', 'mito', 'ribo', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', '43->30_rss', 'p_val', 'A', 'fdr', 'st', 'signi'
    uns: '43->30_lindev_sel_colors', 'condition_colors', 'dendro_segments', 'draw_graph', 'graph', 'hvg', 'leiden', 'leiden_colors', 'lei

## Prepare the Group-Annotation of the Count Matrix

In [33]:
# Build the PFA input table: one row per variable in adata.var_names, one
# column per observation in adata.obs_names, with the Leiden cluster label of
# every observation prepended as the first row.

# Leiden assignment per observation; the series name 'Label' becomes the row
# label once the series is turned into a single-row frame below.
leiden_series = pd.Series(adata.obs['leiden'], name='Label')

# adata.X is observations x variables; transpose it to variables x observations.
count_matrix = adata.X
count_matrix = count_matrix.T

# adata.X may be a sparse matrix or a plain dense array. Only sparse matrices
# provide .toarray(), so densify when needed and pass dense input through as is.
dense_counts = count_matrix.toarray() if hasattr(count_matrix, "toarray") else count_matrix

# Variables as rows, observations as columns.
count_df = pd.DataFrame(dense_counts, index=adata.var_names, columns=adata.obs_names)

# Put the 'Label' row on top of the expression rows.
count_df = pd.concat([leiden_series.to_frame().T, count_df], axis=0)

In [34]:
# Drop rows whose index entry is missing or an empty string.
has_name = count_df.index.notnull()
is_not_blank = count_df.index != ''
count_df = count_df.loc[has_name & is_not_blank]


In [35]:
# Display the assembled table (first row 'Label', then one row per gene).
count_df



barcode,AAACCCAGTAGGCTCC-1,AAACCCAGTCGCAACC-1,AAACCCATCCACAAGT-1,AAACGCTTCACTACGA-1,AAACGCTTCAGCCTCT-1,AAAGAACAGATACAGT-1,AAAGAACAGCCTCGTG-1,AAAGAACAGTAATCCC-1,AAAGAACCAACATCGT-1,AAAGAACGTCGCGGTT-1,...,TTGTGGATCTAGCAAC-1,TTGTTCACAGGTCAAG-1,TTGTTTGAGCTCATAC-1,TTTACGTTCAAGGCTT-1,TTTACTGCAGCTACAT-1,TTTACTGTCATTACTC-1,TTTCACAGTGATATAG-1,TTTCACATCGATGCAT-1,TTTCATGTCCCTCTAG-1,TTTGACTGTTCCGGTG-1
Label,1,11,5,8,6,1,6,8,6,1,...,3,3,7,7,3,14,3,3,7,3
HES4,1.583742,1.817618,0.0,1.823131,0.470256,1.036066,0.88049,1.708302,2.782923,1.21865,...,1.621794,2.052788,2.241301,0.0,1.883256,2.67967,1.877891,1.520372,1.049049,0.763774
ISG15,0.0,0.0,0.0,0.0,0.470256,0.0,0.0,0.918898,0.866574,0.0,...,0.0,0.0,1.335489,0.0,1.050225,1.127936,0.802469,0.0,0.0,0.763774
C1QTNF12,0.0,2.106576,0.0,0.0,0.0,0.0,0.0,0.0,0.524357,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MFAP2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.335489,0.0,1.7309,0.0,0.479658,1.520372,1.049049,0.763774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DANT2,0.0,1.634275,0.0,0.0,1.946486,0.646608,0.0,0.0,0.0,0.0,...,0.0,0.0,1.335489,0.0,1.7309,0.71522,0.479658,2.097755,0.0,0.0
KLHL13,0.0,0.0,0.0,0.0,1.030051,1.036066,0.0,1.949632,0.0,0.0,...,0.0,0.0,0.0,0.0,1.33169,0.0,0.0,1.520372,0.0,0.763774
ZNF185,0.0,0.0,1.291329,0.712088,0.0,0.646608,0.0,0.0,0.0,1.21865,...,1.858828,2.052788,1.887631,0.0,1.7309,0.0,0.479658,2.097755,0.0,2.199947
L1CAM,0.0,0.0,0.0,0.712088,0.0,0.0,0.88049,0.0,0.0,0.784499,...,0.0,1.480421,1.335489,0.0,0.657078,0.71522,0.479658,2.097755,0.0,0.0


## Print the matrix and save it as CSV

In [42]:
# Write the PFA input table to <directory>/output.csv.

# Defensive: if a literal 'barcode' column is the first column, use it as the index.
# NOTE(review): in the table built from adata the barcodes are the column labels
# and the index holds 'Label' plus the gene names, so this branch is not
# expected to trigger — confirm whether it is still needed.
if count_df.columns[0] == 'barcode':
    count_df = count_df.set_index('barcode')

# Output directory, relative to the notebook's working directory.
directory = '../../data'

# exist_ok=True makes this a no-op when the directory already exists and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

# Full path of the CSV file.
file_path = os.path.join(directory, 'output.csv')

# Write the table including its index (the 'Label' row name and the gene names).
count_df.to_csv(file_path)

print(f"DataFrame saved to {file_path}")

DataFrame saved to ../../data/output.csv


In [43]:
# Display the table again after it has been written to disk.
count_df

barcode,AAACCCAGTAGGCTCC-1,AAACCCAGTCGCAACC-1,AAACCCATCCACAAGT-1,AAACGCTTCACTACGA-1,AAACGCTTCAGCCTCT-1,AAAGAACAGATACAGT-1,AAAGAACAGCCTCGTG-1,AAAGAACAGTAATCCC-1,AAAGAACCAACATCGT-1,AAAGAACGTCGCGGTT-1,...,TTGTGGATCTAGCAAC-1,TTGTTCACAGGTCAAG-1,TTGTTTGAGCTCATAC-1,TTTACGTTCAAGGCTT-1,TTTACTGCAGCTACAT-1,TTTACTGTCATTACTC-1,TTTCACAGTGATATAG-1,TTTCACATCGATGCAT-1,TTTCATGTCCCTCTAG-1,TTTGACTGTTCCGGTG-1
Label,1,11,5,8,6,1,6,8,6,1,...,3,3,7,7,3,14,3,3,7,3
HES4,1.583742,1.817618,0.0,1.823131,0.470256,1.036066,0.88049,1.708302,2.782923,1.21865,...,1.621794,2.052788,2.241301,0.0,1.883256,2.67967,1.877891,1.520372,1.049049,0.763774
ISG15,0.0,0.0,0.0,0.0,0.470256,0.0,0.0,0.918898,0.866574,0.0,...,0.0,0.0,1.335489,0.0,1.050225,1.127936,0.802469,0.0,0.0,0.763774
C1QTNF12,0.0,2.106576,0.0,0.0,0.0,0.0,0.0,0.0,0.524357,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MFAP2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.335489,0.0,1.7309,0.0,0.479658,1.520372,1.049049,0.763774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DANT2,0.0,1.634275,0.0,0.0,1.946486,0.646608,0.0,0.0,0.0,0.0,...,0.0,0.0,1.335489,0.0,1.7309,0.71522,0.479658,2.097755,0.0,0.0
KLHL13,0.0,0.0,0.0,0.0,1.030051,1.036066,0.0,1.949632,0.0,0.0,...,0.0,0.0,0.0,0.0,1.33169,0.0,0.0,1.520372,0.0,0.763774
ZNF185,0.0,0.0,1.291329,0.712088,0.0,0.646608,0.0,0.0,0.0,1.21865,...,1.858828,2.052788,1.887631,0.0,1.7309,0.0,0.479658,2.097755,0.0,2.199947
L1CAM,0.0,0.0,0.0,0.712088,0.0,0.0,0.88049,0.0,0.0,0.784499,...,0.0,1.480421,1.335489,0.0,0.657078,0.71522,0.479658,2.097755,0.0,0.0


# Test reading the PFA input table

In [41]:
# Print the current working directory; the relative CSV path used in the next
# cell is resolved against it.
!pwd

/home2/lumpi/Projects/single-cell_samantha/code/scripts


In [None]:
import pandas as pd

# Location of the CSV file to read back.
path="../../data/output.csv" #Please insert correct file input name

# Load every line as data; no line is treated as a header at this point.
data = pd.read_csv(path, header=None)

# The second line of the file supplies the column names (as strings).
header_row = data.iloc[1].astype(str)

# Remove the first two lines and renumber the remaining rows from 0.
data = data.drop(index=[0, 1]).reset_index(drop=True)
data.columns = header_row

# Show the first rows to check the header and the values.
print(data.head())

In [47]:
# Display the table that was read back from the CSV file.
data

1,Label,1.1,11,5,8,6,1.2,6.1,8.1,6.2,...,3,3.1,7,7.1,3.2,14,3.3,3.4,7.2,3.5
0,HES4,1.583741545677185,1.8176181316375732,0.0,1.8231309652328491,0.470255583524704,1.0360662937164307,0.880490243434906,1.7083015441894531,2.7829229831695557,...,1.621793508529663,2.052788496017456,2.2413008213043213,0.0,1.8832560777664185,2.6796696186065674,1.8778905868530273,1.5203723907470703,1.0490491390228271,0.7637744545936584
1,ISG15,0.0,0.0,0.0,0.0,0.470255583524704,0.0,0.0,0.9188977479934692,0.8665735125541687,...,0.0,0.0,1.3354885578155518,0.0,1.0502245426177979,1.1279358863830566,0.8024693727493286,0.0,0.0,0.7637744545936584
2,C1QTNF12,0.0,2.1065757274627686,0.0,0.0,0.0,0.0,0.0,0.0,0.5243574976921082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MFAP2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.3354885578155518,0.0,1.7308998107910156,0.0,0.47965794801712036,1.5203723907470703,1.0490491390228271,0.7637744545936584
4,PADI2,0.0,0.0,0.0,0.712087869644165,0.0,2.112929344177246,0.0,0.0,0.0,...,0.0,0.0,0.0,0.8673115968704224,1.3316901922225952,0.0,0.47965794801712036,0.0,2.1305789947509766,0.7637744545936584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,DANT2,0.0,1.634275,0.0,0.0,1.946486,0.646608,0.0,0.0,0.0,...,0.0,0.0,1.335489,0.0,1.7309,0.71522,0.479658,2.097755,0.0,0.0
576,KLHL13,0.0,0.0,0.0,0.0,1.030051,1.036066,0.0,1.949632,0.0,...,0.0,0.0,0.0,0.0,1.33169,0.0,0.0,1.520372,0.0,0.763774
577,ZNF185,0.0,0.0,1.291329,0.712088,0.0,0.646608,0.0,0.0,0.0,...,1.858828,2.052788,1.887631,0.0,1.7309,0.0,0.479658,2.097755,0.0,2.199947
578,L1CAM,0.0,0.0,0.0,0.712088,0.0,0.0,0.88049,0.0,0.0,...,0.0,1.480421,1.335489,0.0,0.657078,0.71522,0.479658,2.097755,0.0,0.0


# Download and Execute PFA related scripts

## Download the complete PFA pipeline repository

In [55]:
# Clone only when the checkout is missing, so re-running the notebook does not
# fail because the destination path 'Seurat_PFA_pipeline' already exists.
![ -d Seurat_PFA_pipeline ] || git clone https://github.com/AC-PHD/Seurat_PFA_pipeline.git

Cloning into 'Seurat_PFA_pipeline'...
remote: Enumerating objects: 247, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 247 (delta 49), reused 0 (delta 0), pack-reused 142[K
Receiving objects: 100% (247/247), 562.21 KiB | 11.96 MiB/s, done.
Resolving deltas: 100% (91/91), done.


# PFA 1: 

## Go to the folder `Seurat_PFA_pipeline/06_PFA`

In [None]:
### 01_Prepare_Data_Set

In [None]:
### 02_PFA_gene_selection

In [None]:
### 03_Validate_PFA_Results