In [14]:
import numpy as np
import pandas as pd
import ast
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
import scipy.stats as stat
import re

## AD specificity

In [None]:
adsmapped = pd.read_csv("maps/ADs_mapped.tsv", sep="\t", quoting=3)
ads_specificity = pd.read_csv("../cofactors_humanproteinatlas/helperdata/AD_specificity.csv").drop("Unnamed: 0", axis=1)

#Clean TAU columns
for col_label in ads_specificity.columns.to_list():
    if "TAU" in col_label:
        ads_specificity[col_label] = ads_specificity[col_label].apply(lambda x: np.NaN if x == "Not detected" else float(x))

adsmapped["Bgee"] = [str(i).replace(";", "") for i in adsmapped["Bgee"]]
ads = adsmapped.merge(ads_specificity, left_on="Bgee", right_on="Ensembl", how="left").drop_duplicates()
ads.head()

Index(['Gene', 'Ensembl', 'Single Cell Type RNA - Adipocytes [nTPM]',
       'Single Cell Type RNA - Alveolar cells type 1 [nTPM]',
       'Single Cell Type RNA - Alveolar cells type 2 [nTPM]',
       'Single Cell Type RNA - Astrocytes [nTPM]',
       'Single Cell Type RNA - B-cells [nTPM]',
       'Single Cell Type RNA - Basal keratinocytes [nTPM]',
       'Single Cell Type RNA - Basal prostatic cells [nTPM]',
       'Single Cell Type RNA - Basal respiratory cells [nTPM]',
       'Single Cell Type RNA - Basal squamous epithelial cells [nTPM]',
       'Single Cell Type RNA - Bipolar cells [nTPM]',
       'Single Cell Type RNA - Breast glandular cells [nTPM]',
       'Single Cell Type RNA - Breast myoepithelial cells [nTPM]',
       'Single Cell Type RNA - Cardiomyocytes [nTPM]',
       'Single Cell Type RNA - Cholangiocytes [nTPM]',
       'Single Cell Type RNA - Ciliated cells [nTPM]',
       'Single Cell Type RNA - Club cells [nTPM]',
       'Single Cell Type RNA - Collecting duct ce

In [None]:
ads_scRNA = pd.read_csv("../cofactors_humanproteinatlas/helperdata/AD_scRNA.csv").drop("Unnamed: 0", axis=1)

In [53]:
def cleanmotif(text):
    # Extract positions and notes
    positions = re.findall(r'MOTIF (\d+)\.\.(\d+);', text)
    descriptions = re.findall(r'/note="([^"]+)"', text)
    # Convert positions to tuples of integers
    positions = [(int(start), int(end)) for start, end in positions]
    # Clean descriptions (remove trailing numbering)
    cleaned_descriptions = [re.sub(r'\s\d+$', '', desc) for desc in descriptions]
    return cleaned_descriptions, positions

def cleandomain(text):
    # Extract positions and notes
    positions = re.findall(r'DOMAIN (\d+)\.\.(\d+);', text)
    descriptions = re.findall(r'/note="([^"]+)"', text)
    # Convert positions to tuples of integers
    positions = [(int(start), int(end)) for start, end in positions]
    # Clean descriptions (remove trailing numbering)
    cleaned_descriptions = [re.sub(r'\s\d+$', '', desc) for desc in descriptions]
    return cleaned_descriptions, positions


def RNAcellspecificity(df):
    """
    Generates a DF cell specificity genes
    """
    df = df[["Gene", "RNA single cell type specific nTPM"]]

    cellspecific_cells = pd.DataFrame({"Cell Type": [], "Genes": []}).set_index("Cell Type")

    for _, row in df.iterrows():
        celldict = row["RNA single cell type specific nTPM"]
        if pd.notna(celldict):
            cell_dict = ast.literal_eval(celldict)
            for cell, _ in cell_dict.items():
                if cell not in cellspecific_cells.index.to_list():
                    cellspecific_cells.loc[cell] = [[]]
                cellspecific_cells.at[cell, "Genes"].append(row["Gene"])

    return cellspecific_cells


In [35]:
adTFmotifs = ads[["Entry", "Motif", "Bgee"]]
adTFmotifs[['Motifs', 'Motif Positions']] = adTFmotifs['Motif'].apply(lambda x: pd.Series(cleanmotif(str(x))))
adTFmotifs = adTFmotifs.drop("Motif", axis=1)

adTFmotifs.head()

  adTFmotifs[['Motifs', 'Motif Positions']] = adTFmotifs['Motif'].apply(lambda x: pd.Series(cleanmotif(str(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adTFmotifs[['Motifs', 'Motif Positions']] = adTFmotifs['Motif'].apply(lambda x: pd.Series(cleanmotif(str(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adTFmotifs[['Motifs', 'Motif Positions']] = adTFmotifs['Motif'].apply(lambda x: pd.Series(cleanmotif(str(x))))


Unnamed: 0,Entry,Bgee,Motifs,Motif Positions
0,Q6UWZ7,ENSG00000163322,[pSXXF motif],"[(406, 409)]"
1,Q8IUX7,ENSG00000106624,[],[]
2,Q8WYP5,ENSG00000153207,[],[]
3,Q8WYP5,ENSG00000153207,[],[]
4,Q8WYP5,ENSG00000153207,[],[]


In [29]:
adTFdomains = adsmapped[["Entry", 'Domain [FT]', "Bgee"]]
adTFdomains[['Domains', 'Domain Positions']] = adsmapped['Domain [FT]'].apply(lambda x: pd.Series(cleandomain(str(x))))
adTFdomains = adTFdomains.drop("Domain [FT]", axis=1)

adTFdomains.head()

  adTFdomains[['Domains', 'Domain Positions']] = adsmapped['Domain [FT]'].apply(lambda x: pd.Series(cleandomain(str(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adTFdomains[['Domains', 'Domain Positions']] = adsmapped['Domain [FT]'].apply(lambda x: pd.Series(cleandomain(str(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adTFdomains[['Domains', 'Domain Positions']] = adsmapped['Domain [FT]'].apply(lambda x: pd.Series(cleandomain(str(x))))


Unnamed: 0,Entry,Bgee,Domains,Domain Positions
0,Q6UWZ7,ENSG00000163322,[MPN],"[(7, 160)]"
1,Q8IUX7,ENSG00000106624,"[F5/8 type C, Peptidase M14]","[(383, 540), (563, 904)]"
2,Q8WYP5,ENSG00000153207,[],[]
3,P35869,ENSG00000106546,"[bHLH, PAS, PAS, PAC]","[(27, 80), (111, 181), (275, 342), (348, 386)]"
4,O43918,ENSG00000160224,"[HSR, SAND]","[(1, 105), (181, 280)]"


### Cell specificity of ADs

In [54]:
ads[ads["TAU score - Single Cell Type"] > 0.4]

KeyError: 'TAU score - Single Cell Type'

---
## RDs