In [132]:
import numpy as np
import pandas as pd
import requests
import sys
import regex as re

In [123]:
BAF = pd.read_csv("BAF.csv")
mediator = pd.read_csv("mediator.csv")
P300CBP = pd.read_csv("P300CBP.csv")
SAGA = pd.read_csv("SAGA.csv")
TFIID = pd.read_csv("TFIID.csv")
TIP60 = pd.read_csv("TIP60.csv")
others = pd.read_csv("others.csv")

In [124]:
combined = pd.concat([BAF, mediator, P300CBP, SAGA, TFIID, TIP60, others]).drop_duplicates()
combined.to_csv("cofactors_combined.csv")

In [126]:
mappedcofactors = pd.read_csv("cofactors_mapped.tsv", delimiter="\t")

#Check there is a reviewed version of each Uniprot entry, remove unreviewed versions
if all(mappedcofactors["From"].unique() == mappedcofactors[mappedcofactors["Reviewed"] == "reviewed"]["From"].unique()):
    mappedcofactors = mappedcofactors[mappedcofactors["Reviewed"] == "reviewed"]

#Check all of the cofactors in combined list are included in mapped cofactors (due to name duplication)
cofactornames = " ".join(mappedcofactors["Gene Names"].to_list()).split(" ")
print("All cofactors included? ", all([True for i in combined["Gene Name"] if i in cofactornames]))

mappedcofactors = mappedcofactors.rename({"From": "Gene Name", "Entry": "UniprotID"}, axis = 1).drop(["Reviewed"], axis = 1)

mappedcofactors.head()

All cofactors included?  True


Unnamed: 0,Gene Name,UniprotID,Entry Name,Protein names,Gene Names,Ensembl,GeneID,PDB,Sequence,Alternative products (isoforms)
0,BAF180,Q86U86,PB1_HUMAN,Protein polybromo-1 (hPB1) (BRG1-associated fa...,PBRM1 BAF180 PB1,ENST00000296302.11 [Q86U86-1];ENST00000337303....,55193;,2KTB;3G0J;3HMF;3IU5;3IU6;3K2J;3LJW;3MB4;3TLP;4...,MGSKRRRATSPSSSVSGDFDDGHHSVSTPGPSRKRRRLSNLPTVDP...,ALTERNATIVE PRODUCTS: Event=Alternative splic...
1,BCL11A,Q9H165,BC11A_HUMAN,B-cell lymphoma/leukemia 11A (BCL-11A) (B-cell...,BCL11A CTIP1 EVI9 KIAA1809 ZNF856,ENST00000335712.11 [Q9H165-6];ENST00000356842....,53335;,5VTB;6KI6;6U9Q;8DTN;8DTU;8THO;8TLO;9BV0;,MSRRKQGKPQHLSKREFSPEPLEAILTDDEPDHGPLGAPEGDHDLL...,ALTERNATIVE PRODUCTS: Event=Alternative splic...
20,BCL11B,Q9C0K0,BC11B_HUMAN,B-cell lymphoma/leukemia 11B (BCL-11B) (B-cell...,BCL11B CTIP2 RIT1,ENST00000345514.2 [Q9C0K0-2];ENST00000357195.8...,64919;,,MSRRKQGNPQHLSQRELITPEADHVEAAILEEDEGLEIEEPSGLGL...,ALTERNATIVE PRODUCTS: Event=Alternative splic...
24,BAF60A,Q96GM5,SMRD1_HUMAN,SWI/SNF-related matrix-associated actin-depend...,SMARCD1 BAF60A,ENST00000381513.8 [Q96GM5-2];ENST00000394963.9...,6602;,6LTH;6LTJ;7VDV;7Y8R;,MAARAGFQSVAPSGGAGASGGAGAAAALGPGGTPGPPVRMGPAPGQ...,ALTERNATIVE PRODUCTS: Event=Alternative splic...
25,BAF60B,Q92925,SMRD2_HUMAN,SWI/SNF-related matrix-associated actin-depend...,SMARCD2 BAF60B PRO2451,ENST00000323347.14 [Q92925-3];ENST00000448276....,6603;,,MSGRGAGGFPLPPLSPGGGAVAAALGAPPPPAGPGMLPGPALRGPG...,ALTERNATIVE PRODUCTS: Event=Alternative splic...


In [162]:
data = combined.merge(mappedcofactors, left_on= "Gene Name", right_on="Gene Name")
data.to_csv("cofactors_mapped_combined.csv")

In [200]:
ensembl_list = data[["Gene Name", "UniprotID", "Ensembl", "Sequence", "Alternative products (isoforms)"]].drop_duplicates()

#Replaces alt isoforms with num
num_isoforms = []
for i in ensembl_list["Alternative products (isoforms)"]:
    try:
        num = re.search("(?<=Named isoforms=)[0-9]+", i)[0]
        num_isoforms.append(int(num))
    except:
        num_isoforms.append(0)

ensembl_list["NumIsoforms"] = num_isoforms
ensembl_list = ensembl_list.drop("Alternative products (isoforms)", axis=1).reset_index(drop=True)
ensembl_list.head()

Unnamed: 0,Gene Name,UniprotID,Ensembl,Sequence,NumIsoforms
0,BAF180,Q86U86,ENST00000296302.11 [Q86U86-1];ENST00000337303....,MGSKRRRATSPSSSVSGDFDDGHHSVSTPGPSRKRRRLSNLPTVDP...,9
1,BCL11A,Q9H165,ENST00000335712.11 [Q9H165-6];ENST00000356842....,MSRRKQGKPQHLSKREFSPEPLEAILTDDEPDHGPLGAPEGDHDLL...,5
2,BCL11B,Q9C0K0,ENST00000345514.2 [Q9C0K0-2];ENST00000357195.8...,MSRRKQGNPQHLSQRELITPEADHVEAAILEEDEGLEIEEPSGLGL...,2
3,BAF60A,Q96GM5,ENST00000381513.8 [Q96GM5-2];ENST00000394963.9...,MAARAGFQSVAPSGGAGASGGAGAAAALGPGGTPGPPVRMGPAPGQ...,2
4,BAF60B,Q92925,ENST00000323347.14 [Q92925-3];ENST00000448276....,MSGRGAGGFPLPPLSPGGGAVAAALGAPPPPAGPGMLPGPALRGPG...,3


In [None]:
%%skip

#Generates DF of all ensembl isoforms
def separate_ensemblids(ensembls):
    ens, prot = [], []
    ensembllist = "".join(ensembls).split(";")
    for i in ensembllist[:-1]:
        ensid = re.search("(ENST[0-9]*\.[0-9]*)|(ENST[0-9]*)", i)
        protid = re.search("\[(.*)\]", i)

        if ensid:
            ens.append(ensid[0])
        else:
            ens.append(np.NaN)
        if protid:
            prot.append(protid[1])
        else:
            prot.append(np.NaN)
        
    return ens, prot, len(ens)

ensembl_df = pd.DataFrame({"Gene Name": [], "UniprotIso": [], "EnsemblID": [], "NumEns = NumIso": []})

for i in range(len(ensembl_list)):
    row = ensembl_list.loc[i]
    genename = row["Gene Name"]
    ensembls = row["Ensembl"]
    num_iso = row["NumIsoforms"]

    try:
        ens, prot, ens_len = separate_ensemblids(ensembls)
    except Exception as e:
        print(genename, e)
    
    new =  pd.DataFrame({"Gene Name": genename, "UniprotIso": prot, "EnsemblID": ens, "NumEns = NumIso": (ens_len == num_iso)})
    ensembl_df = pd.concat([ensembl_df, new])

ensembl_df.to_csv("ensembl_isoform.csv")


POLR2A can only join an iterable


In [315]:
ensembl_df

Unnamed: 0,Gene Name,UniprotIso,EnsemblID,NumEns = NumIso
0,BAF180,Q86U86-1,ENST00000296302.11,1.0
1,BAF180,Q86U86-5,ENST00000337303.8,1.0
2,BAF180,Q86U86-3,ENST00000356770.8,1.0
3,BAF180,Q86U86-4,ENST00000394830.7,1.0
4,BAF180,Q86U86-2,ENST00000409057.5,1.0
...,...,...,...,...
9,SMARCE1,Q969G3-4,ENST00000647508.1,False
0,SMARCAD1,Q9H4L7-1,ENST00000354268.9,False
1,SMARCAD1,Q9H4L7-2,ENST00000359052.8,False
2,SMARCAD1,Q9H4L7-2,ENST00000457823.6,False
