# 2-2-Transcription-factors-literature-2000-2009
Jakke Neiro$^1$
1. Aboobaker laboratory, Department of Zoology, University of Oxford

## Contents of notebook
* 1. Introduction
* 2. 2005
    * 2.1 Identification of genes needed for regeneration, stem cell function, and tissue homeostasis by systematic gene perturbation in planaria
* 3. 2006
* 4. 2007
* 5. 2008
    * 5.1 Expression of a retinal homeobox (Rx) gene during planarian regeneration
    * 5.2 Molecular analysis of stem cells and their descendants during cell turnover and regeneration in the planarian Schmidtea mediterranea
* 6. 2009
    * 6.1 Planarian Hh signaling regulates regeneration polarity and links Hh pathway evolution to cilia

## Files
* Input: fasta files
* Output: ID correspondence csv files

# 1. Introduction

The planarian literature on transcription factors published between 2000 and 2009 was reviewed, and the correspondence between the TFs described in the literature and the TFs described in this study was established.

In [9]:
import pandas as pd
# Load the tab-separated transcript map (presumably gffcompare .tmap output,
# judging by the file name) and keep only the columns at positions 0, 3 and 4.
# tf_table() filters the resulting table on a "qry_id" column and reads the
# first two retained columns by position as the Rink and Neiro gene IDs.
gffcmp = pd.read_csv("/hydra/sexual_genome_annotation_files/ncrna_Neiro/gffcmp.stringtie_merged.gtf.tmap", sep="\t")
gene2transcript = gffcmp.iloc[:,[0,3,4]]

In [6]:
def tf_table(blast_table, reference_name, dataframe, gene_map=None):
    """Build the ID-correspondence table for one publication.

    Parameters
    ----------
    blast_table : pandas.DataFrame
        One row per BLAST hit to report. Only the first two columns are
        read, by position: column 0 is the published sequence ID and
        column 1 is the ID of the matched transcript.
    reference_name : str
        Citation label written to every row of the ``Source`` column.
    dataframe : pandas.DataFrame
        Lookup of the published sequences. It must have an ``ID`` column;
        the value in its second column is used as the published gene name.
    gene_map : pandas.DataFrame, optional
        Transcript-to-gene lookup with a ``qry_id`` column. Its first and
        second columns supply ``Rink.ID`` and ``Neiro.ID`` respectively.
        Defaults to the notebook-level ``gene2transcript`` table.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``blast_table`` with the columns
        ``Transcript.ID``, ``Rink.ID``, ``Neiro.ID``, ``Original.ID``,
        ``Original.name`` and ``Source``.

    Raises
    ------
    IndexError
        If a transcript ID is absent from ``gene_map`` or a published
        sequence ID is absent from ``dataframe``.
    """
    if gene_map is None:
        # Fall back to the table loaded at the top of the notebook.
        gene_map = gene2transcript

    columns = ["Transcript.ID", "Rink.ID", "Neiro.ID", "Original.ID", "Original.name", "Source"]
    records = []
    for o_id, t_id in zip(blast_table.iloc[:, 0], blast_table.iloc[:, 1]):
        # Filter the gene map once per hit and reuse the match for both gene IDs.
        gene_rows = gene_map[gene_map["qry_id"] == t_id]
        if gene_rows.empty:
            raise IndexError("transcript {!r} not found in the transcript-to-gene map".format(t_id))
        name_rows = dataframe[dataframe["ID"] == o_id]
        if name_rows.empty:
            raise IndexError("sequence ID {!r} not found in the name lookup table".format(o_id))
        # Drop the ".<version>" suffix; an ID without a suffix is kept whole
        # (rpartition alone would yield an empty string for it).
        accession = o_id.rpartition(".")[0] or o_id
        records.append((
            t_id,
            gene_rows.iloc[0, 0],
            gene_rows.iloc[0, 1],
            accession,
            name_rows.iloc[0, 1],
            reference_name,
        ))
    return pd.DataFrame(records, columns=columns)

In [None]:
%%bash
# Build the nucleotide BLAST database from the StringTie transcripts.
# The absolute /hydra/FACS path is the one the Reddien2005 and Mannini2008
# blastn cells pass to -db, so the database location does not depend on
# what ~ expands to for the current user.
cd /hydra/TF_data/
makeblastdb -in /hydra/FACS/stringtie_transcripts.fa -parse_seqids -blastdb_version 5 -dbtype nucl

# 2. 2005

## 2.1 Identification of genes needed for regeneration, stem cell function, and tissue homeostasis by systematic gene perturbation in planaria
Reddien, P.W., Bermange, A.L., Murfitt, K.J., Jennings, J.R. and Alvarado, A.S., 2005. Identification of genes needed for regeneration, stem cell function, and tissue homeostasis by systematic gene perturbation in planaria. Developmental Cell, 8(5), pp.635-649.

### RNAi reduces regeneration, BLST(2–2.5), and perturbs behavior (n=11)
* NBE.5.04A	POU domain gene 50

### RNAi allows regeneration but perturbs photoreceptor formation (n=26)
* HE.2.05E	myocyte enhancing factor 2

### RNAi allows regeneration but causes abnormal behavior (n=25)
* NBE.6.12B	Transcription factor BTF3
* NBE.7.10A Zinc Finger Iguana/Dzip1 

### RNAi allows regeneration but causes other defects (n=27)
* NBE.3.07F	hunchback TXN factor

In [22]:
%%bash
# Print every line of Reddien2005.fa that contains ">" (the FASTA headers).
cd /hydra/TF_data
grep ">" Reddien2005.fa

>AY967629.1 Schmidtea mediterranea clone NBE.5.04A mRNA sequence
>AY967495.1 Schmidtea mediterranea clone HE.2.05E mRNA sequence
>AY967662.1 Schmidtea mediterranea clone NBE.6.12B mRNA sequence
>AY967686.1 Schmidtea mediterranea clone NBE.7.10A mRNA sequence
>AY967596.1 Schmidtea mediterranea clone NBE.3.07F mRNA sequence


In [23]:
import pandas as pd

# Sequence ID -> short gene name for the Reddien et al. (2005) clones,
# one (ID, Name) record per sequence.
Reddien2005 = pd.DataFrame(
    [
        ("AY967629.1", "POU50"),
        ("AY967495.1", "Mef2"),
        ("AY967662.1", "Btf3"),
        ("AY967686.1", "Iguana"),
        ("AY967596.1", "Hb"),
    ],
    columns=["ID", "Name"],
)

In [24]:
%%bash
# Query the Reddien2005.fa sequences against the transcript database and
# write the hits to blast_Reddien2005 in tabular format (-outfmt 6).
cd /hydra/TF_data/
blastn -db /hydra/FACS/stringtie_transcripts.fa -query Reddien2005.fa -out blast_Reddien2005 -outfmt 6

In [25]:
import pandas as pd
# Load the tabular BLAST result; the file has no header row.
blast_Reddien2005 = pd.read_csv("/hydra/TF_data/blast_Reddien2005", sep="\t", header=None)
# NOTE: with blastn's default -outfmt 6 column order, column 0 (labelled
# "Seqid" here) is the query sequence ID and column 1 (labelled "qid") is the
# matched database transcript. tf_table() reads both by position, not by label.
blast_Reddien2005.columns = ["Seqid", "qid", "Per", "Len", "Mis", "Gap", "Startq", "Endq", "Starts", "Ends", "E", "Bit"]
# Keep only the row with the highest "Bit" value within each "Seqid" group.
blast_Reddien2005 = blast_Reddien2005.loc[blast_Reddien2005.groupby("Seqid")["Bit"].idxmax()]

In [26]:
# Map each retained BLAST hit to its gene IDs and published gene name.
TF_table_Reddien2005 = tf_table(blast_Reddien2005, "Reddien et al. (2005)", Reddien2005)

In [27]:
# Write the correspondence table to CSV without the DataFrame index column.
TF_table_Reddien2005.to_csv("/hydra/TF_data/TF_Reddien2005.csv", index=False)

In [28]:
# Read the saved CSV back so the cell output shows what is on disk.
pd.read_csv("/hydra/TF_data/TF_Reddien2005.csv")

Unnamed: 0,Transcript.ID,Rink.ID,Neiro.ID,Original.ID,Original.name,Source
0,MSTRG.12930.3,SMESG000040479.1,MSTRG.12930,AY967495,Mef2,Reddien et al. (2005)
1,SMEST017121002.1,SMESG000017121.1,MSTRG.5364,AY967596,Hb,Reddien et al. (2005)
2,SMEST076173001.1,SMESG000076173.1,MSTRG.22352,AY967629,POU50,Reddien et al. (2005)
3,SMEST059040001.1,SMESG000059040.1,MSTRG.15727,AY967686,Iguana,Reddien et al. (2005)


# 3. 2006

# 4. 2007

# 5. 2008

## 5.1 Expression of a retinal homeobox (Rx) gene during planarian regeneration

Mannini, L., Deri, P., Picchi, J. and Batistoni, R., 2008. Expression of a retinal homeobox (Rx) gene during planarian regeneration. International Journal of Developmental Biology, 52(8), pp.1113-1117.

* AM942443

In [1]:
%%bash
# Print every line of Mannini2008.fa that contains ">" (the FASTA headers).
cd /hydra/TF_data
grep ">" Mannini2008.fa

>AM942443.1 Schmidtea mediterranea mRNA for retinal homeobox protein Rx (Rx gene), asexual strain


In [2]:
import pandas as pd

# Sequence ID -> short gene name for the single Mannini et al. (2008) sequence.
Mannini2008 = pd.DataFrame(
    [("AM942443.1", "Rx")],
    columns=["ID", "Name"],
)

In [3]:
%%bash
# Query the Mannini2008.fa sequence against the transcript database and
# write the hits to blast_Mannini2008 in tabular format (-outfmt 6).
cd /hydra/TF_data/
blastn -db /hydra/FACS/stringtie_transcripts.fa -query Mannini2008.fa -out blast_Mannini2008 -outfmt 6

In [4]:
import pandas as pd
# Load the tabular BLAST result; the file has no header row.
blast_Mannini2008 = pd.read_csv("/hydra/TF_data/blast_Mannini2008", sep="\t", header=None)
# NOTE: with blastn's default -outfmt 6 column order, column 0 (labelled
# "Seqid" here) is the query sequence ID and column 1 (labelled "qid") is the
# matched database transcript. tf_table() reads both by position, not by label.
blast_Mannini2008.columns = ["Seqid", "qid", "Per", "Len", "Mis", "Gap", "Startq", "Endq", "Starts", "Ends", "E", "Bit"]
# Keep only the row with the highest "Bit" value within each "Seqid" group.
blast_Mannini2008 = blast_Mannini2008.loc[blast_Mannini2008.groupby("Seqid")["Bit"].idxmax()]

In [10]:
# Map each retained BLAST hit to its gene IDs and published gene name.
TF_table_Mannini2008 = tf_table(blast_Mannini2008, "Mannini et al. (2008)", Mannini2008)

In [11]:
# Write the correspondence table to CSV without the DataFrame index column.
TF_table_Mannini2008.to_csv("/hydra/TF_data/TF_Mannini2008.csv", index=False)

In [12]:
# Read the saved CSV back so the cell output shows what is on disk.
pd.read_csv("/hydra/TF_data/TF_Mannini2008.csv")

Unnamed: 0,Transcript.ID,Rink.ID,Neiro.ID,Original.ID,Original.name,Source
0,SMEST033843001.1,SMESG000033843.1,MSTRG.10612,AM942443,Rx,Mannini et al. (2008)


## 5.2 Molecular analysis of stem cells and their descendants during cell turnover and regeneration in the planarian Schmidtea mediterranea

Eisenhoffer, G.T., Kang, H. and Alvarado, A.S., 2008. Molecular analysis of stem cells and their descendants during cell turnover and regeneration in the planarian Schmidtea mediterranea. Cell Stem Cell, 3(3), pp.327-339.

We find that genes with severely downregulated expression after irradiation molecularly define at least three discrete subpopulations of cells.

# 6. 2009

## 6.1 Planarian Hh signaling regulates regeneration polarity and links Hh pathway evolution to cilia

Rink, J.C., Gurley, K.A., Elliott, S.A. and Alvarado, A.S., 2009. Planarian Hh signaling regulates regeneration polarity and links Hh pathway evolution to cilia. Science, 326(5958), pp.1406-1410.

* The paper characterizes the Hh pathway in planarians.
* Systematic sequence homology searching of the S. mediterranea genome identified three homologs for Gli. Of the Gli homologs, only Smed-gli-1 exhibited an obvious role in Hh signaling. 
* gli-1 expression in cells surrounding the gut enterocytes and particularly strong ptc upregulation upon sufu(RNAi) in the same region may indicate a conserved function of Hh in the gastrovascular system. 

In [1]:
import pandas as pd

# Sequence ID -> short gene name for the three Rink et al. (2009) gli sequences,
# one (ID, Name) record per sequence.
Rink2009 = pd.DataFrame(
    [
        ("GQ337478.1", "gli-1"),
        ("GQ337479.1", "gli-2"),
        ("GQ337480.1", "gli-3"),
    ],
    columns=["ID", "Name"],
)

In [2]:
%%bash
# Print every line of Rink2009.fa that contains ">" (the FASTA headers).
cd /hydra/TF_data
grep ">" Rink2009.fa

>GQ337478.1 Schmidtea mediterranea GLI-1 (gli-1) mRNA, complete cds
>GQ337479.1 Schmidtea mediterranea GLI-2 (gli-2) mRNA, complete cds
>GQ337480.1 Schmidtea mediterranea GLI-3 (gli-3) mRNA, complete cds


In [None]:
%%bash
# Query the Rink2009.fa sequences against the transcript database and write
# the hits to blast_Rink2009 in tabular format (-outfmt 6). The database is
# addressed by the same absolute path as in the Reddien2005 and Mannini2008
# blastn cells, so the result does not depend on what ~ expands to.
cd /hydra/TF_data/
blastn -db /hydra/FACS/stringtie_transcripts.fa -query Rink2009.fa -out blast_Rink2009 -outfmt 6

In [3]:
import pandas as pd
# Load the tabular BLAST result; the file has no header row.
blast_Rink2009 = pd.read_csv("/hydra/TF_data/blast_Rink2009", sep="\t", header=None)
# NOTE: with blastn's default -outfmt 6 column order, column 0 (labelled
# "Seqid" here) is the query sequence ID and column 1 (labelled "qid") is the
# matched database transcript. The code below reads both by position.
blast_Rink2009.columns = ["Seqid", "qid", "Per", "Len", "Mis", "Gap", "Startq", "Endq", "Starts", "Ends", "E", "Bit"]
# Keep only the row with the highest "Bit" value within each "Seqid" group.
blast_Rink2009 = blast_Rink2009.loc[blast_Rink2009.groupby("Seqid")["Bit"].idxmax()]

In [6]:
# Build the Rink et al. (2009) correspondence table with the shared tf_table()
# helper, as is done for the other publications in this notebook. This keeps
# the per-row lookup logic in one place and leaves no scratch lists behind in
# the kernel namespace.
TF_table_Rink2009 = tf_table(blast_Rink2009, "Rink et al. (2009)", Rink2009)

In [None]:
# Write the correspondence table to CSV without the DataFrame index column.
TF_table_Rink2009.to_csv("/hydra/TF_data/TF_Rink2009.csv", index=False)

In [8]:
# Read the saved CSV back so the cell output shows what is on disk.
pd.read_csv("/hydra/TF_data/TF_Rink2009.csv")

Unnamed: 0,Transcript.ID,Rink.ID,Neiro.ID,Original.ID,Original.name,Source
0,MSTRG.20466.1,SMESG000073727.1,MSTRG.20466,GQ337478,gli-1,Rink et al. (2009)
1,SMEST060276001.1,SMESG000060276.1,MSTRG.17923,GQ337479,gli-2,Rink et al. (2009)
2,MSTRG.22990.2,SMESG000078300.1,MSTRG.22990,GQ337480,gli-3,Rink et al. (2009)


# FINISHED