# Orthofinder Results

Identifying "interesting" orthogroups and looking at their interproscan results

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display

### Import Interproscan and Orthofinder results

In [4]:
# Load the InterProScan results table (tab-separated file).
interproscan_infn = "/home/jemimah/analysis/v3/interproscan/20200721_walli_all_protiens/Walli.v3.braker2.aa.both.tsv"
tsv_df = pd.read_table(interproscan_infn)

In [5]:
# OrthoFinder outputs: the gene IDs per orthogroup/species, and the matching gene counts.
ortho = "/home/jemimah/analysis/v3/orthofinder/20200911/Results_Sep11/Orthogroups/Orthogroups.tsv"
ortho_count = "/home/jemimah/analysis/v3/orthofinder/20200911/Results_Sep11/Orthogroups/Orthogroups.GeneCount.tsv"

# Both tables are keyed by the "Orthogroup" column.
ortho_species_df = pd.read_table(ortho, index_col="Orthogroup")
ortho_count_df = pd.read_table(ortho_count, index_col="Orthogroup")

# Give the gene-ID table the same column order as the count table,
# leaving out the count table's last column (displayed as "Total").
cols = ortho_count_df.columns.tolist()
ortho_species_df = ortho_species_df.loc[:, cols[:-1]]

### Identify interesting results

So what I want to do is look through ortho_count_df for groups which are present in multiple species, but especially in *W ceracea* 

In [6]:
# The 120 orthogroups with the most W_ceracea genes, largest first.
ortho_count_df.sort_values(by="W_ceracea", ascending=False).head(120)

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OG0000001,0,0,0,0,1,0,0,555,556
OG0000004,0,0,0,1,0,0,2,440,443
OG0000005,0,0,0,0,0,0,1,439,440
OG0000006,0,3,1,0,2,0,1,374,381
OG0000007,0,0,0,0,0,0,0,283,283
...,...,...,...,...,...,...,...,...,...
OG0000125,3,11,6,4,8,3,12,21,68
OG0002324,0,0,0,0,0,0,0,21,21
OG0002070,0,0,1,0,0,0,0,21,22
OG0002322,0,0,0,0,0,0,0,21,21


In [7]:
# Hand-picked orthogroups, chosen from
#     ortho_count_df.sort_values("W_ceracea", ascending=False).iloc[0:120]
# Selection criteria:
#   * genes from at least half of the non-W_ceracea species
#     (preferably more than one gene each), and
#   * W_ceracea contributes the largest number of genes.
# Exception: OG0000082 is kept although it has more H_annuus than W_ceracea genes.
interesting = [
    "OG0000051", "OG0000107", "OG0000029", "OG0000130",
    "OG0000170", "OG0000208", "OG0000357", "OG0000339",
    "OG0000022", "OG0000062", "OG0000111", "OG0000647",
    "OG0000082", "OG0000019", "OG0000038", "OG0001315",
    "OG0000211", "OG0000190", "OG0000125",
]


In [24]:
# Gene counts of the hand-picked orthogroups, looked up within the
# 120 orthogroups that have the most W_ceracea genes.
test_set = ortho_count_df.sort_values(by="W_ceracea", ascending=False).head(120)
test_set.loc[interesting, :]

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OG0000051,1,0,1,4,0,1,8,80,95
OG0000107,1,2,1,0,0,0,6,62,72
OG0000029,0,18,26,16,0,4,3,49,116
OG0000130,1,1,1,1,4,2,16,41,67
OG0000170,1,3,2,5,3,2,2,41,59
OG0000208,0,1,1,2,2,6,4,38,54
OG0000357,1,3,0,1,0,0,2,36,43
OG0000339,1,1,1,1,2,1,1,36,44
OG0000022,19,16,8,2,9,24,22,31,131
OG0000062,2,6,16,5,2,6,23,30,90


In [25]:
# Gene IDs per species for the hand-picked orthogroups.
ortho_species_df.loc[interesting, :]

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
OG0000051,AT4G13320.1,,PGSC0003DMP400047901,"CEY00_Acc13534, CEY00_Acc18427, CEY00_Acc18692...",,Lsat_1_v5_gn_9_109260.1,"HanXRQChr02g0039061, HanXRQChr07g0190251, HanX...","file_1_file_1_g10292.t1, file_1_file_1_g11093...."
OG0000107,ATMG00750.1,"Solyc07g032120.1.1, Solyc10g045120.1.1",PGSC0003DMP400021359,,,,"HanXRQChr05g0135911, HanXRQChr05g0135921, HanX...","file_1_file_1_g11422.t1, file_1_file_1_g11423...."
OG0000029,,"Solyc02g036270.2.1, Solyc03g094100.1.1, Solyc0...","PGSC0003DMP400001764, PGSC0003DMP400005107, PG...","CEY00_Acc03815, CEY00_Acc03888, CEY00_Acc03889...",,"Lsat_1_v5_gn_0_10760.1, Lsat_1_v5_gn_1_83140.1...","HanXRQChr01g0017431, HanXRQChr09g0260361, HanX...","file_1_file_1_g11039.t1, file_1_file_1_g11040...."
OG0000130,AT1G45616.1,Solyc12g013730.1.1,PGSC0003DMP400038082,CEY00_Acc13584,"DCAR_012405, DCAR_012888, DCAR_012889, DCAR_01...","Lsat_1_v5_gn_7_98841.1, Lsat_1_v5_gn_7_98860.1","HanXRQChr04g0104781, HanXRQChr04g0104791, HanX...","file_1_file_1_g33714.t1, file_1_file_1_g35359...."
OG0000170,AT2G45400.1,"Solyc01g094070.2.1, Solyc05g051010.2.1, Solyc0...","PGSC0003DMP400012427, PGSC0003DMP400041037","CEY00_Acc07656, CEY00_Acc09041, CEY00_Acc09042...","DCAR_015897, DCAR_015898, DCAR_015899","Lsat_1_v5_gn_7_7060.1, Lsat_1_v5_gn_7_7100.1","HanXRQChr04g0122431, HanXRQChr04g0122591","file_1_file_1_g11001.t1, file_1_file_1_g11002...."
OG0000208,,Solyc04g078410.2.1,PGSC0003DMP400005946,"CEY00_Acc19869, CEY00_Acc23781","DCAR_008825, DCAR_012796","Lsat_1_v5_gn_1_23220.1, Lsat_1_v5_gn_2_43461.1...","HanXRQChr05g0150131, HanXRQChr09g0267331, HanX...","file_1_file_1_g81966.t1, file_1_file_1_g81969...."
OG0000357,AT3G29785.1,"Solyc00g005160.1.1, Solyc03g096270.1.1, Solyc0...",,CEY00_Acc18592,,,"HanXRQChr12g0374081, HanXRQChr15g0492251","file_1_file_1_g11488.t1, file_1_file_1_g11489...."
OG0000339,AT5G08490.3,Solyc10g005090.2.1,PGSC0003DMP400019952,CEY00_Acc20979,"DCAR_029997, DCAR_031633",Lsat_1_v5_gn_4_29081.1,HanXRQChr17g0564871,"file_1_file_1_g48656.t1, file_1_file_1_g48656...."
OG0000022,"AT1G33540.1, AT1G73270.1, AT1G73280.1, AT1G732...","Solyc04g076120.2.1, Solyc04g077630.2.1, Solyc0...","PGSC0003DMP400014101, PGSC0003DMP400018772, PG...","CEY00_Acc11305, CEY00_Acc12093","DCAR_009087, DCAR_021947, DCAR_021948, DCAR_02...","Lsat_1_v5_gn_1_118280.1, Lsat_1_v5_gn_1_118321...","HanXRQChr01g0007671, HanXRQChr01g0007681, HanX...","file_1_file_1_g16360.t1, file_1_file_1_g16362...."
OG0000062,"AT1G12615.1, AT1G12700.2","Solyc06g005220.2.1, Solyc06g007280.2.1, Solyc0...","PGSC0003DMP400006067, PGSC0003DMP400006070, PG...","CEY00_Acc04192, CEY00_Acc04196, CEY00_Acc17008...","DCAR_002545, DCAR_010216","Lsat_1_v5_gn_2_107460.1, Lsat_1_v5_gn_2_126841...","HanXRQChr07g0203521, HanXRQChr07g0203531, HanX...","file_1_file_1_g20522.t2, file_1_file_1_g20522...."


In [30]:
def ips_summary(orthogroup_list, disp="all"):
    """Summarise the InterProScan hits of the W_ceracea genes in each orthogroup.

    For every orthogroup ID the W_ceracea gene list is read from the global
    ``ortho_species_df``, the rows of the global InterProScan table ``tsv_df``
    whose ``Protein_accession`` is in that list are selected, and the hits are
    collapsed to one row per ``Signature_accession``. A ``count`` column holds
    the number of hits for that signature; rows are sorted by descending count.

    Parameters
    ----------
    orthogroup_list : iterable of str
        Orthogroup IDs; each must be a label in the index of ``ortho_species_df``.
    disp : {"all", "summary", "table"}, default "all"
        "summary" prints the gene/hit counts only, "table" prints the
        orthogroup ID and renders the per-signature table as HTML, "all"
        prints the counts and renders the table. Any other value prints a
        usage hint and nothing else.

    Returns
    -------
    None
        Output is printed / displayed (via ``display(HTML(...))``) only.
    """
    # Reject a bad display mode up front instead of once per orthogroup
    # after all the filtering work has already been done.
    if disp not in ("summary", "table", "all"):
        print("Please declare summary, table, or all")
        return

    hit_cols = ["Protein_accession", "Contig", "SequenceMD5", "Analysis", "Signature_accession",
                "Signature_description", "Interpro_annotation", "Interpro_description"]
    signature_cols = ["Analysis", "Signature_accession", "Signature_description",
                      "Interpro_annotation", "Interpro_description"]

    for x in orthogroup_list:
        walli_genes = ortho_species_df.loc[x, "W_ceracea"]
        # A species without genes in the orthogroup holds NaN rather than a
        # string, which has no .split(); treat that as an empty gene list.
        ortho_wlist = walli_genes.split(", ") if isinstance(walli_genes, str) else []

        ortho_all = tsv_df.loc[tsv_df["Protein_accession"].isin(ortho_wlist), hit_cols]
        counts = ortho_all["Signature_accession"].value_counts()
        # .assign builds a new frame, so the column is never added to a slice.
        ortho_unique = (
            ortho_all.drop_duplicates(subset="Signature_accession")[signature_cols]
            .assign(count=lambda hits: hits["Signature_accession"].map(counts))
            .sort_values("count", ascending=False)
        )

        if disp in ("summary", "all"):
            # The "  \n" sequences are markdown line breaks so the printed text
            # can be copy-pasted into a markdown cell.
            print("**" + x + "**  \nWalli genes:", len(ortho_wlist), "  \nAll interpro results:", len(ortho_all),
                  "  \nUnique interpro results:", len(ortho_unique), "  ")
        else:
            print(x)

        if disp in ("table", "all"):
            display(HTML(ortho_unique.to_html()))

**All interesting results:**

In [31]:
#ips_summary(interesting, "summary")

Interpret results:
- PANTHER accessions can be searched here http://www.pantherdb.org/panther/
- Gene3D: add the numbers to the end of this link http://www.cathdb.info/version/v4_2_0/superfamily/ eg http://www.cathdb.info/version/v4_2_0/superfamily/2.60.120.330
- Superfamily http://supfam.org/SUPERFAMILY/
- I generally just google the rest.

In [32]:
# Counts plus the per-signature table for every hand-picked orthogroup.
ips_summary(interesting, disp="all")

**OG0000051**  
Walli genes: 80   
All interpro results: 995   
Unique interpro results: 55   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
37861,MobiDBLite,mobidb-lite,consensus disorder prediction,,,233
37876,PANTHER,PTHR24559,-,,,92
37874,PANTHER,PTHR24559:SF290,-,,,91
58337,Coils,Coil,Coil,,,65
37857,Gene3D,G3DSA:3.30.70.270,-,IPR043128,Reverse transcriptase/Diguanylate cyclase domain,38
66704,Pfam,PF03732,Retrotransposon gag protein,IPR005162,Retrotransposon gag domain,26
37878,SUPERFAMILY,SSF56672,DNA/RNA polymerases,IPR043502,DNA/RNA polymerase superfamily,26
66706,SMART,SM00343,c2hcfinal6,IPR001878,"Zinc finger, CCHC-type",24
66708,SUPERFAMILY,SSF57756,Retrovirus zinc finger-like domains,IPR036875,"Zinc finger, CCHC-type superfamily",23
66700,ProSiteProfiles,PS50158,Zinc finger CCHC-type profile.,IPR001878,"Zinc finger, CCHC-type",23


**OG0000107**  
Walli genes: 62   
All interpro results: 660   
Unique interpro results: 33   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
34520,PANTHER,PTHR24559,-,,,67
34515,MobiDBLite,mobidb-lite,consensus disorder prediction,,,64
34523,PANTHER,PTHR24559:SF302,-,,,64
34522,Gene3D,G3DSA:3.30.70.270,-,IPR043128,Reverse transcriptase/Diguanylate cyclase domain,41
34514,SUPERFAMILY,SSF56672,DNA/RNA polymerases,IPR043502,DNA/RNA polymerase superfamily,27
5256,Gene3D,G3DSA:3.30.420.10,-,IPR036397,Ribonuclease H superfamily,25
18221,Pfam,PF03732,Retrotransposon gag protein,IPR005162,Retrotransposon gag domain,25
34519,Gene3D,G3DSA:3.10.20.370,-,,,23
58006,Gene3D,G3DSA:3.10.10.10,HIV Type 1 Reverse Transcriptase,,,23
34517,Coils,Coil,Coil,,,23


**OG0000029**  
Walli genes: 49   
All interpro results: 391   
Unique interpro results: 21   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
71047,PRINTS,PR00364,Disease resistance protein signature,,,61
25035,PANTHER,PTHR33463,-,,,58
25038,PANTHER,PTHR33463:SF21,-,,,55
25037,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,50
25040,SUPERFAMILY,SSF52058,L domain-like,,,29
25043,Pfam,PF00931,NB-ARC domain,IPR002182,NB-ARC,23
25039,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,IPR027417,P-loop containing nucleoside triphosphate hydrolase,22
60638,Gene3D,G3DSA:3.40.50.300,-,,,20
54777,MobiDBLite,mobidb-lite,consensus disorder prediction,,,18
59350,Coils,Coil,Coil,,,15


**OG0000130**  
Walli genes: 41   
All interpro results: 758   
Unique interpro results: 31   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
21213,SMART,SM00369,LRR_typ_2,IPR003591,"Leucine-rich repeat, typical subtype",157
21205,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,103
61141,ProSiteProfiles,PS51450,Leucine-rich repeat profile.,IPR001611,Leucine-rich repeat,88
50751,PANTHER,PTHR27004,-,,,59
21210,Pfam,PF13855,Leucine rich repeat,IPR001611,Leucine-rich repeat,56
21209,SUPERFAMILY,SSF52058,L domain-like,,,55
64376,PANTHER,PTHR27004:SF112,-,,,36
78024,PRINTS,PR00019,Leucine-rich repeat signature,,,34
21212,PANTHER,PTHR46662,-,,,30
21207,Pfam,PF08263,Leucine rich repeat N-terminal domain,IPR013210,"Leucine-rich repeat-containing N-terminal, plant-type",21


**OG0000170**  
Walli genes: 41   
All interpro results: 243   
Unique interpro results: 10   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
17917,Gene3D,G3DSA:3.40.50.720,-,,,43
17919,PANTHER,PTHR10366,NAD DEPENDENT EPIMERASE/DEHYDRATASE,,,43
17914,SUPERFAMILY,SSF51735,NAD(P)-binding Rossmann-fold domains,IPR036291,NAD(P)-binding domain superfamily,41
17918,PANTHER,PTHR10366:SF563,-,,,38
17915,Pfam,PF01370,NAD dependent epimerase/dehydratase family,IPR001509,NAD-dependent epimerase/dehydratase,37
17916,CDD,cd08958,FR_SDR_e,,,27
130030,MobiDBLite,mobidb-lite,consensus disorder prediction,,,6
133936,PANTHER,PTHR10366:SF611,PROTEIN BRI1-5 ENHANCED 1,,,5
283263,Pfam,PF01073,3-beta hydroxysteroid dehydrogenase/isomerase family,IPR002225,3-beta hydroxysteroid dehydrogenase/isomerase,2
407325,Pfam,PF07993,Male sterility protein,IPR013120,"Male sterility, NAD-binding",1


**OG0000208**  
Walli genes: 38   
All interpro results: 571   
Unique interpro results: 38   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
23220,Gene3D,G3DSA:1.10.510.10,Transferase(Phosphotransferase) domain 1,,,35
23234,SUPERFAMILY,SSF56112,Protein kinase-like (PK-like),IPR011009,Protein kinase-like domain superfamily,34
23230,ProSiteProfiles,PS50011,Protein kinase domain profile.,IPR000719,Protein kinase domain,33
1328,Gene3D,G3DSA:3.30.200.20,Phosphorylase Kinase; domain 1,,,32
23237,Pfam,PF00069,Protein kinase domain,IPR000719,Protein kinase domain,32
23218,PANTHER,PTHR27002:SF507,-,,,31
23231,PANTHER,PTHR27002,-,,,31
1330,CDD,cd01098,PAN_AP_plant,,,28
23219,ProSitePatterns,PS00108,Serine/Threonine protein kinases active-site signature.,IPR008271,"Serine/threonine-protein kinase, active site",28
23224,SMART,SM00220,serkin_6,IPR000719,Protein kinase domain,28


**OG0000357**  
Walli genes: 36   
All interpro results: 303   
Unique interpro results: 20   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
27427,MobiDBLite,mobidb-lite,consensus disorder prediction,,,45
143396,PANTHER,PTHR34676:SF1,-,,,26
143395,PANTHER,PTHR34676,-,,,26
27420,SUPERFAMILY,SSF57756,Retrovirus zinc finger-like domains,IPR036875,"Zinc finger, CCHC-type superfamily",18
143397,Pfam,PF14223,gag-polypeptide of LTR copia-type,,,18
27430,Gene3D,G3DSA:4.10.60.10,-,,,18
27429,ProSiteProfiles,PS50158,Zinc finger CCHC-type profile.,IPR001878,"Zinc finger, CCHC-type",17
27431,SMART,SM00343,c2hcfinal6,IPR001878,"Zinc finger, CCHC-type",17
27425,SUPERFAMILY,SSF53098,Ribonuclease H-like,IPR012337,Ribonuclease H-like superfamily,16
27426,Pfam,PF00098,Zinc knuckle,IPR001878,"Zinc finger, CCHC-type",15


**OG0000339**  
Walli genes: 36   
All interpro results: 806   
Unique interpro results: 6   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
1716,ProSiteProfiles,PS51375,Pentatricopeptide (PPR) repeat profile.,IPR002885,Pentatricopeptide repeat,311
1726,Pfam,PF01535,PPR repeat,IPR002885,Pentatricopeptide repeat,139
1749,PANTHER,PTHR24015,-,,,131
1720,Gene3D,G3DSA:1.25.40.10,-,IPR011990,Tetratricopeptide-like helical domain superfamily,111
1736,TIGRFAM,TIGR00756,PPR: pentatricopeptide repeat domain,IPR002885,Pentatricopeptide repeat,98
1717,Pfam,PF13041,PPR repeat family,IPR002885,Pentatricopeptide repeat,16


**OG0000022**  
Walli genes: 31   
All interpro results: 302   
Unique interpro results: 11   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
32662,PRINTS,PR00724,Carboxypeptidase C serine protease (S10) family signature,IPR001563,"Peptidase S10, serine carboxypeptidase",106
32659,PANTHER,PTHR11802,SERINE PROTEASE FAMILY S10 SERINE CARBOXYPEPTIDASE,IPR001563,"Peptidase S10, serine carboxypeptidase",33
32660,Pfam,PF00450,Serine carboxypeptidase,IPR001563,"Peptidase S10, serine carboxypeptidase",33
32666,SUPERFAMILY,SSF53474,alpha/beta-Hydrolases,IPR029058,Alpha/Beta hydrolase fold,32
32668,Gene3D,G3DSA:3.40.50.1820,-,IPR029058,Alpha/Beta hydrolase fold,32
32661,Gene3D,G3DSA:3.40.50.12670,-,,,26
32669,PANTHER,PTHR11802:SF221,-,,,21
90326,PANTHER,PTHR11802:SF29,SERINE CARBOXYPEPTIDASE-LIKE 11-RELATED,,,12
32667,ProSitePatterns,PS00560,"Serine carboxypeptidases, histidine active site.",IPR033124,"Serine carboxypeptidases, histidine active site",5
130008,Gene3D,G3DSA:3.40.50.11320,-,,,1


**OG0000062**  
Walli genes: 30   
All interpro results: 936   
Unique interpro results: 24   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
2801,ProSiteProfiles,PS51375,Pentatricopeptide (PPR) repeat profile.,IPR002885,Pentatricopeptide repeat,257
2790,TIGRFAM,TIGR00756,PPR: pentatricopeptide repeat domain,IPR002885,Pentatricopeptide repeat,238
2812,Gene3D,G3DSA:1.25.40.10,-,IPR011990,Tetratricopeptide-like helical domain superfamily,122
2822,Pfam,PF13041,PPR repeat family,IPR002885,Pentatricopeptide repeat,94
2800,PANTHER,PTHR46128,-,,,79
2826,Pfam,PF12854,PPR repeat,IPR002885,Pentatricopeptide repeat,45
129709,PANTHER,PTHR46128:SF10,-,,,21
45917,Pfam,PF01535,PPR repeat,IPR002885,Pentatricopeptide repeat,18
2804,SUPERFAMILY,SSF81901,HCP-like,,,14
45924,MobiDBLite,mobidb-lite,consensus disorder prediction,,,8


**OG0000111**  
Walli genes: 28   
All interpro results: 460   
Unique interpro results: 23   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
28718,PRINTS,PR00364,Disease resistance protein signature,,,60
28715,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,57
28713,PANTHER,PTHR11017,LEUCINE-RICH REPEAT-CONTAINING PROTEIN,,,53
28727,PANTHER,PTHR11017:SF298,-,,,47
28722,SUPERFAMILY,SSF52058,L domain-like,,,29
28724,Gene3D,G3DSA:3.40.50.10140,-,IPR035897,Toll/interleukin-1 receptor homology (TIR) domain superfamily,26
28726,Pfam,PF01582,TIR domain,IPR000157,Toll/interleukin-1 receptor homology (TIR) domain,24
28725,SUPERFAMILY,SSF52200,Toll/Interleukin receptor TIR domain,IPR035897,Toll/interleukin-1 receptor homology (TIR) domain superfamily,24
28732,SMART,SM00255,till_3,IPR000157,Toll/interleukin-1 receptor homology (TIR) domain,22
28712,ProSiteProfiles,PS50104,TIR domain profile.,IPR000157,Toll/interleukin-1 receptor homology (TIR) domain,22


**OG0000647**  
Walli genes: 28   
All interpro results: 388   
Unique interpro results: 27   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
14399,PRINTS,PR00364,Disease resistance protein signature,,,70
14397,PANTHER,PTHR11017,LEUCINE-RICH REPEAT-CONTAINING PROTEIN,,,34
68262,ProSiteProfiles,PS51450,Leucine-rich repeat profile.,IPR001611,Leucine-rich repeat,33
14404,PANTHER,PTHR11017:SF301,-,,,31
68275,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,25
14403,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,IPR027417,P-loop containing nucleoside triphosphate hydrolase,22
14396,Gene3D,G3DSA:3.40.50.300,-,,,21
68264,SUPERFAMILY,SSF52058,L domain-like,,,19
14395,Pfam,PF00931,NB-ARC domain,IPR002182,NB-ARC,18
14402,Gene3D,G3DSA:3.40.50.10140,-,IPR035897,Toll/interleukin-1 receptor homology (TIR) domain superfamily,18


**OG0000082**  
Walli genes: 26   
All interpro results: 157   
Unique interpro results: 6   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
3,Gene3D,G3DSA:3.30.559.10,Chloramphenicol Acetyltransferase,IPR023213,Chloramphenicol acetyltransferase-like domain superfamily,50
2,Pfam,PF02458,Transferase family,IPR003480,Transferase,35
5,PANTHER,PTHR31623,-,,,35
6,PANTHER,PTHR31623:SF18,-,,,33
185113,Coils,Coil,Coil,,,3
509849,PANTHER,PTHR31623:SF6,-,,,1


**OG0000019**  
Walli genes: 25   
All interpro results: 244   
Unique interpro results: 20   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
84613,PANTHER,PTHR27009,-,,,41
84618,PANTHER,PTHR27009:SF111,-,,,26
84615,Gene3D,G3DSA:1.10.510.10,Transferase(Phosphotransferase) domain 1,,,26
84617,SUPERFAMILY,SSF56112,Protein kinase-like (PK-like),IPR011009,Protein kinase-like domain superfamily,22
84614,ProSiteProfiles,PS50011,Protein kinase domain profile.,IPR000719,Protein kinase domain,21
84612,Pfam,PF00069,Protein kinase domain,IPR000719,Protein kinase domain,17
84621,Gene3D,G3DSA:3.30.200.20,Phosphorylase Kinase; domain 1,,,16
84611,SMART,SM00220,serkin_6,IPR000719,Protein kinase domain,15
84610,ProSitePatterns,PS00108,Serine/Threonine protein kinases active-site signature.,IPR008271,"Serine/threonine-protein kinase, active site",15
107972,PANTHER,PTHR27009:SF124,-,,,12


**OG0000038**  
Walli genes: 25   
All interpro results: 93   
Unique interpro results: 7   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
64626,Pfam,PF03087,Arabidopsis protein of unknown function,IPR004320,"Protein of unknown function DUF241, plant",25
64624,Coils,Coil,Coil,,,18
87731,PANTHER,PTHR31509,-,,,17
87732,PANTHER,PTHR31509:SF71,-,,,17
64623,PANTHER,PTHR33070,-,,,8
64627,PANTHER,PTHR33070:SF3,EXPRESSED PROTEIN,,,7
482677,PANTHER,PTHR33070:SF75,SELECTION/UPKEEP OF INTRAEPITHELIAL T-CELLS PROTEIN,,,1


**OG0001315**  
Walli genes: 22   
All interpro results: 144   
Unique interpro results: 18   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
20110,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,20
44840,PANTHER,PTHR32212,-,,,19
44834,SUPERFAMILY,SSF81383,F-box domain,IPR036047,F-box-like domain superfamily,16
44836,ProSiteProfiles,PS50181,F-box domain profile.,IPR001810,F-box domain,16
44838,Pfam,PF00646,F-box domain,IPR001810,F-box domain,15
44837,PANTHER,PTHR32212:SF260,-,,,12
44841,SMART,SM00256,fbox_2,IPR001810,F-box domain,11
44835,SUPERFAMILY,SSF52058,L domain-like,,,10
319709,PANTHER,PTHR32212:SF288,-,,,7
20105,SUPERFAMILY,SSF52047,RNI-like,,,7


**OG0000211**  
Walli genes: 22   
All interpro results: 186   
Unique interpro results: 6   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
42801,SMART,SM00733,mt_12,IPR003690,"Transcription termination factor, mitochondrial/chloroplastic",87
42799,Pfam,PF02536,mTERF,IPR003690,"Transcription termination factor, mitochondrial/chloroplastic",27
42805,PANTHER,PTHR13068,CGI-12 PROTEIN-RELATED,,,25
42800,PANTHER,PTHR13068:SF127,-,,,23
42798,Gene3D,G3DSA:1.25.70.10,-,IPR038538,"MTERF superfamily, mitochondrial/chloroplastic",22
517552,PANTHER,PTHR13068:SF138,-,,,2


**OG0000190**  
Walli genes: 21   
All interpro results: 502   
Unique interpro results: 31   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
28545,PANTHER,PTHR32444:SF41,-,,,23
28546,PANTHER,PTHR32444,-,,,23
28555,Pfam,PF08276,PAN-like domain,IPR003609,PAN/Apple domain,22
28558,ProSiteProfiles,PS50927,Bulb-type lectin domain profile.,IPR001480,Bulb-type lectin domain,22
28559,CDD,cd00028,B_lectin,IPR001480,Bulb-type lectin domain,22
28541,Gene3D,G3DSA:3.30.200.20,Phosphorylase Kinase; domain 1,,,22
28552,Gene3D,G3DSA:2.90.10.10,Agglutinin,IPR036426,Bulb-type lectin domain superfamily,22
28550,Pfam,PF00954,S-locus glycoprotein domain,IPR000858,S-locus glycoprotein domain,22
28557,CDD,cd01098,PAN_AP_plant,,,22
28547,SMART,SM00108,blect_4,IPR001480,Bulb-type lectin domain,22


**OG0000125**  
Walli genes: 21   
All interpro results: 123   
Unique interpro results: 8   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
23004,MobiDBLite,mobidb-lite,consensus disorder prediction,,,57
23005,PANTHER,PTHR12663,ANDROGEN INDUCED INHIBITOR OF PROLIFERATION AS3 / PDS5-RELATED,IPR039776,Sister chromatid cohesion protein Pds5,25
23008,PANTHER,PTHR12663:SF3,TRANSCRIPTIONAL REGULATOR,,,12
23006,SUPERFAMILY,SSF48371,ARM repeat,IPR016024,Armadillo-type fold,10
243390,PANTHER,PTHR12663:SF25,TUDOR/PWWP/MBT SUPERFAMILY PROTEIN,,,10
103840,SUPERFAMILY,SSF63748,Tudor/PWWP/MBT,,,4
103849,PANTHER,PTHR12663:SF21,-,,,3
247536,Gene3D,G3DSA:2.30.30.140,-,,,2


## Protein Domains
**OG0000051** - seems likely to be TE  
Walli genes: 80   
All interpro results: 995   
Unique interpro results: 55  
- two most popular Panther results are "viral or transposable element protein"
- viral keeps coming up (as well as the ones that say it there, http://www.cathdb.info/version/latest/superfamily/3.30.70.270 for Gene3D)  

**OG0000107** - seems likely to be TE  
Walli genes: 62   
All interpro results: 660   
Unique interpro results: 33  
- same reasons as the one above 

**OG0000029** - disease resistance  
Walli genes: 49   
All interpro results: 391   
Unique interpro results: 21  
- PRINTS result is for disease resistance (hard to find more info on, though)
- PTHR33463:SF21 can't be found??? but PTHR33463 is a family with 973 genes, some of the subfamilies being disease resistance http://www.pantherdb.org/list/list.do?numPerPage=100&save=yes&searchModType=numperpage&listType=6

**OG0000130** - Strong support for a Leucine-rich repeat, but that doesn't seem to be informative in and of itself  
Walli genes: 41   
All interpro results: 758   
Unique interpro results: 31  
- panther results http://www.pantherdb.org/panther/family.do?clsAccession=PTHR27004:SF112,  http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&listType=1&fieldValue=PTHR27004&organism=all are receptors and/or leucine rich http://www.pantherdb.org/panther/familyList.do?searchType=basic&fieldName=all&organism=all&listType=6&fieldValue=PTHR46662

**OG0000170** - NAD dependent something  
Walli genes: 41   
All interpro results: 243   
Unique interpro results: 10  
- Gene3D: http://www.cathdb.info/version/latest/superfamily/3.40.50.720
- Panther: http://www.pantherdb.org/panther/family.do?clsAccession=PTHR10366:SF563
- CDD: https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=cd08958 - appears to act in the reduction of flavonoids (more info in link)

**OG0000208** - Protein Kinases  
Walli genes: 38   
All interpro results: 571   
Unique interpro results: 38  
- Panther is also Protein Kinases

**OG0000357** - Really uncertain  
Walli genes: 36   
All interpro results: 303   
Unique interpro results: 20   
- Panther is Zinc finger

**OG0000339** - Pentatricopeptide repeat  
Walli genes: 36   
All interpro results: 806   
Unique interpro results: 6  
- RNA level gene expression regulation (https://www.sciencedirect.com/science/article/pii/S030090841500108X)  

**OG0000022** - Peptidase S10, serine carboxypeptidase   
Walli genes: 31   
All interpro results: 302   
Unique interpro results: 11  
- Catalytic something or other? (https://www.ebi.ac.uk/interpro/entry/InterPro/IPR001563/)

**OG0000062** - Pentatricopeptide (PPR) repeat profile  
Walli genes: 30    
All interpro results: 936   
Unique interpro results: 24  
- "Most of PPR proteins have roles in mitochondria or plastid" (https://prosite.expasy.org/PDOC51375)  

**OG0000111** - Maybe disease resistance?   
Walli genes: 28    
All interpro results: 460   
Unique interpro results: 23  
- Seems to be a mixture of results I've already had?
    - PRINTS reports disease resistance
    - Gene3D, Panther, others report Leucine-rich repeat domain
    - Toll/interleukin results seem to be anti bacterial/fungal (incl in plant cytoplasmic host defence https://www.ebi.ac.uk/interpro/entry/InterPro/IPR000157/)

**OG0000647** - very similar to the one above  
Walli genes: 28   
All interpro results: 388   
Unique interpro results: 27  
- might actually be the same results (with different counts)
- if they're so similar, why are they two different orthogroups?

**OG0000082** -   
Walli genes: 26   
All interpro results: 157   
Unique interpro results: 6  
- Gene3D appears to be antibiotic resistance? which isn't useful surely (https://www.ebi.ac.uk/interpro/entry/InterPro/IPR023213/)
- seems to be a big pfam family: https://pfam.xfam.org/family/Transferase
- Panther: http://www.pantherdb.org/panther/family.do?clsAccession=PTHR31623:SF18

**OG0000019** - Protein kinases, maybe rust resistant?  
Walli genes: 25   
All interpro results: 244   
Unique interpro results: 20  
- http://www.pantherdb.org/panther/family.do?clsAccession=PTHR27009: RUST RESISTANCE KINASE LR10-RELATED
    - cannot find how it is rust resistant??
    
**OG0000038** - Very uncertain    
Walli genes: 25   
All interpro results: 93   
Unique interpro results: 7  
- Panther has it as a BPS1-LIKE PROTEIN (http://www.pantherdb.org/panther/family.do?clsAccession=PTHR31509), which is described here: https://www.ncbi.nlm.nih.gov/gene?Db=gene&Cmd=DetailsSearch&Term=839536 as inhibiting auxin signalling
- and also as this other unknown rice gene? http://www.pantherdb.org/panther/family.do?clsAccession=PTHR33070; https://www.uniprot.org/uniprot/Q5Z978

**OG0001315** - interacts with other proteins?  
Walli genes: 22   
All interpro results: 144   
Unique interpro results: 18  
- F-box domain and Leucine rich domain
    - "F-box domains commonly exist in proteins in concert with other protein–protein interaction motifs such as leucine-rich repeats" at least my result matches wikipedia (https://en.wikipedia.org/wiki/F-box_protein)?
    - so seems to suggest protein-protein interactions, although of an unknown sort
- idk if the FAD binding domain matches are useful because they are pretty infrequent

**OG0000211** - Mitochondria/chloroplast  
Walli genes: 22   
All interpro results: 186   
Unique interpro results: 6  
- seems smooth match? 
- although the panther result just seems to be unknown: http://www.pantherdb.org/panther/family.do?clsAccession=PTHR13068

**OG0000190** -    
Walli genes: 21   
All interpro results: 502   
Unique interpro results: 31  
- Panther domain is unclear
- I really don't understand what the PAN/Apple domain is: https://www.ebi.ac.uk/interpro/entry/InterPro/IPR003609/
- for the bulb-lectin one: https://prosite.expasy.org/doc/PS50927
- the fact that there is almost exactly the same count of a bunch of different things makes me sure they are all related results. I just don't know what they mean.

**OG0000125** -   
Walli genes: 21  
All interpro results: 123   
Unique interpro results: 8  
- Panther result suggests DNA repair and chromatin-chromatin binding (http://www.pantherdb.org/panther/family.do?clsAccession=PTHR12663)
- the TUDOR/PWWP/MBT SUPERFAMILY PROTEIN stuff also seems to be about chromatid cohesion (https://www.uniprot.org/uniprot/A8MRD9-1 - couldn't find the exact right one)
- haven't managed to figure out what an ARM repeat is (http://supfam.org/SUPERFAMILY/cgi-bin/scop.cgi?ipid=SSF48371)

### Checking for more

Looking to see if there are any more interesting results I haven't seen yet

In [33]:
# Fraction of each orthogroup's genes that belong to W_ceracea.
ortho_count_df["prop"] = ortho_count_df["W_ceracea"].div(ortho_count_df["Total"])

In [34]:
# Hand-picked orthogroups ranked by their W_ceracea proportion, highest first.
ortho_count_df.loc[interesting, :].sort_values(by="prop", ascending=False)

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
OG0000107,1,2,1,0,0,0,6,62,72,0.861111
OG0000647,1,1,1,2,0,0,0,28,33,0.848485
OG0001315,0,0,0,1,1,1,1,22,26,0.846154
OG0000051,1,0,1,4,0,1,8,80,95,0.842105
OG0000357,1,3,0,1,0,0,2,36,43,0.837209
OG0000339,1,1,1,1,2,1,1,36,44,0.818182
OG0000208,0,1,1,2,2,6,4,38,54,0.703704
OG0000170,1,3,2,5,3,2,2,41,59,0.694915
OG0000130,1,1,1,1,4,2,16,41,67,0.61194
OG0000029,0,18,26,16,0,4,3,49,116,0.422414


In [35]:
# Every orthogroup that is not already in the hand-picked list.
not_yet_df = ortho_count_df.drop(index=interesting, errors="ignore")

In [36]:
# Skip orthogroups made up only of W_ceracea genes (prop == 1), rank the rest by
# proportion (highest first) and show rows 420-449 of that ranking.
not_yet_df.loc[not_yet_df["prop"].ne(1.0)].sort_values(by="prop", ascending=False).iloc[420:450]

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
OG0022217,0,0,1,0,0,0,0,2,3,0.666667
OG0020384,0,0,0,0,1,0,0,2,3,0.666667
OG0022219,0,0,1,0,0,0,0,2,3,0.666667
OG0022221,0,0,1,0,0,0,0,2,3,0.666667
OG0019983,0,0,0,1,0,0,0,2,3,0.666667
OG0016768,0,1,1,0,0,0,0,4,6,0.666667
OG0000724,1,1,1,2,1,3,2,21,32,0.65625
OG0003558,2,0,0,2,1,0,1,11,17,0.647059
OG0003647,0,1,1,1,1,1,1,11,17,0.647059
OG0003644,0,0,0,1,3,1,1,11,17,0.647059


In [37]:
# Second hand-picked set, chosen from
#     not_yet_df[not_yet_df["prop"] != 1.0].sort_values("prop", ascending=False)
# with not_yet_df = ortho_count_df[~ortho_count_df.index.isin(interesting)].
# Selection criteria:
#   * genes from at least half of the non-W_ceracea species, and
#   * W_ceracea accounts for a proportion > 0.625 of the orthogroup's genes.
interesting_2 = [
    "OG0001651", "OG0004401", "OG0003049", "OG0001024",
    "OG0002516", "OG0001842", "OG0001075", "OG0002249",
    "OG0003266", "OG0004831", "OG0000724", "OG0003558",
    "OG0003647", "OG0003644", "OG0000769", "OG0005358",
    "OG0001077", "OG0002833", "OG0002836",
]

In [38]:
# InterProScan summary for every second-round orthogroup, using the "all" mode
# of ips_summary (defined outside this section of the notebook).
ips_summary(interesting_2, "all")

**OG0001651**  
Walli genes: 19   
All interpro results: 170   
Unique interpro results: 21   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
3552,SMART,SM00248,ANK_2a,IPR002110,Ankyrin repeat,52
3550,SUPERFAMILY,SSF48403,Ankyrin repeat,IPR036770,Ankyrin repeat-containing domain superfamily,17
3557,ProSiteProfiles,PS50297,Ankyrin repeat region circular profile.,IPR020683,Ankyrin repeat-containing domain,16
3549,PANTHER,PTHR24177,-,,,15
3559,Gene3D,G3DSA:1.25.40.20,-,IPR036770,Ankyrin repeat-containing domain superfamily,12
3556,Pfam,PF12796,Ankyrin repeats (3 copies),IPR020683,Ankyrin repeat-containing domain,11
3558,Pfam,PF13962,Domain of unknown function,IPR026961,PGG domain,11
37319,PANTHER,PTHR24177:SF239,-,,,7
3551,PANTHER,PTHR24177:SF180,-,,,6
37318,Pfam,PF14244,gag-polypeptide of LTR copia-type,IPR029472,"Retrotransposon Copia-like, N-terminal",5


**OG0004401**  
Walli genes: 11   
All interpro results: 58   
Unique interpro results: 13   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
22042,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,IPR027417,P-loop containing nucleoside triphosphate hydrolase,10
22043,Gene3D,G3DSA:3.40.50.300,-,,,9
22044,PANTHER,PTHR19241,ATP-BINDING CASSETTE TRANSPORTER,,,9
22040,PANTHER,PTHR19241:SF605,-,,,7
22041,Pfam,PF00005,ABC transporter,IPR003439,ABC transporter-like,5
43275,Gene3D,G3DSA:1.10.580.10,Citrate Synthase,IPR016142,"Citrate synthase-like, large alpha subdomain",5
43281,SUPERFAMILY,SSF48256,Citrate synthase,IPR036969,Citrate synthase superfamily,5
148016,PANTHER,PTHR11739,CITRATE SYNTHASE,IPR002020,Citrate synthase,2
235239,Coils,Coil,Coil,,,2
121025,PANTHER,PTHR19241:SF564,-,,,1


**OG0003049**  
Walli genes: 13   
All interpro results: 79   
Unique interpro results: 9   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
214700,Gene3D,G3DSA:2.60.120.330,-,IPR027443,Isopenicillin N synthase-like,14
214701,PANTHER,PTHR10209,"OXIDOREDUCTASE, 2OG-FE II OXYGENASE FAMILY PROTEIN",,,14
214698,SUPERFAMILY,SSF51197,Clavaminate synthase-like,,,13
214699,PANTHER,PTHR10209:SF687,-,,,11
222647,Pfam,PF03171,2OG-Fe(II) oxygenase superfamily,IPR005123,Oxoglutarate/iron-dependent dioxygenase,9
222649,ProSiteProfiles,PS51471,Fe(2+) 2-oxoglutarate dioxygenase domain profile.,IPR005123,Oxoglutarate/iron-dependent dioxygenase,8
222652,Pfam,PF14226,non-haem dioxygenase in morphine synthesis N-terminal,IPR026992,Non-haem dioxygenase N-terminal domain,7
697221,PANTHER,PTHR10209:SF451,-,,,2
410550,PANTHER,PTHR10209:SF508,-,,,1


**OG0001024**  
Walli genes: 20   
All interpro results: 100   
Unique interpro results: 14   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
6341,Coils,Coil,Coil,,,28
6342,PANTHER,PTHR13815,GOLGIN-84,IPR019177,Golgin subfamily A member 5,22
6344,PANTHER,PTHR13815:SF7,GOLGIN CANDIDATE 1,,,22
6345,MobiDBLite,mobidb-lite,consensus disorder prediction,,,15
36604,Pfam,PF09787,Golgin subfamily A member 5,IPR019177,Golgin subfamily A member 5,4
446471,PANTHER,PTHR43671,-,,,1
446472,Pfam,PF00069,Protein kinase domain,IPR000719,Protein kinase domain,1
446473,ProSitePatterns,PS00108,Serine/Threonine protein kinases active-site signature.,IPR008271,"Serine/threonine-protein kinase, active site",1
446474,Gene3D,G3DSA:3.30.200.20,Phosphorylase Kinase; domain 1,,,1
446475,PANTHER,PTHR43671:SF49,-,,,1


**OG0002516**  
Walli genes: 14   
All interpro results: 76   
Unique interpro results: 11   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
20798,MobiDBLite,mobidb-lite,consensus disorder prediction,,,32
65909,PANTHER,PTHR46033,-,,,10
65910,PANTHER,PTHR46033:SF16,"AMINOTRANSFERASE-LIKE, MOBILE DOMAIN PROTEIN-RELATED",,,10
108595,Pfam,PF10536,Plant mobile domain,IPR019557,"Aminotransferase-like, plant mobile domain",7
108592,Coils,Coil,Coil,,,4
282365,PANTHER,PTHR11764,TERPENE CYCLASE/MUTASE FAMILY MEMBER,,,3
282367,SUPERFAMILY,SSF48239,Terpenoid cyclases/Protein prenyltransferases,IPR008930,Terpenoid cyclases/protein prenyltransferase alpha-alpha toroid,3
311949,Gene3D,G3DSA:1.50.10.20,-,,,2
311951,Pfam,PF13243,Squalene-hopene cyclase C-terminal domain,IPR032696,"Squalene cyclase, C-terminal",2
311952,PANTHER,PTHR11764:SF48,-,,,2


**OG0001842**  
Walli genes: 16   
All interpro results: 95   
Unique interpro results: 12   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
25307,PANTHER,PTHR45648:SF27,-,,,17
25309,PANTHER,PTHR45648,-,,,17
25308,Gene3D,G3DSA:3.40.50.1110,-,IPR036514,SGNH hydrolase superfamily,16
25306,Pfam,PF00657,GDSL-like Lipase/Acylhydrolase,IPR001087,GDSL lipase/esterase,15
25310,CDD,cd01837,SGNH_plant_lipase_like,IPR035669,"GDSL lipase/esterase-like, plant",10
25305,SUPERFAMILY,SSF52266,SGNH hydrolase,,,4
296182,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,IPR027417,P-loop containing nucleoside triphosphate hydrolase,4
296183,SUPERFAMILY,SSF46785,Winged helix DNA-binding domain,IPR036390,Winged helix DNA-binding domain superfamily,3
375462,PRINTS,PR00364,Disease resistance protein signature,,,3
375460,Gene3D,G3DSA:1.10.8.430,-,,,2


**OG0001075**  
Walli genes: 19   
All interpro results: 52   
Unique interpro results: 3   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
21887,MobiDBLite,mobidb-lite,consensus disorder prediction,,,25
21889,PANTHER,PTHR35468,-,,,18
54791,Coils,Coil,Coil,,,9


**OG0002249**  
Walli genes: 14   
All interpro results: 26   
Unique interpro results: 10   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
206536,MobiDBLite,mobidb-lite,consensus disorder prediction,,,4
599118,PANTHER,PTHR10698,V-TYPE PROTON ATPASE SUBUNIT H,IPR004908,"ATPase, V1 complex, subunit H",4
599120,PANTHER,PTHR10698:SF2,-,,,4
599122,Pfam,PF03224,V-ATPase subunit H,IPR004908,"ATPase, V1 complex, subunit H",4
599117,SUPERFAMILY,SSF48371,ARM repeat,IPR016024,Armadillo-type fold,3
599115,Pfam,PF11698,V-ATPase subunit H,IPR011987,"ATPase, V1 complex, subunit H, C-terminal",2
599116,Gene3D,G3DSA:1.25.10.10,-,IPR011989,Armadillo-like helical,2
118395,PANTHER,PTHR10868,SIGMA 1-TYPE OPIOID RECEPTOR-RELATED,IPR006716,ERG2/sigma1 receptor-like,1
118396,Pfam,PF04622,ERG2 and Sigma1 receptor like protein,IPR006716,ERG2/sigma1 receptor-like,1
638100,Gene3D,G3DSA:1.25.40.150,-,IPR038497,"ATPase, V1 complex, subunit H, C-terminal domain superfamily",1


**OG0003266**  
Walli genes: 12   
All interpro results: 99   
Unique interpro results: 8   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
18539,Gene3D,G3DSA:2.40.70.10,Acid Proteases,IPR021109,Aspartic peptidase domain superfamily,22
18540,PANTHER,PTHR13683,ASPARTYL PROTEASES,IPR001461,Aspartic peptidase A1 family,14
18543,PANTHER,PTHR13683:SF399,-,,,14
18541,SUPERFAMILY,SSF50630,Acid proteases,IPR021109,Aspartic peptidase domain superfamily,12
18542,Pfam,PF14543,Xylanase inhibitor N-terminal,IPR032861,"Xylanase inhibitor, N-terminal",12
25362,Pfam,PF14541,Xylanase inhibitor C-terminal,IPR032799,"Xylanase inhibitor, C-terminal",10
25364,ProSiteProfiles,PS51767,Peptidase family A1 domain profile.,IPR033121,Peptidase family A1 domain,9
68503,CDD,cd05489,xylanase_inhibitor_I_like,IPR033868,Xylanase inhibitor I-like,6


**OG0004831**  
Walli genes: 10   
All interpro results: 124   
Unique interpro results: 8   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
12143,ProSiteProfiles,PS51375,Pentatricopeptide (PPR) repeat profile.,IPR002885,Pentatricopeptide repeat,46
12151,TIGRFAM,TIGR00756,PPR: pentatricopeptide repeat domain,IPR002885,Pentatricopeptide repeat,20
12144,Pfam,PF01535,PPR repeat,IPR002885,Pentatricopeptide repeat,17
12149,Gene3D,G3DSA:1.25.40.10,-,IPR011990,Tetratricopeptide-like helical domain superfamily,15
12165,PANTHER,PTHR24015,-,,,13
12164,Pfam,PF13041,PPR repeat family,IPR002885,Pentatricopeptide repeat,9
186866,Pfam,PF14432,DYW family of nucleic acid deaminases,IPR032867,DYW domain,3
158155,Pfam,PF12854,PPR repeat,IPR002885,Pentatricopeptide repeat,1


**OG0000724**  
Walli genes: 21   
All interpro results: 147   
Unique interpro results: 10   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
4483,PANTHER,PTHR32246:SF69,CALCIUM-DEPENDENT LIPID-BINDING (CALB DOMAIN) FAMILY PROTEIN,,,21
4487,PANTHER,PTHR32246,-,,,21
80812,MobiDBLite,mobidb-lite,consensus disorder prediction,,,17
4484,Gene3D,G3DSA:2.60.40.150,-,IPR035892,C2 domain superfamily,16
4485,Pfam,PF00168,C2 domain,IPR000008,C2 domain,16
4486,SUPERFAMILY,SSF49562,"C2 domain (Calcium/lipid-binding domain, CaLB)",,,16
4489,ProSiteProfiles,PS50004,C2 domain profile.,IPR000008,C2 domain,15
4488,SMART,SM00239,C2_3c,IPR000008,C2 domain,14
122976,CDD,cd04051,C2_SRC2_like,,,10
263720,Coils,Coil,Coil,,,1


**OG0003558**  
Walli genes: 11   
All interpro results: 64   
Unique interpro results: 12   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
481876,PRINTS,PR00738,Glycosyl hydrolase family 20 signature,IPR025705,Beta-hexosaminidase,12
89908,Pfam,PF07899,Frigida-like protein,IPR012474,Frigida-like,9
89907,PANTHER,PTHR31791:SF47,FRIGIDA-LIKE PROTEIN 1,,,8
89909,PANTHER,PTHR31791,-,,,8
171674,Coils,Coil,Coil,,,8
171676,MobiDBLite,mobidb-lite,consensus disorder prediction,,,3
481875,Gene3D,G3DSA:3.20.20.80,Glycosidases,,,3
481880,PANTHER,PTHR22600,BETA-HEXOSAMINIDASE,,,3
481881,SUPERFAMILY,SSF51445,(Trans)glycosidases,IPR017853,Glycoside hydrolase superfamily,3
481882,Pfam,PF00728,"Glycosyl hydrolase family 20, catalytic domain",IPR015883,"Glycoside hydrolase family 20, catalytic domain",3


**OG0003647**  
Walli genes: 11   
All interpro results: 100   
Unique interpro results: 6   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
189101,ProSiteProfiles,PS51375,Pentatricopeptide (PPR) repeat profile.,IPR002885,Pentatricopeptide repeat,29
92712,Pfam,PF01535,PPR repeat,IPR002885,Pentatricopeptide repeat,20
92716,TIGRFAM,TIGR00756,PPR: pentatricopeptide repeat domain,IPR002885,Pentatricopeptide repeat,17
92717,Gene3D,G3DSA:1.25.40.10,-,IPR011990,Tetratricopeptide-like helical domain superfamily,15
92715,PANTHER,PTHR24015,-,,,14
189102,Pfam,PF13041,PPR repeat family,IPR002885,Pentatricopeptide repeat,5


**OG0003644**  
Walli genes: 11   
All interpro results: 20   
Unique interpro results: 3   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
103832,PANTHER,PTHR33484,-,,,9
103833,PANTHER,PTHR33484:SF3,-,,,7
501895,MobiDBLite,mobidb-lite,consensus disorder prediction,,,4


**OG0000769**  
Walli genes: 20   
All interpro results: 49   
Unique interpro results: 3   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
4045,PANTHER,PTHR35121,-,,,20
4046,PANTHER,PTHR35121:SF4,-,,,20
131473,MobiDBLite,mobidb-lite,consensus disorder prediction,,,9


**OG0005358**  
Walli genes: 9   
All interpro results: 116   
Unique interpro results: 11   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
30236,Gene3D,G3DSA:2.40.330.10,-,IPR015300,DNA-binding pseudobarrel domain superfamily,14
30240,PANTHER,PTHR31391,-,,,14
30243,ProSiteProfiles,PS50863,B3 DNA-binding domain profile.,IPR003340,B3 DNA binding domain,14
30246,SUPERFAMILY,SSF101936,DNA-binding pseudobarrel domain,IPR015300,DNA-binding pseudobarrel domain superfamily,14
30247,Pfam,PF02362,B3 DNA binding domain,IPR003340,B3 DNA binding domain,13
30237,CDD,cd10017,B3_DNA,IPR003340,B3 DNA binding domain,12
30239,PANTHER,PTHR31391:SF75,B3 DOMAIN-CONTAINING PROTEIN REM16,,,12
30252,SMART,SM01019,B3_2,IPR003340,B3 DNA binding domain,11
30238,MobiDBLite,mobidb-lite,consensus disorder prediction,,,10
470114,PANTHER,PTHR31391:SF2,-,,,1


**OG0001077**  
Walli genes: 18   
All interpro results: 117   
Unique interpro results: 6   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
114272,Gene3D,G3DSA:3.40.50.2000,Glycogen Phosphorylase B;,,,36
114268,CDD,cd03784,GT1_Gtf-like,,,18
114273,SUPERFAMILY,SSF53756,UDP-Glycosyltransferase/glycogen phosphorylase,,,18
114269,PANTHER,PTHR11926,GLUCOSYL/GLUCURONOSYL TRANSFERASES,,,15
114270,Pfam,PF00201,UDP-glucoronosyl and UDP-glucosyl transferase,IPR002213,UDP-glucuronosyl/UDP-glucosyltransferase,15
114271,ProSitePatterns,PS00375,UDP-glycosyltransferases signature.,IPR035595,"UDP-glycosyltransferase family, conserved site",15


**OG0002833**  
Walli genes: 12   
All interpro results: 99   
Unique interpro results: 20   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
163694,Gene3D,G3DSA:1.10.472.10,-,,,21
163695,PANTHER,PTHR10177:SF425,CYCLIN-J18,,,12
163696,PANTHER,PTHR10177,CYCLINE,IPR039361,Cyclin,12
172235,Pfam,PF00134,"Cyclin, N-terminal domain",IPR006671,"Cyclin, N-terminal",9
172236,SUPERFAMILY,SSF47954,Cyclin-like,IPR036915,Cyclin-like superfamily,9
279578,ProSiteProfiles,PS51450,Leucine-rich repeat profile.,IPR001611,Leucine-rich repeat,9
279599,SMART,SM00369,LRR_typ_2,IPR003591,"Leucine-rich repeat, typical subtype",8
279609,PANTHER,PTHR27000:SF660,-,,,3
279614,PANTHER,PTHR27000,-,,,3
279584,PRINTS,PR00019,Leucine-rich repeat signature,,,2


**OG0002836**  
Walli genes: 12   
All interpro results: 24   
Unique interpro results: 3   


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
66164,PANTHER,PTHR36485,-,,,12
66163,Pfam,PF15159,Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y,IPR029164,Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y,9
339384,PANTHER,PTHR36485:SF1,-,,,3


## Protein Domains (second set)

**OG0001651** - Ankyrin repeat: protein-protein interaction motif
Walli genes: 19   
All interpro results: 170   
Unique interpro results: 21   
- Ankyrin repeat-containing domain is a popular conclusion (https://www.ebi.ac.uk/interpro/entry/InterPro/IPR020683/)
- with a couple of retrotransposon matches

**OG0004401**  
Walli genes: 11   
All interpro results: 58   
Unique interpro results: 13   
- SUPERFAMILY and Gene3D have P-loop containing nucleoside triphosphate hydrolase: https://www.ebi.ac.uk/interpro/entry/InterPro/IPR027417/ ; http://www.cathdb.info/version/v4_2_0/superfamily/3.40.50.300
- Panther and Pfam have it as a transporter, though when you look at the GO terms from Panther they also have it as a response to stress
- then at a second level a lot of them have Citrate Synthase

**OG0003049**  
Walli genes: 13   
All interpro results: 79   
Unique interpro results: 9   
- Gene3D has it as an antibiotic? http://www.cathdb.info/version/v4_2_0/superfamily/2.60.120.330
- but all the rest seem to just be 2OG-Fe(II) oxygenase superfamily	- apparently the families overlap? http://www.ebi.ac.uk/interpro/entry/InterPro/IPR026992/

**OG0001024**  
Walli genes: 20   
All interpro results: 100   
Unique interpro results: 14   
- Seems very definitely a GOLGIN, which seems to be about maintaining the Golgi apparatus https://www.uniprot.org/uniprot/Q7SXE4
- apparently that explains all the coils? https://en.wikipedia.org/wiki/GOLGA5

**OG0002516** - Probably a TE?  
Walli genes: 14   
All interpro results: 76   
Unique interpro results: 11   
- Panther: SERINE/THREONINE-PROTEIN PHOSPHATASE 7 LONG FORM-LIKE PROTEIN (PTHR46033:SF16) http://www.pantherdb.org/panther/family.do?clsAccession=PTHR46033:SF16
    - interestingly the label they have in the signature description belongs to PTHR46033:SF20 http://www.pantherdb.org/panther/family.do?clsAccession=PTHR46033:SF20 and is a TE
- pfam has it as transposable http://pfam.xfam.org/family/PF10536

**OG0001842** - Lipase  
Walli genes: 16   
All interpro results: 95   
Unique interpro results: 12   
- Panther: GDSL LIPASE/ACYLHYDROLASE FAMILY PROTEIN (AFU_ORTHOLOGUE AFUA_4G14700) (PTHR45648)

**OG0001075**  
Walli genes: 19   
All interpro results: 52   
Unique interpro results: 3   
- MYOSIN-LIKE PROTEIN (PTHR35468): Myosins are motor proteins that make up muscles in animals.

**OG0002249**  
Walli genes: 14   
All interpro results: 26   
Unique interpro results: 10  
- appears to be a V-ATPase (proton pump) or an armadillo fold

**OG0003266** - Aspartic peptidase  
Walli genes: 12   
All interpro results: 99   
Unique interpro results: 8   
- aspartic peptidase https://www.ebi.ac.uk/interpro/entry/InterPro/IPR021109/
- even the Xylanase inhibitors are also found in the aspartic peptidases: https://www.ebi.ac.uk/interpro/entry/InterPro/IPR032861/

**OG0004831** - Pentatricopeptide repeat  
Walli genes: 10   
All interpro results: 124   
Unique interpro results: 8   
- See OG0000062 and OG0000339

**OG0000724** - Calcium/lipid-binding domain, CaLB  
Walli genes: 21   
All interpro results: 147   
Unique interpro results: 10   
- https://www.ncbi.nlm.nih.gov/gene?cmd=DetailsSearch&term=AT3G61050: "It can bind ceramide and is involved in drought and salt tolerance."

**OG0003558**  
Walli genes: 11   
All interpro results: 64   
Unique interpro results: 12   
- Glycosidases and Glycosyl hydrolase are the same. Can't figure out what they do though
- FRIGIDA-like protein 1 in arabidopsis: https://www.uniprot.org/uniprot/Q9FFF1
     - Panther is also frigida-like
     
**OG0003647** - Pentatricopeptide repeat  
Walli genes: 11   
All interpro results: 100   
Unique interpro results: 6   
- There are a few more up there

**OG0003644**  
Walli genes: 11   
All interpro results: 20   
Unique interpro results: 3   
- un-named panther family

**OG0000769**  
Walli genes: 20   
All interpro results: 49   
Unique interpro results: 3   
- HOMEODOMAIN PROTEIN 8, PUTATIVE-RELATED (http://www.pantherdb.org/panther/family.do?clsAccession=PTHR35121:SF4)
- probably a transcription factor https://ghr.nlm.nih.gov/primer/genefamily/homeoboxes

**OG0005358**  
Walli genes: 9   
All interpro results: 116   
Unique interpro results: 11   
- Panther is the same as the ProSiteProfiles one below it
- Seems to be a DNA binding something: B3 is exclusively TFs https://en.wikipedia.org/wiki/B3_domain

**OG0001077** - Glycosyltransferase/glycogen phosphorylase - anti-biotics  
Walli genes: 18   
All interpro results: 117   
Unique interpro results: 6  
- cdd link https://www.ncbi.nlm.nih.gov/Structure/cdd/cd03784 - involved in the final stages of the biosynthesis of antibiotics

**OG0002833** - Cyclin-like & Leucine rich  
Walli genes: 12   
All interpro results: 99   
Unique interpro results: 20   
- gene3D http://www.cathdb.info/version/v4_2_0/superfamily/1.10.472.10

**OG0002836** - unclear  
Walli genes: 12   
All interpro results: 24   
Unique interpro results: 3  
- Panther http://www.pantherdb.org/panther/family.do?clsAccession=PTHR36485:SF1, but function unclear

So I would probably like to look further into:
- The ones for TFs
- The  Pentatricopeptide repeats: appear to be mitochondria interacting? https://en.wikipedia.org/wiki/Pentatricopeptide_repeat
- The disease resistance ones

And should re-check both lists to see if anything else looks worth it

In [39]:
#print("\n".join(interesting + interesting_2))

## Summary of 38 "interesting" orthogroups
TEs:
- OG0000051
- OG0000107
- OG0002516
- OG0000357 (probably)

Disease?:
- OG0000029
- OG0000111 - a bit complicated
- OG0000647 - a bit complicated
- OG0003049 - antibiotic?
- OG0001077 - antibiotic?

Protien Kinase:
- OG0000208
- OG0000019

Pentatricopeptide repeat (mitochondria interacting):
- OG0000339
- OG0000062
- OG0004831
- OG0003647

Mitochondria/chloroplast:
- OG0000211

Protien interactions:
- OG0001315
- OG0001651

Possibly a Transcription factor:
- OG0000769
- OG0005358

Odd but specific:
- OG0000130 - Leucine rich repeat
- OG0002833 - Leucine rich & Cyclin-like
- OG0000170 - NAD-dependent something
- OG0000022 - Peptidase S10, serine carboxypeptidase
- OG0001024 - Golgi
- OG0001842 - Lipase
- OG0001075 - Myosin-like
- OG0000125 - Chromatin
- OG0002249 - V-ATPase (Proton pump)
- OG0003266 - Aspartic peptidase  
- OG0000724 - Calcium/lipid-binding domain, CaLB: salt and drought tolerance 
- OG0000082 - transferase of some sort

Unclear:
- OG0000038 - a few possible results (auxin signalling inhibition, other unknown genes)
- OG0000190 - a lot of info but idk how its all related
- OG0004401 - a few possible results (transporter, Citrate Synthase)
- OG0003558 - Glycosyl hydrolase and/or FRIGIDA-like protein 
- OG0003644 - no info
- OG0002836 - unclear




In [40]:
# Gene counts for every hand-picked orthogroup (first round, then second round).
counts_interested = ortho_count_df.loc[interesting + interesting_2]

In [41]:
# Working conclusion for each hand-picked orthogroup, written once per label.
# Other cells filter on these label strings verbatim, so the spellings are
# kept exactly as they are.
conclusion_to_ogs = {
    "TEs": ["OG0000051", "OG0000107", "OG0002516", "OG0000357"],
    "Disease": ["OG0000029", "OG0000111", "OG0000647"],
    "Disease - antibiotic?": ["OG0003049", "OG0001077"],
    "Protien Kinase": ["OG0000208", "OG0000019"],
    "Pentatricopeptide repeat": ["OG0000339", "OG0000062", "OG0004831", "OG0003647"],
    "Mitochondria/chloroplast": ["OG0000211"],
    "Protien interactions": ["OG0001315", "OG0001651"],
    "transcription factor possibly": ["OG0000769", "OG0005358"],
    "Leucine rich repeat": ["OG0000130"],
    "NAD-dependent": ["OG0000170"],
    "Peptidase S10, serine carboxypeptidase": ["OG0000022"],
    "Golgi": ["OG0001024"],
    "Lipase": ["OG0001842"],
    "Myosin-like": ["OG0001075"],
    "Chromatin": ["OG0000125"],
    "V-ATPase (Proton pump)": ["OG0002249"],
    "Aspartic peptidase": ["OG0003266"],
    "Calcium/lipid-binding domain, CaLB: salt and drought tolerance": ["OG0000724"],
    "transferase of some sort": ["OG0000082"],
    "Leucine rich & Cyclin-like": ["OG0002833"],
    "Unclear": ["OG0000038", "OG0000190", "OG0004401", "OG0003558", "OG0002836"],
    "no info": ["OG0003644"],
}

# Flatten to the orthogroup -> label lookup used by the rest of the notebook.
dict_conc = {
    orthogroup: label
    for label, orthogroups in conclusion_to_ogs.items()
    for orthogroup in orthogroups
}


In [42]:
# Attach the working conclusion for each orthogroup as an "info" column.
counts_interested = counts_interested.assign(info=counts_interested.index.map(dict_conc))

## Summary Table

In [43]:
# Summary table ordered by the "info" label so orthogroups with the same label sit together.
counts_interested.sort_values(by="info")

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop,info
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OG0003266,0,1,1,1,1,1,1,12,18,0.666667,Aspartic peptidase
OG0000724,1,1,1,2,1,3,2,21,32,0.65625,"Calcium/lipid-binding domain, CaLB: salt and d..."
OG0000125,3,11,6,4,8,3,12,21,68,0.308824,Chromatin
OG0000111,0,8,10,18,2,3,2,28,71,0.394366,Disease
OG0000647,1,1,1,2,0,0,0,28,33,0.848485,Disease
OG0000029,0,18,26,16,0,4,3,49,116,0.422414,Disease
OG0003049,0,0,1,2,0,1,1,13,18,0.722222,Disease - antibiotic?
OG0001077,0,4,2,2,0,2,0,18,28,0.642857,Disease - antibiotic?
OG0001024,1,1,0,2,1,1,2,20,28,0.714286,Golgi
OG0002833,1,1,1,1,1,1,1,12,19,0.631579,Leucine rich & Cyclin-like


In [44]:
#(counts_interested.sort_values("info")).to_csv("orthogroups_interesting.tsv", sep="\t")

### Further research

From Ben: "For the OGs of interest I would pick 3-4 groups and look what orthologs in Arabidopsis or so do if available. Go for the transcription factors, protein kinase, disease and maybe Calcium stuff. If no Arabidopsis available just do a blast search and see if any other homolog has a described function so you can write a bit of a story."

In [45]:
#counts_interested["info"].unique()

In [46]:
# Subsets of the summary table for the four themes picked for follow-up.
info_labels = counts_interested["info"]
TFs = counts_interested[info_labels.eq('transcription factor possibly')]
pk = counts_interested[info_labels.eq('Protien Kinase')]
# Substring match, so "Disease - antibiotic?" rows are included too.
dis = counts_interested[info_labels.str.contains("Disease")]
cal = counts_interested[info_labels.eq('Calcium/lipid-binding domain, CaLB: salt and drought tolerance')]

In [47]:
# Stack the four follow-up subsets (TFs, kinases, disease, calcium) into one
# table for display, keeping their order and the Orthogroup index.
# pd.concat replaces the chained DataFrame.append calls: DataFrame.append was
# removed in pandas 2.0, so the old form fails on a current kernel.
pd.concat([TFs, pk, dis, cal])

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop,info
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OG0000769,0,3,2,1,5,0,0,20,31,0.645161,transcription factor possibly
OG0005358,1,1,1,2,0,0,0,9,14,0.642857,transcription factor possibly
OG0000208,0,1,1,2,2,6,4,38,54,0.703704,Protien Kinase
OG0000019,0,10,36,12,22,15,20,25,140,0.178571,Protien Kinase
OG0000029,0,18,26,16,0,4,3,49,116,0.422414,Disease
OG0000111,0,8,10,18,2,3,2,28,71,0.394366,Disease
OG0000647,1,1,1,2,0,0,0,28,33,0.848485,Disease
OG0003049,0,0,1,2,0,1,1,13,18,0.722222,Disease - antibiotic?
OG0001077,0,4,2,2,0,2,0,18,28,0.642857,Disease - antibiotic?
OG0000724,1,1,1,2,1,3,2,21,32,0.65625,"Calcium/lipid-binding domain, CaLB: salt and d..."


In [48]:
topics = [TFs, pk, dis, cal]
for topic_df in topics:
    # Print the theme's label(s), then the A_thaliana entry of each of its orthogroups.
    print(topic_df["info"].unique())
    print(ortho_species_df.loc[topic_df.index, "A_thaliana"])

['transcription factor possibly']
Orthogroup
OG0000769            NaN
OG0005358    AT4G33280.3
Name: A_thaliana, dtype: object
['Protien Kinase']
Orthogroup
OG0000208    NaN
OG0000019    NaN
Name: A_thaliana, dtype: object
['Disease' 'Disease - antibiotic?']
Orthogroup
OG0000029            NaN
OG0000111            NaN
OG0000647    AT5G36930.2
OG0003049            NaN
OG0001077            NaN
Name: A_thaliana, dtype: object
['Calcium/lipid-binding domain, CaLB: salt and drought tolerance']
Orthogroup
OG0000724    AT4G01200.1
Name: A_thaliana, dtype: object


So from looking at arabidopsis results, we have that:
- OG0005358 (TFs) contains AT4G33280.3 (https://gbrowseaws.arabidopsis.org/servlets/TairObject?type=gene&id=1000691561) which is "AP2/B3-like transcriptional factor family protein"
- OG0000647 (Dis) contains AT5G36930.2 (https://gbrowseaws.arabidopsis.org/servlets/TairObject?id=133733&type=locus) which is "Disease resistance protein (TIR-NBS-LRR class) family"
    - description of what TIR-NBS-LRR means (a motif homologous to the cytoplasmic domains of the Drosophila Toll protein and the mammalian interleukin-1 receptor (TIR), nucleotide-binding site (NBS), a leucine-rich repeat (LRR)) (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3472223/#__sec4title).
    - So it is about sequence homology rather than necessarily what the gene does - R gene
- OG0000724 (Cal) contains AT4G01200.1 (https://www.arabidopsis.org/servlets/TairObject?type=locus&name=AT4G01200) which is "Calcium-dependent lipid-binding (CaLB domain) family protein" as specified. It is apparently expressed in guard cells, but TAIR does not have records for the biological process. This other CaLB gene (https://www.arabidopsis.org/servlets/TairObject?type=locus&name=AT3G61050) is the one identified to be involved in salt and drought tolerance https://academic.oup.com/jxb/article/62/8/2679/478984
    - in that paper they talk about the importance of C2 domains (which our unknown one does have) in stress responses generally: "Experiments with *Atclb* T-DNA knockout lines revealed that these mutants can withstand prolonged drought and salt stress, suggesting involvement of the *Atclb* gene in stress signalling as a negative regulator. Similarly, Yang *et al.* (2006) showed that *Arabidopsis* BAP1 protein with an evolutionarily conserved C2 domain is a negative regulator of defence responses."

In [49]:
# Per-species gene lists for the disease-related orthogroups.
ortho_species_df.loc[dis.index, :]

Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
OG0000029,,"Solyc02g036270.2.1, Solyc03g094100.1.1, Solyc0...","PGSC0003DMP400001764, PGSC0003DMP400005107, PG...","CEY00_Acc03815, CEY00_Acc03888, CEY00_Acc03889...",,"Lsat_1_v5_gn_0_10760.1, Lsat_1_v5_gn_1_83140.1...","HanXRQChr01g0017431, HanXRQChr09g0260361, HanX...","file_1_file_1_g11039.t1, file_1_file_1_g11040...."
OG0000111,,"Solyc01g102840.2.1, Solyc01g102850.1.1, Solyc0...","PGSC0003DMP400005063, PGSC0003DMP400022408, PG...","CEY00_Acc05254, CEY00_Acc06660, CEY00_Acc06661...","DCAR_022320, DCAR_029849","Lsat_1_v5_gn_5_101660.1, Lsat_1_v5_gn_5_101700...","HanXRQChr10g0293411, HanXRQChr10g0298171","file_1_file_1_g34922.t1, file_1_file_1_g34925...."
OG0000647,AT5G36930.2,Solyc09g007710.2.1,PGSC0003DMP400048338,"CEY00_Acc00750, CEY00_Acc12912",,,,"file_1_file_1_g17160.t1, file_1_file_1_g17160...."
OG0003049,,,PGSC0003DMP400017151,"CEY00_Acc14622, CEY00_Acc14623",,Lsat_1_v5_gn_9_8641.1,HanXRQChr10g0281011,"file_1_file_1_g4143.t1, file_1_file_1_g44729.t..."
OG0001077,,"Solyc07g043020.1.1, Solyc07g043030.1.1, Solyc0...","PGSC0003DMP400023996, PGSC0003DMP400057770","CEY00_Acc21775, CEY00_Acc21776",,"Lsat_1_v5_gn_5_138141.1, Lsat_1_v5_gn_9_116901.1",,"file_1_file_1_g27696.t1, file_1_file_1_g27697...."


Things I want to write out so I can blast these results:

In [50]:
# W. ceracea gene IDs pooled from the disease orthogroups, to be BLASTed together.
# The numbers are row positions in `dis`. Position 2 is not pooled: its term
# was commented out in the earlier form of this cell.
disease = [
    gene_id
    for row_pos in (0, 1, 3, 4)
    for gene_id in ortho_species_df.loc[dis.index[row_pos], "W_ceracea"].split(", ")
]

In [51]:
# W. ceracea gene IDs pooled from the first two rows of `pk`.
protien_k = [
    gene_id
    for row_pos in (0, 1)
    for gene_id in ortho_species_df.loc[pk.index[row_pos], "W_ceracea"].split(", ")
]

In [52]:
# W. ceracea gene IDs from the first row of `TFs` only.
first_tf_og = TFs.index[0]
trans_f = ortho_species_df.loc[first_tf_og, "W_ceracea"].split(", ")

In [53]:
# Disabled: writes the pooled gene-ID lists (disease, protien_k, trans_f), one ID
# per line, for downstream sequence extraction. Presumably commented out after a
# first run so the existing .list files are not rewritten on re-execution.
#with open("/home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/disease.list", "w") as f:
#    f.write("\n".join(disease) + "\n")
#with open("/home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/protien_k.list", "w") as f:
#    f.write("\n".join(protien_k) + "\n")
#with open("/home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/trans_f.list", "w") as f:
#    f.write("\n".join(trans_f) + "\n")

In [54]:
# Per-orthogroup W. ceracea gene lists for the first three disease orthogroups
# in `dis`, each written to its own one-ID-per-line .list file.
dis_OG0000029, dis_OG0000111, dis_OG0000647 = [
    ortho_species_df.loc[dis.index[position]]["W_ceracea"].split(", ")
    for position in range(3)
]
list_targets = (
    ("/home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/dis_OG0000029.list", dis_OG0000029),
    ("/home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/dis_OG0000111.list", dis_OG0000111),
    ("/home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/dis_OG0000647.list", dis_OG0000647),
)
for list_path, gene_ids in list_targets:
    with open(list_path, "w") as f:
        # Trailing newline so that `wc -l` counts every ID.
        f.write("\n".join(gene_ids) + "\n")

In [55]:
%%bash
# Sanity check: the number of IDs in each .list file should equal the number of
# FASTA records (">" header lines) in the .fa file extracted from it.
# Abort if the directory is missing; otherwise the globs below would silently
# run in whatever directory the kernel was started from.
cd /home/jemimah/analysis/v3/orthofinder/20200911/interesting_protein_lists/ || exit 1
# One-off extraction steps, left disabled:
#for x in *.list; do seqtk subseq ~/analysis/v3/braker2/20200709_real/braker2.codingseq ${x} > ${x%%.list}.fa; done
#for x in dis_*.fa; do grep -v ">" ${x} > ${x%%.fa}_merged.fa; done
wc -l *.list
# *_merged.fa files had their header lines stripped (grep -v ">"), so a count of 0
# is expected for them.
grep -c ">" *.fa

 108 disease.list
  49 dis_OG0000029.list
  28 dis_OG0000111.list
  28 dis_OG0000647.list
  63 protien_k.list
  20 trans_f.list
 296 total
disease.fa:108
disease_merged.fa:0
dis_OG0000029.fa:49
dis_OG0000029_merged.fa:0
dis_OG0000111.fa:28
dis_OG0000111_merged.fa:0
dis_OG0000647.fa:28
dis_OG0000647_merged.fa:0
protien_k.fa:63
protien_k_merged.fa:0
trans_f.fa:20
trans_f_merged.fa:0


## Topics Summary of above

In [56]:
# Map each topic's label (the "info" value of its first row) to the list of
# orthogroup IDs in that topic's table.
topics = [TFs, pk, dis, cal]
# .iloc[0] is used because the tables are indexed by orthogroup ID (strings):
# a bare [0] on such a Series relies on pandas' deprecated positional fallback.
topics_ogs = {topic["info"].iloc[0]: list(topic.index) for topic in topics}

#### Transcription Factors

In [57]:
# Topic label (first distinct "info" value), then ips_summary in "table" mode
# for every orthogroup in the topic, then the gene-count rows themselves.
tf_label = TFs["info"].unique()[0]
print(tf_label)
ips_summary(TFs.index.tolist(), "table")
TFs

transcription factor possibly
OG0000769


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
4045,PANTHER,PTHR35121,-,,,20
4046,PANTHER,PTHR35121:SF4,-,,,20
131473,MobiDBLite,mobidb-lite,consensus disorder prediction,,,9


OG0005358


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
30236,Gene3D,G3DSA:2.40.330.10,-,IPR015300,DNA-binding pseudobarrel domain superfamily,14
30240,PANTHER,PTHR31391,-,,,14
30243,ProSiteProfiles,PS50863,B3 DNA-binding domain profile.,IPR003340,B3 DNA binding domain,14
30246,SUPERFAMILY,SSF101936,DNA-binding pseudobarrel domain,IPR015300,DNA-binding pseudobarrel domain superfamily,14
30247,Pfam,PF02362,B3 DNA binding domain,IPR003340,B3 DNA binding domain,13
30237,CDD,cd10017,B3_DNA,IPR003340,B3 DNA binding domain,12
30239,PANTHER,PTHR31391:SF75,B3 DOMAIN-CONTAINING PROTEIN REM16,,,12
30252,SMART,SM01019,B3_2,IPR003340,B3 DNA binding domain,11
30238,MobiDBLite,mobidb-lite,consensus disorder prediction,,,10
470114,PANTHER,PTHR31391:SF2,-,,,1


Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop,info
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OG0000769,0,3,2,1,5,0,0,20,31,0.645161,transcription factor possibly
OG0005358,1,1,1,2,0,0,0,9,14,0.642857,transcription factor possibly


In [67]:
# For each comparison species, show the member genes of every TF orthogroup:
# once as the (truncated) Series repr and once as a plain list with full strings.
# The slice [:-4] skips the last four columns (W_ceracea, Total, prop, info).
for species in TFs.columns[:-4]:
    members = ortho_species_df.loc[TFs.index][species]
    print(species)
    print(members)
    print(list(members))

A_thaliana
Orthogroup
OG0000769            NaN
OG0005358    AT4G33280.3
Name: A_thaliana, dtype: object
[nan, 'AT4G33280.3']
S_lycopersicum
Orthogroup
OG0000769    Solyc07g007070.1.1, Solyc07g007080.1.1, Solyc0...
OG0005358                                   Solyc02g090170.2.1
Name: S_lycopersicum, dtype: object
['Solyc07g007070.1.1, Solyc07g007080.1.1, Solyc07g007100.1.1', 'Solyc02g090170.2.1']
S_tuberosum
Orthogroup
OG0000769    PGSC0003DMP400053545, PGSC0003DMP400053553
OG0005358                          PGSC0003DMP400017857
Name: S_tuberosum, dtype: object
['PGSC0003DMP400053545, PGSC0003DMP400053553', 'PGSC0003DMP400017857']
A_chinensis
Orthogroup
OG0000769                    CEY00_Acc00677
OG0005358    CEY00_Acc04681, CEY00_Acc23701
Name: A_chinensis, dtype: object
['CEY00_Acc00677', 'CEY00_Acc04681, CEY00_Acc23701']
D_carota
Orthogroup
OG0000769    DCAR_014925, DCAR_014926, DCAR_014927, DCAR_01...
OG0005358                                                  NaN
Name: D_carota, dtyp

**OG0000769**   
Walli genes: 20   
All interpro results: 49   
Unique interpro results: 3   
- HOMEODOMAIN PROTEIN 8, PUTATIVE-RELATED (http://www.pantherdb.org/panther/family.do?clsAccession=PTHR35121:SF4)
- probably a transcription factor https://www.sciencedirect.com/science/article/pii/096800049290434B?via%3Dihub
- consensus disorder prediction also present in TFs

Other proteins in this PANTHER family for Arabidopsis are uncharacterised and disordered.  

Because I can't find anything of interest for this set, or even find out what PANTHER means by "homeodomain protein 8" (there are a lot of homeodomain proteins???), I am dropping it.

**OG0005358**  
Walli genes: 9   
All interpro results: 116   
Unique interpro results: 11   
- Panther is the same as the ProSiteProfiles one below it
- Seems to be a DNA binding something: B3 is exclusively TFs https://en.wikipedia.org/wiki/B3_domain  
  
    
- OG0005358 (TFs) contains AT4G33280.3 (https://gbrowseaws.arabidopsis.org/servlets/TairObject?type=gene&id=1000691561) which is "AP2/B3-like transcriptional factor family protein"
- the interproscan results for this gene also have the barrel thing http://www.ebi.ac.uk/interpro/search/text/A0A1P8B3A7/#table

[plant specific???]

BLAST for trans_f.fa reports no significant matches when searching for highly similar sequences.  
With less similar sequences allowed, the matches are mostly uncharacterised sequences.

#### Protein Kinases

In [59]:
# Topic label (first distinct "info" value), then ips_summary in "table" mode
# for every orthogroup in the topic, then the gene-count rows themselves.
pk_label = pk["info"].unique()[0]
print(pk_label)
ips_summary(pk.index.tolist(), "table")
pk

Protien Kinase
OG0000208


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
23220,Gene3D,G3DSA:1.10.510.10,Transferase(Phosphotransferase) domain 1,,,35
23234,SUPERFAMILY,SSF56112,Protein kinase-like (PK-like),IPR011009,Protein kinase-like domain superfamily,34
23230,ProSiteProfiles,PS50011,Protein kinase domain profile.,IPR000719,Protein kinase domain,33
1328,Gene3D,G3DSA:3.30.200.20,Phosphorylase Kinase; domain 1,,,32
23237,Pfam,PF00069,Protein kinase domain,IPR000719,Protein kinase domain,32
23218,PANTHER,PTHR27002:SF507,-,,,31
23231,PANTHER,PTHR27002,-,,,31
1330,CDD,cd01098,PAN_AP_plant,,,28
23219,ProSitePatterns,PS00108,Serine/Threonine protein kinases active-site signature.,IPR008271,"Serine/threonine-protein kinase, active site",28
23224,SMART,SM00220,serkin_6,IPR000719,Protein kinase domain,28


OG0000019


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
84613,PANTHER,PTHR27009,-,,,41
84618,PANTHER,PTHR27009:SF111,-,,,26
84615,Gene3D,G3DSA:1.10.510.10,Transferase(Phosphotransferase) domain 1,,,26
84617,SUPERFAMILY,SSF56112,Protein kinase-like (PK-like),IPR011009,Protein kinase-like domain superfamily,22
84614,ProSiteProfiles,PS50011,Protein kinase domain profile.,IPR000719,Protein kinase domain,21
84612,Pfam,PF00069,Protein kinase domain,IPR000719,Protein kinase domain,17
84621,Gene3D,G3DSA:3.30.200.20,Phosphorylase Kinase; domain 1,,,16
84611,SMART,SM00220,serkin_6,IPR000719,Protein kinase domain,15
84610,ProSitePatterns,PS00108,Serine/Threonine protein kinases active-site signature.,IPR008271,"Serine/threonine-protein kinase, active site",15
107972,PANTHER,PTHR27009:SF124,-,,,12


Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop,info
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OG0000208,0,1,1,2,2,6,4,38,54,0.703704,Protien Kinase
OG0000019,0,10,36,12,22,15,20,25,140,0.178571,Protien Kinase


In [60]:
# For each comparison species, show the member genes of every protein-kinase
# orthogroup: once as the (truncated) Series repr and once as a plain list.
# The slice [:-4] skips the last four columns (W_ceracea, Total, prop, info).
for species in pk.columns[:-4]:
    members = ortho_species_df.loc[pk.index][species]
    print(species)
    print(members)
    print(list(members))

A_thaliana
Orthogroup
OG0000208    NaN
OG0000019    NaN
Name: A_thaliana, dtype: object
[nan, nan]
S_lycopersicum
Orthogroup
OG0000208                                   Solyc04g078410.2.1
OG0000019    Solyc02g079610.2.1, Solyc02g081470.2.1, Solyc0...
Name: S_lycopersicum, dtype: object
['Solyc04g078410.2.1', 'Solyc02g079610.2.1, Solyc02g081470.2.1, Solyc02g081510.1.1, Solyc02g086210.2.1, Solyc03g007210.2.1, Solyc04g007380.1.1, Solyc05g009040.2.1, Solyc05g009050.2.1, Solyc05g009090.1.1, Solyc12g040370.1.1']
S_tuberosum
Orthogroup
OG0000208                                 PGSC0003DMP400005946
OG0000019    PGSC0003DMP400008038, PGSC0003DMP400010170, PG...
Name: S_tuberosum, dtype: object
['PGSC0003DMP400005946', 'PGSC0003DMP400008038, PGSC0003DMP400010170, PGSC0003DMP400010663, PGSC0003DMP400016043, PGSC0003DMP400021362, PGSC0003DMP400021364, PGSC0003DMP400021365, PGSC0003DMP400024484, PGSC0003DMP400024750, PGSC0003DMP400024751, PGSC0003DMP400024752, PGSC0003DMP400024753, PGSC0003DMP40002

**OG0000208** - Protein Kinases  
Walli genes: 38   
All interpro results: 571   
Unique interpro results: 38  
- Panther is also Protein Kinases

~**OG0000019**~ - Protein kinases, maybe rust resistant?  
Walli genes: 25   
All interpro results: 244   
Unique interpro results: 20  
- http://www.pantherdb.org/panther/family.do?clsAccession=PTHR27009: RUST RESISTANCE KINASE LR10-RELATED
    - cannot find how it is rust resistant??
    
For both, all tomato genes are serine/threonine protein kinases (based on their InterProScan results, e.g. http://223.31.159.9/tomato2/getGene.php?trans_fac_id=Solyc04g078410.2.1), except for one (http://223.31.159.9/tomato2/getGene.php?trans_fac_id=Solyc12g040370.1.1), which is "core".

Blast online alignments for protien_k_merged.fa (looking at specific genes with protien_k.fa):
- predicted putative receptor protein kinase ZmPK1 in a range of species (ZmPK1 refers to the maize Protein Kinase 1 described here: https://www.nature.com/articles/345743a0)
- The second half has far more rust-resistance matches - LR10-like (paper for LR10: https://pubmed.ncbi.nlm.nih.gov/9700067/)

Both ZmPK1 (https://www.uniprot.org/uniprot/P17801) and LR10 (https://www.uniprot.org/uniprot/P93604) are Serine/threonine protein kinases.  
Both are thought to be probable receptors  


#### Diseases

In [61]:
# Topic label (first distinct "info" value), then ips_summary in "table" mode
# for every orthogroup in the topic, then the gene-count rows themselves.
dis_label = dis["info"].unique()[0]
print(dis_label)
ips_summary(dis.index.tolist(), "table")
dis

Disease
OG0000029


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
71047,PRINTS,PR00364,Disease resistance protein signature,,,61
25035,PANTHER,PTHR33463,-,,,58
25038,PANTHER,PTHR33463:SF21,-,,,55
25037,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,50
25040,SUPERFAMILY,SSF52058,L domain-like,,,29
25043,Pfam,PF00931,NB-ARC domain,IPR002182,NB-ARC,23
25039,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,IPR027417,P-loop containing nucleoside triphosphate hydrolase,22
60638,Gene3D,G3DSA:3.40.50.300,-,,,20
54777,MobiDBLite,mobidb-lite,consensus disorder prediction,,,18
59350,Coils,Coil,Coil,,,15


OG0000111


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
28718,PRINTS,PR00364,Disease resistance protein signature,,,60
28715,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,57
28713,PANTHER,PTHR11017,LEUCINE-RICH REPEAT-CONTAINING PROTEIN,,,53
28727,PANTHER,PTHR11017:SF298,-,,,47
28722,SUPERFAMILY,SSF52058,L domain-like,,,29
28724,Gene3D,G3DSA:3.40.50.10140,-,IPR035897,Toll/interleukin-1 receptor homology (TIR) domain superfamily,26
28726,Pfam,PF01582,TIR domain,IPR000157,Toll/interleukin-1 receptor homology (TIR) domain,24
28725,SUPERFAMILY,SSF52200,Toll/Interleukin receptor TIR domain,IPR035897,Toll/interleukin-1 receptor homology (TIR) domain superfamily,24
28732,SMART,SM00255,till_3,IPR000157,Toll/interleukin-1 receptor homology (TIR) domain,22
28712,ProSiteProfiles,PS50104,TIR domain profile.,IPR000157,Toll/interleukin-1 receptor homology (TIR) domain,22


OG0000647


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
14399,PRINTS,PR00364,Disease resistance protein signature,,,70
14397,PANTHER,PTHR11017,LEUCINE-RICH REPEAT-CONTAINING PROTEIN,,,34
68262,ProSiteProfiles,PS51450,Leucine-rich repeat profile.,IPR001611,Leucine-rich repeat,33
14404,PANTHER,PTHR11017:SF301,-,,,31
68275,Gene3D,G3DSA:3.80.10.10,Ribonuclease Inhibitor,IPR032675,Leucine-rich repeat domain superfamily,25
14403,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,IPR027417,P-loop containing nucleoside triphosphate hydrolase,22
14396,Gene3D,G3DSA:3.40.50.300,-,,,21
68264,SUPERFAMILY,SSF52058,L domain-like,,,19
14395,Pfam,PF00931,NB-ARC domain,IPR002182,NB-ARC,18
14402,Gene3D,G3DSA:3.40.50.10140,-,IPR035897,Toll/interleukin-1 receptor homology (TIR) domain superfamily,18


OG0003049


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
214700,Gene3D,G3DSA:2.60.120.330,-,IPR027443,Isopenicillin N synthase-like,14
214701,PANTHER,PTHR10209,"OXIDOREDUCTASE, 2OG-FE II OXYGENASE FAMILY PROTEIN",,,14
214698,SUPERFAMILY,SSF51197,Clavaminate synthase-like,,,13
214699,PANTHER,PTHR10209:SF687,-,,,11
222647,Pfam,PF03171,2OG-Fe(II) oxygenase superfamily,IPR005123,Oxoglutarate/iron-dependent dioxygenase,9
222649,ProSiteProfiles,PS51471,Fe(2+) 2-oxoglutarate dioxygenase domain profile.,IPR005123,Oxoglutarate/iron-dependent dioxygenase,8
222652,Pfam,PF14226,non-haem dioxygenase in morphine synthesis N-terminal,IPR026992,Non-haem dioxygenase N-terminal domain,7
697221,PANTHER,PTHR10209:SF451,-,,,2
410550,PANTHER,PTHR10209:SF508,-,,,1


OG0001077


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
114272,Gene3D,G3DSA:3.40.50.2000,Glycogen Phosphorylase B;,,,36
114268,CDD,cd03784,GT1_Gtf-like,,,18
114273,SUPERFAMILY,SSF53756,UDP-Glycosyltransferase/glycogen phosphorylase,,,18
114269,PANTHER,PTHR11926,GLUCOSYL/GLUCURONOSYL TRANSFERASES,,,15
114270,Pfam,PF00201,UDP-glucoronosyl and UDP-glucosyl transferase,IPR002213,UDP-glucuronosyl/UDP-glucosyltransferase,15
114271,ProSitePatterns,PS00375,UDP-glycosyltransferases signature.,IPR035595,"UDP-glycosyltransferase family, conserved site",15


Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop,info
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OG0000029,0,18,26,16,0,4,3,49,116,0.422414,Disease
OG0000111,0,8,10,18,2,3,2,28,71,0.394366,Disease
OG0000647,1,1,1,2,0,0,0,28,33,0.848485,Disease
OG0003049,0,0,1,2,0,1,1,13,18,0.722222,Disease - antibiotic?
OG0001077,0,4,2,2,0,2,0,18,28,0.642857,Disease - antibiotic?


In [62]:
# Do two disease orthogroups share any W. ceracea genes?
# Each "W_ceracea" cell is a single ", "-joined string, so it has to be split into
# individual gene IDs first. Intersecting the unsplit cells only tests whether the
# two whole strings are identical, which is always empty for distinct orthogroups.
genes_OG0000111 = set(ortho_species_df.loc["OG0000111", "W_ceracea"].split(", "))
genes_OG0000647 = set(ortho_species_df.loc["OG0000647", "W_ceracea"].split(", "))
genes_OG0000111 & genes_OG0000647
# Other pairings, checked the same way:
#genes_OG0000111 & set(ortho_species_df.loc["OG0000029", "W_ceracea"].split(", "))
#set(ortho_species_df.loc["OG0000029", "W_ceracea"].split(", ")) & genes_OG0000647

set()

In [63]:
# Print, for the first two disease orthogroups, the member genes of every
# comparison species. The species columns are taken from `dis` (this section's
# own table) rather than from the protein-kinase table.
for orthogroup in ("OG0000029", "OG0000111"):
    print(orthogroup)
    # [:-4] skips the last four columns (W_ceracea, Total, prop, info).
    for species in dis.columns[:-4]:
        print(species)
        print(ortho_species_df.loc[orthogroup, species])

OG0000029
A_thaliana
nan
S_lycopersicum
Solyc02g036270.2.1, Solyc03g094100.1.1, Solyc04g005540.2.1, Solyc04g005550.1.1, Solyc04g048920.1.1, Solyc07g044800.2.1, Solyc10g054940.1.1, Solyc10g054970.1.1, Solyc10g054990.1.1, Solyc10g055050.1.1, Solyc10g055110.1.1, Solyc10g055140.1.1, Solyc10g055170.1.1, Solyc12g005530.1.1, Solyc12g016220.1.1, Solyc12g044180.1.1, Solyc12g044190.1.1, Solyc12g044200.1.1
S_tuberosum
PGSC0003DMP400001764, PGSC0003DMP400005107, PGSC0003DMP400005435, PGSC0003DMP400013406, PGSC0003DMP400014641, PGSC0003DMP400019233, PGSC0003DMP400019235, PGSC0003DMP400019349, PGSC0003DMP400019350, PGSC0003DMP400020238, PGSC0003DMP400020241, PGSC0003DMP400020244, PGSC0003DMP400023299, PGSC0003DMP400026426, PGSC0003DMP400029390, PGSC0003DMP400029592, PGSC0003DMP400030444, PGSC0003DMP400030446, PGSC0003DMP400032936, PGSC0003DMP400037237, PGSC0003DMP400038006, PGSC0003DMP400038184, PGSC0003DMP400048746, PGSC0003DMP400048747, PGSC0003DMP400048785, PGSC0003DMP400065504
A_chinensis
CEY00_

**OG0000029** - disease resistance  
Walli genes: 49   
All interpro results: 391   
Unique interpro results: 21  
- PRINTS result is for disease resistance (hard to find more info on, though)
- PTHR33463:SF21 can't be found??? But PTHR33463 is a family with 973 genes, some of the subfamilies being disease resistance http://www.pantherdb.org/list/list.do?numPerPage=100&save=yes&searchModType=numperpage&listType=6

Tomato  
- Solyc02g036270.2.1 disease resistance: NB-LRR family (https://string-db.org/network/4081.Solyc02g036270.2.1, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5832704/)
- generally can be found: http://223.31.159.9/tomato2/r-genes.html

**OG0000111** - Maybe disease resistance?   
Walli genes: 28    
All interpro results: 460   
Unique interpro results: 23  
- Seems to be a mixture of results I've already had?
    - PRINTS reports disease resistance
    - Gene3D, Panther, others report Leucine-rich repeat domain
    - Toll/interleukin results seem to be involved in plant cytoplasmic host defence (https://www.ebi.ac.uk/interpro/entry/InterPro/IPR000157/)
  
- Putative tomato R genes in matches:
    - http://223.31.159.9/tomato2/getGene.php?trans_fac_id=Solyc02g032650.2.1; http://223.31.159.9/tomato2/getGene.php?trans_fac_id=Solyc02g032230.1.1; http://223.31.159.9/tomato2/getGene.php?trans_fac_id=Solyc02g032230.1.1 ; http://prgdb.org/prgdb/genes/type/putative/2135094; http://www.prgdb.org/prgdb/genes/type/putative/2135035
  
- combine with the next one in discussion?
    - Different orthogroup match rates
    - different Walli genes
    - :. different "origin"
    - same domains :. same function?

**OG0000647** - very similar to the one above   
Walli genes: 28   
All interpro results: 388   
Unique interpro results: 27  
- might actually be the same results (with different counts)
- if they're so similar, why are they two different orthogroups?  
  
  
- OG0000647 (Dis) contains AT5G36930.2 (https://gbrowseaws.arabidopsis.org/servlets/TairObject?id=133733&type=locus) which is "Disease resistance protein (TIR-NBS-LRR class) family"
    - description of what TIR-NBS-LRR means (a motif homologous to the cytoplasmic domains of the Drosophila Toll protein and the mammalian interleukin-1 receptor (TIR), nucleotide-binding site (NBS), a leucine-rich repeat (LRR)) (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3472223/#__sec4title).
    - So it is about sequence homology rather than necessarily what the gene does - R gene
    - it is a type D (https://link.springer.com/article/10.1007%2Fs11103-008-9293-9) which means that its last residue is D (Aspartate)
    - We don't know what they do (https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-4-212), and there are usually giant families of them in plants.
    
    
~**OG0003049**~  
Walli genes: 13   
All interpro results: 79   
Unique interpro results: 9   
- Gene3D has it as an antibiotic? http://www.cathdb.info/version/v4_2_0/superfamily/2.60.120.330
- but all the rest seem to just be 2OG-Fe(II) oxygenase superfamily	- apparently the families overlap? http://www.ebi.ac.uk/interpro/entry/InterPro/IPR026992/
- antibiotic is too complicated

~**OG0001077**~ - Glycosyltransferase/glycogen phosphorylase - anti-biotics or pigment?  
Walli genes: 18  
All interpro results: 117  
Unique interpro results: 6  

- cdd link https://www.ncbi.nlm.nih.gov/Structure/cdd/cd03784 - involved in the final stages of the biosynthesis of antibiotics

- interproscan codes have it for plants as anthocyanin pigment biosynthesis (https://www.ebi.ac.uk/interpro/entry/InterPro/IPR002213/, https://www.ebi.ac.uk/interpro/entry/InterPro/IPR035595/)

Three orthogroups likely disease resistance:
1.	Unknown disease resistance (OG0000029)
    - 49 Walli, present in all else except Arabidopsis and carrot (3-26)
    - Matches tomato genes with known(?) gene resistance, though exact properties unknown
        -	Solyc02g036270.2.1 disease resistance: NB-LRR family (https://string-db.org/network/4081.Solyc02g036270.2.1, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5832704/)
        -	NBS-LRR and NB-LRR are the same?
        - Blast results have “like” these Arabidopsis genes: https://www.uniprot.org/uniprot/Q9T048, https://www.arabidopsis.org/servlets/TairObject?name=AT1G12280&type=locus (likely to be a bad match, because a different kind of resistance gene with interproscan results mine don’t have), https://www.arabidopsis.org/servlets/TairObject?accession=locus:2131689 
    - Compared to next one, has different species in the orthogroup and different walli genes (difference of orthology?) but same domains (same function?)
2.	Unsorted disease resistance (OG0000111)
    -	28 Walli genes, present in all except A. thaliana (2-18)
    -	Matches tomato genes with putative R gene roles
    -	Very similar to the next one
3.	TIR-NBS-LRR class (OG0000647)
    -	28 Walli genes, doesn’t match carrot, lettuce, sunflower. (1-2 matches for others)
    -	Arabidopsis match is declared disease resistance (is this putative too?). 
    -	Generally, we don’t know how R genes work and are combining homology with testing a few. 



#### Calcium

In [64]:
# Topic label (first distinct "info" value), then ips_summary in "table" mode
# for every orthogroup in the topic, then the gene-count rows themselves.
cal_label = cal["info"].unique()[0]
print(cal_label)
ips_summary(cal.index.tolist(), "table")
cal

Calcium/lipid-binding domain, CaLB: salt and drought tolerance
OG0000724


Unnamed: 0,Analysis,Signature_accession,Signature_description,Interpro_annotation,Interpro_description,count
4483,PANTHER,PTHR32246:SF69,CALCIUM-DEPENDENT LIPID-BINDING (CALB DOMAIN) FAMILY PROTEIN,,,21
4487,PANTHER,PTHR32246,-,,,21
80812,MobiDBLite,mobidb-lite,consensus disorder prediction,,,17
4484,Gene3D,G3DSA:2.60.40.150,-,IPR035892,C2 domain superfamily,16
4485,Pfam,PF00168,C2 domain,IPR000008,C2 domain,16
4486,SUPERFAMILY,SSF49562,"C2 domain (Calcium/lipid-binding domain, CaLB)",,,16
4489,ProSiteProfiles,PS50004,C2 domain profile.,IPR000008,C2 domain,15
4488,SMART,SM00239,C2_3c,IPR000008,C2 domain,14
122976,CDD,cd04051,C2_SRC2_like,,,10
263720,Coils,Coil,Coil,,,1


Unnamed: 0_level_0,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,prop,info
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OG0000724,1,1,1,2,1,3,2,21,32,0.65625,"Calcium/lipid-binding domain, CaLB: salt and d..."


In [65]:
# Member genes of OG0000724 for each comparison species.
# The slice [:-4] skips the last four columns (W_ceracea, Total, prop, info).
calcium_orthogroup = "OG0000724"
for species in cal.columns[:-4]:
    print(species)
    print(ortho_species_df.loc[calcium_orthogroup, species])

A_thaliana
AT4G01200.1
S_lycopersicum
Solyc01g095360.2.1
S_tuberosum
PGSC0003DMP400000188
A_chinensis
CEY00_Acc07593, CEY00_Acc15949
D_carota
DCAR_025234
L_sativa
Lsat_1_v5_gn_0_42241.1, Lsat_1_v5_gn_4_34140.1, Lsat_1_v5_gn_5_79441.1
H_annuus
HanXRQChr16g0523831, HanXRQChr17g0561051


**OG0000724** - Calcium/lipid-binding domain, CaLB  
Walli genes: 21   
All interpro results: 147   
Unique interpro results: 10   
- https://www.ncbi.nlm.nih.gov/gene?cmd=DetailsSearch&term=AT3G61050: "It can bind ceramide and is involved in drought and salt tolerance."

- OG0000724 (Cal) contains AT4G01200.1 (https://www.arabidopsis.org/servlets/TairObject?type=locus&name=AT4G01200) which is "Calcium-dependent lipid-binding (CaLB domain) family protein" as specified. It is apparently expressed in guard cells, but TAIR does not have records for the biological process. This other CaLB gene (https://www.arabidopsis.org/servlets/TairObject?type=locus&name=AT3G61050) is the one identified to be involved in salt and drought tolerance https://academic.oup.com/jxb/article/62/8/2679/478984
    - in that paper they talk about the importance of C2 domains (which our unknown one does have) in stress responses generally: "Experiments with *Atclb* T-DNA knockout lines revealed that these mutants can withstand prolonged drought and salt stress, suggesting involvement of the *Atclb* gene in stress signalling as a negative regulator. Similarly, Yang *et al.* (2006) showed that *Arabidopsis* BAP1 protein with an evolutionarily conserved C2 domain is a negative regulator of defence responses."

In [66]:
# Orthogroups retained for the summary table.
keeps = ["OG0005358", "OG0000029", "OG0000111", "OG0000647", "OG0000208", "OG0000019", "OG0000724"]
# Stack the four topic tables in order. pd.concat replaces the chained
# DataFrame.append calls, which no longer exist in pandas 2.x.
keeps_df = pd.concat([TFs, pk, dis, cal])
keeps_df = keeps_df[keeps_df.index.isin(keeps)].reset_index()
# Keep every column except the second-to-last one ("prop"), so "info" stays last.
paper_headers = list(keeps_df.columns[:-2].append(keeps_df.columns[-1:]))
keeps_df = keeps_df[paper_headers]
keeps_df

Unnamed: 0,Orthogroup,A_thaliana,S_lycopersicum,S_tuberosum,A_chinensis,D_carota,L_sativa,H_annuus,W_ceracea,Total,info
0,OG0005358,1,1,1,2,0,0,0,9,14,transcription factor possibly
1,OG0000208,0,1,1,2,2,6,4,38,54,Protien Kinase
2,OG0000019,0,10,36,12,22,15,20,25,140,Protien Kinase
3,OG0000029,0,18,26,16,0,4,3,49,116,Disease
4,OG0000111,0,8,10,18,2,3,2,28,71,Disease
5,OG0000647,1,1,1,2,0,0,0,28,33,Disease
6,OG0000724,1,1,1,2,1,3,2,21,32,"Calcium/lipid-binding domain, CaLB: salt and d..."
