In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that the project files
# stored under that path can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Create FASTA files of protein sequences to pass to OrthoFinder

In [98]:
base_dir = '/content/drive/My Drive/Colab Notebooks/MIT 20.C51/Project/'
filename = 'df_operons.csv'
df = pd.read_csv(os.path.join(base_dir, filename))

# Keep only gene sequences whose length is a multiple of 3. The vectorised
# .str.len() yields NaN for a missing sequence, so such rows compare False
# and are dropped here rather than raising TypeError.
df = df[df['gene_sequence'].str.len() % 3 == 0]

# Keep only sequences made up solely of the upper-case letters A, T, G, C.
# fullmatch anchors the pattern at both ends; na=False treats any
# non-string entry as a non-match.
datafilter = df['gene_sequence'].str.fullmatch('[ATGC]*', na=False)

# Renumber the surviving rows 0..n-1 on a fresh frame instead of assigning a
# new index onto a filtered slice.
data = df[datafilter].reset_index(drop=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# (rows, columns) of the table left after the sequence filters
data.shape

(92624, 13)

In [None]:
def writeFasta(data, filename):
  """Write the protein sequences in `data` to `filename` in FASTA format.

  Each row becomes two lines:

      >{locus}| [{SpeciesName}]
      {protein_sequence}

  Args:
    data: DataFrame with string columns "locus", "SpeciesName" and
      "protein_sequence". Rows are written in the frame's row order.
    filename: path of the file to create; an existing file is overwritten.
  """
  # Iterate over the column values positionally so the function works for
  # any index (filtered, shuffled, non-integer) and not only for 0..n-1.
  rows = zip(data["locus"], data["SpeciesName"], data["protein_sequence"])
  with open(filename, 'w') as f:
    for locus, species_name, protein in rows:
      f.write(">" + locus + "| [" + species_name + "]\n" + protein + "\n")

In [None]:
# Distinct species names, in order of first appearance (64 species)
species = data["SpeciesName"].unique()
species[:5]

array(['Bacteroides fragilis 638R',
       'Bacteroides thetaiotaomicron VPI-5482',
       'Burkholderia pseudomallei K96243',
       'Burkholderia thailandensis E264', 'Escherichia coli K-12'],
      dtype=object)

In [None]:
# Folder under the project directory that holds the per-species FASTA files.
fasta_dir = os.path.join(base_dir, 'Fasta')
# Create the folder when it does not exist yet; otherwise opening a file
# inside it for writing fails with FileNotFoundError.
os.makedirs(fasta_dir, exist_ok=True)
fasta_dir

'/content/drive/My Drive/Colab Notebooks/MIT 20.C51/Project/Fasta'

In [None]:
# Write one FASTA file per species; the file name is the species name with
# spaces replaced by underscores plus the ".faa" extension.
for bacteria in species:
  species_rows = data.loc[data["SpeciesName"] == bacteria]
  # renumber the subset 0..n-1 before handing it to writeFasta
  species_rows = species_rows.reset_index(drop=True)
  fasta_path = os.path.join(fasta_dir, bacteria.replace(" ", "_") + ".faa")
  writeFasta(species_rows, fasta_path)

# Process OrthoFinder results

In [99]:
# Load the tab-separated orthogroup table from the project folder.
filename = 'Orthogroups_clean.txt'
ortholog_path = os.path.join(base_dir, filename)
ortholog = pd.read_csv(ortholog_path, sep='\t')

In [100]:
# Preview the first rows of the orthogroup table
ortholog.head()

Unnamed: 0,Orthogroup,SpeciesName,GeneLoci
0,OG0000000,Acinetobacter baumannii ATCC 17978,A1S_0026|A1S_0144|A1S_0986|A1S_1059|A1S_1060|A...
1,OG0000000,Acinetobacter baylyi ADP1,ACIAD0034|ACIAD0175|ACIAD0969|ACIAD1058|ACIAD1...
2,OG0000000,Agrobacterium fabrum str. C58,Atu0159|Atu0174|Atu0190|Atu0197|Atu0308|Atu033...
3,OG0000000,Azospira oryzae PS,Dsui_0467|Dsui_0705|Dsui_1167|Dsui_2857|Dsui_3...
4,OG0000000,Azospirillum baldaniorum,AZOBR_RS02260|AZOBR_RS07285|AZOBR_RS12725|


In [182]:
# Expand the '|'-separated GeneLoci string of each row into one row per locus,
# carrying that row's Orthogroup and SpeciesName along.
# Empty fragments (such as the one produced by a trailing '|') are skipped, so
# a row without a trailing separator, or with several empty fragments, is
# handled as well.
locus_records = [
    (orthogroup, species_name, locus)
    for orthogroup, species_name, gene_loci in zip(
        ortholog['Orthogroup'], ortholog['SpeciesName'], ortholog['GeneLoci'])
    for locus in gene_loci.split('|')
    if locus
]

ortholog_df = pd.DataFrame(locus_records,
                           columns=['orthogroup', 'SpeciesName', 'locus'])

In [185]:
# ortholog_df holds one row per gene locus, so the row count is the gene count
ortholog_df.shape #82706 genes, matches the Statistics file

(82706, 3)

In [184]:
# 2 SpeciesNames are missing punctuation in the ortholog file; put it back
# (presumably so the names match those in `data` when merging on SpeciesName).
species_name_fixes = {
  'Desulfovibrio vulgaris str.  Miyazaki F ': "Desulfovibrio vulgaris str. 'Miyazaki F'",
  'Escherichia coli O25b H4-ST131': "Escherichia coli O25b:H4-ST131",
}
for stripped_name, punctuated_name in species_name_fixes.items():
  has_stripped_name = ortholog_df["SpeciesName"] == stripped_name
  ortholog_df.loc[has_stripped_name, "SpeciesName"] = punctuated_name

In [189]:
# map orthogroup to original dataframe
# The left merge keeps every row of `data`; genes with no match in
# ortholog_df get NaN in the `orthogroup` column.
# validate="many_to_one" makes pandas raise MergeError when a
# (SpeciesName, locus) pair occurs more than once in ortholog_df, which would
# otherwise silently duplicate rows of `data`.
df_ortholog = pd.merge(
    data,
    ortholog_df,
    how="left",
    on=["SpeciesName", "locus"],
    validate="many_to_one",
)

In [190]:
# A left merge keeps every row of `data`; more rows than data.shape[0] here
# would mean duplicated merge keys in ortholog_df.
df_ortholog.shape

(92624, 14)

In [193]:
# Count the genes that were assigned an orthogroup by the merge; this should
# equal the number of rows in ortholog_df.
int(df_ortholog['orthogroup'].notna().sum())

82706

In [195]:
# Save the merged table to the project folder. to_csv is called with its
# defaults, so the row index is written as the first, unnamed column.
filename = 'df_ortholog.csv'
output_path = os.path.join(base_dir, filename)
df_ortholog.to_csv(output_path)