# Predicting Viral Host from Codon Usage Bias

### Kathleen Hablutzel
### BIO 334/335 Bioinformatics Final Project
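Run this notebook to split the cleaned codon usage data (`data/working_clean.csv`) into two datasets: one computed from full genomes (`datasets/cub_full_genome.csv`) and one computed from biased regions only (`datasets/cub_biased_genome.csv`).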

In [1]:
import pandas as pd

import fasta_parser as fp

In [2]:
# Project helper from fasta_parser; presumably (re)writes data/working_clean.csv,
# which the next cell loads -- TODO confirm against fasta_parser.
fp.clean_working_csv()

In [3]:
# Load the cleaned working table, discarding exact duplicate rows on the way in.
bias_pd = pd.read_csv("data/working_clean.csv").drop_duplicates()
bias_pd.shape

(22082, 73)

In [4]:
# Preview the first rows of the de-duplicated table.
bias_pd.head()

Unnamed: 0,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,GGG,BiasedRegionsOnly,AccessionNum,SeqLen,BiasedSeqLen,PropBiasedRegions,Nc,BiasedNc,Species,Taxon
0,0.457776,0.542224,0.073146,0.136493,0.13813,0.190055,0.078648,0.383528,0.354512,0.476869,...,0.236004,full,GCF_903992535.2_mArvAmp1.2,,77272509,,,54.457136,Arvicola_amphibius,vertebrate_mammalian
1,0.357377,0.642623,0.035458,0.104243,0.099131,0.213422,0.065184,0.482563,0.27733,0.614248,...,0.2514,biased,GCF_903992535.2_mArvAmp1.2,77272509.0,26243760,0.3396,54.457136,49.593948,Arvicola_amphibius,vertebrate_mammalian
2,0.474425,0.525575,0.085792,0.133677,0.136818,0.189876,0.069925,0.383912,0.363281,0.459382,...,0.251075,full,GCF_000493695.1_BalAcu1.0,,69532407,,,54.651624,Balaenoptera_acutorostrata,vertebrate_mammalian
3,0.365337,0.634663,0.050157,0.098445,0.096961,0.220268,0.049994,0.484176,0.278849,0.594489,...,0.282885,biased,GCF_000493695.1_BalAcu1.0,69532407.0,30961362,0.4453,54.651624,49.07754,Balaenoptera_acutorostrata,vertebrate_mammalian
4,0.475298,0.524702,0.088171,0.135714,0.13921,0.191816,0.068511,0.376578,0.366103,0.452923,...,0.24799,full,GCF_000754665.1_Bison_UMD1.0,,62252778,,,54.839233,Bison_bison,vertebrate_mammalian


In [5]:
# Full-genome subset. In the previewed "full" rows SeqLen/Nc are empty and the
# whole-genome values sit in BiasedSeqLen/BiasedNc, so the empty columns are
# dropped and the populated ones take over the plain names.
bias_pd_full = (
    bias_pd.loc[lambda df: df["BiasedRegionsOnly"].eq("full") & df["BiasedSeqLen"].gt(0)]
    .drop(columns=["BiasedRegionsOnly", "SeqLen", "PropBiasedRegions", "Nc"])
    .rename(columns={"BiasedSeqLen": "SeqLen", "BiasedNc": "Nc"})
)
bias_pd_full.shape

(10916, 69)

In [6]:
# Preview the full-genome subset; rows keep their original labels from bias_pd.
bias_pd_full.head()

Unnamed: 0,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,AGG,GGU,GGC,GGA,GGG,AccessionNum,SeqLen,Nc,Species,Taxon
0,0.457776,0.542224,0.073146,0.136493,0.13813,0.190055,0.078648,0.383528,0.354512,0.476869,...,0.219182,0.180696,0.317858,0.265441,0.236004,GCF_903992535.2_mArvAmp1.2,77272509,54.457136,Arvicola_amphibius,vertebrate_mammalian
2,0.474425,0.525575,0.085792,0.133677,0.136818,0.189876,0.069925,0.383912,0.363281,0.459382,...,0.219056,0.164273,0.325674,0.258978,0.251075,GCF_000493695.1_BalAcu1.0,69532407,54.651624,Balaenoptera_acutorostrata,vertebrate_mammalian
4,0.475298,0.524702,0.088171,0.135714,0.13921,0.191816,0.068511,0.376578,0.366103,0.452923,...,0.218796,0.168386,0.318368,0.265256,0.24799,GCF_000754665.1_Bison_UMD1.0,62252778,54.839233,Bison_bison,vertebrate_mammalian
6,0.453638,0.546362,0.078937,0.127617,0.129814,0.198149,0.064855,0.400627,0.351219,0.479417,...,0.211618,0.159878,0.338769,0.247034,0.254319,GCF_000247795.1_Bos_indicus_1.0,64918074,53.953278,Bos_indicus,vertebrate_mammalian
8,0.449071,0.550929,0.077686,0.128257,0.129594,0.197832,0.066708,0.399924,0.351122,0.482238,...,0.205152,0.157745,0.345172,0.241409,0.255674,GCF_002288905.1_ASM228890v2,67099479,53.983387,Enhydra_lutris,vertebrate_mammalian


In [7]:
# index=False leaves the row labels out of the file, so a reload gets a fresh 0..n-1 index.
bias_pd_full.to_csv("datasets/cub_full_genome.csv", index=False)

In [8]:
# Round-trip check: reload the full-genome file just written and preview it.
test = pd.read_csv("datasets/cub_full_genome.csv")
test.head()

Unnamed: 0,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,AGG,GGU,GGC,GGA,GGG,AccessionNum,SeqLen,Nc,Species,Taxon
0,0.457776,0.542224,0.073146,0.136493,0.13813,0.190055,0.078648,0.383528,0.354512,0.476869,...,0.219182,0.180696,0.317858,0.265441,0.236004,GCF_903992535.2_mArvAmp1.2,77272509,54.457136,Arvicola_amphibius,vertebrate_mammalian
1,0.474425,0.525575,0.085792,0.133677,0.136818,0.189876,0.069925,0.383912,0.363281,0.459382,...,0.219056,0.164273,0.325674,0.258978,0.251075,GCF_000493695.1_BalAcu1.0,69532407,54.651624,Balaenoptera_acutorostrata,vertebrate_mammalian
2,0.475298,0.524702,0.088171,0.135714,0.13921,0.191816,0.068511,0.376578,0.366103,0.452923,...,0.218796,0.168386,0.318368,0.265256,0.24799,GCF_000754665.1_Bison_UMD1.0,62252778,54.839233,Bison_bison,vertebrate_mammalian
3,0.453638,0.546362,0.078937,0.127617,0.129814,0.198149,0.064855,0.400627,0.351219,0.479417,...,0.211618,0.159878,0.338769,0.247034,0.254319,GCF_000247795.1_Bos_indicus_1.0,64918074,53.953278,Bos_indicus,vertebrate_mammalian
4,0.449071,0.550929,0.077686,0.128257,0.129594,0.197832,0.066708,0.399924,0.351122,0.482238,...,0.205152,0.157745,0.345172,0.241409,0.255674,GCF_002288905.1_ASM228890v2,67099479,53.983387,Enhydra_lutris,vertebrate_mammalian


In [9]:
# Shape of the reloaded file; should equal bias_pd_full.shape.
test.shape

(10916, 69)

In [10]:
# Biased-regions subset: keep rows flagged "biased" with a positive biased
# sequence length, then drop the flag column, which is constant after filtering.
bias_pd_biased = (
    bias_pd.loc[lambda df: df["BiasedRegionsOnly"].eq("biased") & df["BiasedSeqLen"].gt(0)]
    .drop(columns=["BiasedRegionsOnly"])
)
bias_pd_biased.shape

(8308, 72)

In [11]:
# Preview the biased-regions subset; rows keep their original labels from bias_pd.
bias_pd_biased.head()

Unnamed: 0,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,GGA,GGG,AccessionNum,SeqLen,BiasedSeqLen,PropBiasedRegions,Nc,BiasedNc,Species,Taxon
1,0.357377,0.642623,0.035458,0.104243,0.099131,0.213422,0.065184,0.482563,0.27733,0.614248,...,0.200728,0.2514,GCF_903992535.2_mArvAmp1.2,77272509.0,26243760,0.3396,54.457136,49.593948,Arvicola_amphibius,vertebrate_mammalian
3,0.365337,0.634663,0.050157,0.098445,0.096961,0.220268,0.049994,0.484176,0.278849,0.594489,...,0.188971,0.282885,GCF_000493695.1_BalAcu1.0,69532407.0,30961362,0.4453,54.651624,49.07754,Balaenoptera_acutorostrata,vertebrate_mammalian
5,0.371288,0.628712,0.05407,0.102735,0.100069,0.220455,0.050968,0.471703,0.288741,0.580296,...,0.196635,0.278049,GCF_000754665.1_Bison_UMD1.0,62252778.0,26328582,0.4229,54.839233,49.899251,Bison_bison,vertebrate_mammalian
7,0.348407,0.651593,0.044989,0.095981,0.091464,0.225333,0.047891,0.494342,0.273083,0.609025,...,0.182954,0.28246,GCF_000247795.1_Bos_indicus_1.0,64918074.0,31421865,0.484,53.953278,48.318396,Bos_indicus,vertebrate_mammalian
9,0.339304,0.660696,0.041498,0.094636,0.088775,0.225838,0.048453,0.500799,0.266361,0.621156,...,0.172288,0.287098,GCF_002288905.1_ASM228890v2,67099479.0,32735730,0.4879,53.983387,47.681397,Enhydra_lutris,vertebrate_mammalian


In [12]:
# index=False leaves the row labels out of the file, so a reload gets a fresh 0..n-1 index.
bias_pd_biased.to_csv("datasets/cub_biased_genome.csv", index=False)

In [13]:
# Round-trip check: reload the biased-regions file just written and preview it.
test2 = pd.read_csv("datasets/cub_biased_genome.csv")
test2.head()

Unnamed: 0,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,GGA,GGG,AccessionNum,SeqLen,BiasedSeqLen,PropBiasedRegions,Nc,BiasedNc,Species,Taxon
0,0.357377,0.642623,0.035458,0.104243,0.099131,0.213422,0.065184,0.482563,0.27733,0.614248,...,0.200728,0.2514,GCF_903992535.2_mArvAmp1.2,77272509.0,26243760,0.3396,54.457136,49.593948,Arvicola_amphibius,vertebrate_mammalian
1,0.365337,0.634663,0.050157,0.098445,0.096961,0.220268,0.049994,0.484176,0.278849,0.594489,...,0.188971,0.282885,GCF_000493695.1_BalAcu1.0,69532407.0,30961362,0.4453,54.651624,49.07754,Balaenoptera_acutorostrata,vertebrate_mammalian
2,0.371288,0.628712,0.05407,0.102735,0.100069,0.220455,0.050968,0.471703,0.288741,0.580296,...,0.196635,0.278049,GCF_000754665.1_Bison_UMD1.0,62252778.0,26328582,0.4229,54.839233,49.899251,Bison_bison,vertebrate_mammalian
3,0.348407,0.651593,0.044989,0.095981,0.091464,0.225333,0.047891,0.494342,0.273083,0.609025,...,0.182954,0.28246,GCF_000247795.1_Bos_indicus_1.0,64918074.0,31421865,0.484,53.953278,48.318396,Bos_indicus,vertebrate_mammalian
4,0.339304,0.660696,0.041498,0.094636,0.088775,0.225838,0.048453,0.500799,0.266361,0.621156,...,0.172288,0.287098,GCF_002288905.1_ASM228890v2,67099479.0,32735730,0.4879,53.983387,47.681397,Enhydra_lutris,vertebrate_mammalian


In [14]:
# Shape of the reloaded file; should equal bias_pd_biased.shape.
test2.shape

(8308, 72)