In [4]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler

In [6]:
%matplotlib inline

In [7]:
#open file and get an array of each line
file = open(r'ccrescentus.fa','r')
data_gene = file.readlines()
file.close()

Data Preparation

In [8]:
#strip all white spaces and remove header and return a concatenated string
geneticcode = ''
for line in data_gene[1:]:
    geneticcode += line.strip()



In [9]:
#count the presence of each genome
aCount = geneticcode.count('a')
gCount = geneticcode.count('g')
cCount = geneticcode.count('c')
tCount = geneticcode.count('t')

snip = geneticcode[0:30]
print('Snip of code \n'+ snip)
print((snip.count('a')+snip.count('g')+snip.count('c')+snip.count('t')) == len(snip))

Snip of code 
gccgatagcctatgatccccggcaggcccg
True


Converting Text to a Numerical Table

In [10]:
#cut string into 300 characters each
size = 300
i = 0
cut_data = []
while i != len(geneticcode):
    if len(geneticcode)-i <size:
        cut_data.append(geneticcode[i:]) 
    cut_data.append(geneticcode[i:i+size])
    i+=size
cut_data

['gccgatagcctatgatccccggcaggcccggggcttggagccgtctggtttggatggaaccctccaaaccagatcaagaggctcctagaacgccgcccgcagggtcacgccccaggtgcgcgggtcgcccggctggccggcgatcaggccggtgttgctgggacccacggccagttgctcgaaatagttctcgtcgaaggcgttgcggacccaggcatagaggttcagcccctcaggcgtgcggaagccggcccggaagttagcgatcgtgtagccgtcaacccaggtgtagatcgaggg',
 'cgaagggttggacgagaacttgctgcggtagctgccgtcatagccgacatagaactcgcccgtcttgcccaggaagtcgcccggaacattggcctcggcccccagggccacgctccacttcgaaacgcccggcagccgctgacccgagacgtcgcagttggcggggctgagcgcgccggccacgccggccgcgcgcggaacctgggttccggtcgccaccgtgccgcccgacagttccggcgggcagggcgcgtcgacaaagcgcacatacttggcgtcggtataggcggcgttcagata',
 'ggtcgagaagcgggcgttcgggcgataggccgagtccagctccacgccttgggtgcgcaccttgccggcgttggccagatagccgcgcagcacgcccagctggccgttgctcaccgtggcctggtagttcttgatgtcgctgcggaacaccgcaaggttggccgtcagggtgcggtccagccactgggtcttcaggccgccctcgaagtgattgatgtcctcgggcttgatcgcgcctgcggcctcgatcggcttgcccgccgcatcggtcggcaggccgttctggttgatgccgccggt',
 'cttgaagctcttggcgtaggtggcgtaggccaggacgtcacggtccagctggtagcttgccgacaggtcgtaggtgaagttcc

In [11]:
#Create a combination of lists with words possible
from itertools import product

letters = ['a','g','c','t']
wordbank1 = []
wordbank2 = []
wordbank3 = []
wordbank4 = []
length4 = product(letters,repeat=4)
length3 = product(letters,repeat=3)
length2 = product(letters,repeat=2)
length1 = product(letters,repeat=1)

wordDict = {}
lettersSize = range(1,5)
for item in length1:
    s = ''
    for comboletters in item:
        s += comboletters
    wordbank1.append(s)

for item in length2:
    s = ''
    for comboletters in item:
        s += comboletters
    wordbank2.append(s)

for item in length3:
    s = ''
    for comboletters in item:
        s += comboletters
    wordbank3.append(s)

for item in length4:
    s = ''
    for comboletters in item:
        s += comboletters
    wordbank4.append(s)

wordDict = {1:wordbank1,2:wordbank2,3:wordbank3,4:wordbank4}
wordDict[1]
wordDict[2]



['aa',
 'ag',
 'ac',
 'at',
 'ga',
 'gg',
 'gc',
 'gt',
 'ca',
 'cg',
 'cc',
 'ct',
 'ta',
 'tg',
 'tc',
 'tt']

Count sequence of characters in Genetic Data

freq = {}

#create a data table for worddictionary values
for key in wordDict:
    gene_df = pd.DataFrame(columns=wordDict[key]) #look through value and add as column to the table

    for index,letters in enumerate(geneticcode):
        gene_df.loc[index] = np.zeros(len(wordDict[key]))
#iterate through each key then all the values
        while len(letters)> 0: #iterate through genetic string
            #check to see if dictionary value in genetic string
            
            x = geneticcode[0:key]
            gene_df.loc[index,x] +=1
                #count the value and update the table
            letters = letters[key:]



    freq[key] = gene_df
freq[2]



In [14]:
freqTables = {}
for i in range(1,5):
    # create an empty dataFrame with columns being the words on the dictionary
    df = pd.DataFrame(columns = wordDict[i])
    for index, dataP in enumerate(cut_data):
        # we create a row with zero values corresponding to a data point
        df.loc[index] = np.zeros(len(wordDict[i]))
        while len(dataP) > 0:
            # get the left part of the data point (i characters)
            left = dataP[0:i]
            # find it in the respective column and count it there
            df.loc[index, left] += 1
            dataP = dataP[i:]
    freqTables[i] = df

freqTables[2].head()

Unnamed: 0,aa,ag,ac,at,ga,gg,gc,gt,ca,cg,cc,ct,ta,tg,tc,tt
0,6.0,17.0,6.0,6.0,4.0,20.0,16.0,11.0,6.0,10.0,21.0,3.0,3.0,6.0,10.0,5.0
1,7.0,8.0,6.0,2.0,5.0,17.0,22.0,8.0,12.0,19.0,17.0,7.0,4.0,6.0,4.0,6.0
2,0.0,11.0,5.0,5.0,8.0,14.0,21.0,7.0,6.0,19.0,14.0,5.0,1.0,12.0,13.0,9.0
3,4.0,10.0,7.0,0.0,6.0,17.0,20.0,11.0,9.0,18.0,9.0,9.0,6.0,13.0,6.0,5.0
4,3.0,14.0,5.0,4.0,12.0,14.0,16.0,8.0,7.0,12.0,10.0,11.0,0.0,15.0,11.0,8.0


In [None]:
0,1,2,3,4,5,6,7
1,2,3,4,5,6,7
