In [1]:
## Features
# GC Content
# CpG Islands
# CpG Islands per ORF Length
# Transcript Length
# Accumulated Nucleotide Frequency A
# Accumulated Nucleotide Frequency G
# Accumulated Nucleotide Frequency T
# Accumulated Nucleotide Frequency C
# Relative Codon Bias
# Stop Codon Frequency
# Isoelectric Point
# Molecular Weight
# Gravy
# Instability Index
# Stop Frame Score
# Stop Codon Frequency
# GC per ORF Length
# ORF Length
# Fickett Score

In [2]:
import re
import pandas as pd
import numpy as np
from Bio.SeqUtils import ProtParam
from Bio.Seq import Seq
from Bio import SeqIO

In [3]:
def load_file(filename):
    seq_tuple = []
    count = 0
    with open(filename) as f:
        header = ""
        sequence = ""
        for line in f:
            line = line.strip()  #remove white spaces
            
            if line.startswith('>'):
                if sequence != '':
                    #to filter sequences by their length
#                     seq_len = len(sequence)
#                     if seq_len >= 200 and seq_len <= 3000:
                        seq_tuple.append((header, sequence))
                header = line[1:]
                sequence = ''
                count += 1
                
            else:
                sequence += line
                
        # Check the last sequence
        seq_len = len(sequence)
        if seq_len >= 300 and seq_len <= 1000:
            seq_tuple.append((header, sequence))
        return seq_tuple, count

In [4]:
coding = "coding.fa"
non_coding = "noncoding.fa"

In [5]:
coding_list, coding_count = load_file(coding)  # function call on coding dataset

In [6]:
 noncoding_list, noncoding_count = load_file(non_coding)   # function call on noncoding dataset

In [7]:
len(noncoding_list)

173111

In [8]:
len(coding_list)

111275

In [9]:
coding_count

111276

In [10]:
noncoding_count

173112

## Fickett Score

In [11]:
def look_up_position_prob(value, base, position_para, position_prob, position_weight):

	"""look up positional probability by base and value"""

	if float(value) < 0:
		return None
	for idx, val in enumerate(position_para):
		if float(value) >= val:
			return float(position_prob[base][idx]) * float(position_weight[base])


def look_up_content_prob(value, base, content_para, content_prob, content_weight):

	"""look up content probability by base and value"""

	if float(value) < 0:
		return None
	for idx, val in enumerate(content_para):
		if float(value) >= val:
			return float(content_prob[base][idx]) * float(content_weight[base])
        
def fickett_value_full_sequence(seq, type_seq):

	"""calculate Fickett from full sequence - CPC2"""

	position_para = [1.9, 1.8, 1.7, 1.6, 1.5, 1.4, 1.3, 1.2, 1.1, 0.0]
	content_para = [0.33, 0.31, 0.29, 0.27, 0.25, 0.23, 0.21, 0.19, 0.17, 0]

	position_prob = {
		'A': [0.51, 0.55, 0.57, 0.52, 0.48, 0.58, 0.57, 0.54, 0.50, 0.36],
		'C': [0.29, 0.44, 0.55, 0.49, 0.52, 0.60, 0.60, 0.56, 0.51, 0.38],
		'G': [0.62, 0.67, 0.74, 0.65, 0.61, 0.62, 0.52, 0.41, 0.31, 0.17],
		'T': [0.51, 0.60, 0.69, 0.64, 0.62, 0.67, 0.58, 0.48, 0.39, 0.24]}
	
	position_weight = {'A': 0.062, 'C': 0.093, 'G': 0.205, 'T': 0.154}
	content_weight = {'A': 0.084, 'C': 0.076, 'G': 0.081, 'T': 0.055}

	content_prob = {
		'A': [0.40, 0.55, 0.58, 0.58, 0.52, 0.48, 0.45, 0.45, 0.38, 0.19],
		'C': [0.50, 0.63, 0.59, 0.50, 0.46, 0.45, 0.47, 0.56, 0.59, 0.33],
		'G': [0.21, 0.40, 0.47, 0.50, 0.52, 0.56, 0.57, 0.52, 0.44, 0.23],
		'T': [0.30, 0.49, 0.56, 0.53, 0.48, 0.48, 0.52, 0.57, 0.60, 0.51]}

	if len(seq) < 2:
		return 0

	fickett_score = 0
	seq = seq.upper()
	total_base = len(seq)

	phase_0 = seq[::3]
	phase_1 = seq[1::3]
	phase_2 = seq[2::3]

	phase_0_A = phase_0.count('A')
	phase_1_A = phase_1.count('A')
	phase_2_A = phase_2.count('A')
	phase_0_C = phase_0.count('C')
	phase_1_C = phase_1.count('C')
	phase_2_C = phase_2.count('C')
	phase_0_G = phase_0.count('G')
	phase_1_G = phase_1.count('G')
	phase_2_G = phase_2.count('G')
	if type_seq == 1:
		phase_0_T = phase_0.count('T')
		phase_1_T = phase_1.count('T')
		phase_2_T = phase_2.count('T')
	else:
		phase_0_T = phase_0.count('U')
		phase_1_T = phase_1.count('U')
		phase_2_T = phase_2.count('U')

	A_content = float(phase_0_A + phase_1_A + phase_2_A) / total_base
	C_content = float(phase_0_C + phase_1_C + phase_2_C) / total_base
	G_content = float(phase_0_G + phase_1_G + phase_2_G) / total_base
	T_content = float(phase_0_T + phase_1_T + phase_2_T) / total_base
	A_position = max([phase_0_A, phase_1_A, phase_2_A]) / (min([phase_0_A, phase_1_A, phase_2_A]) + 1.0)
	C_position = max([phase_0_C, phase_1_C, phase_2_C]) / (min([phase_0_C, phase_1_C, phase_2_C]) + 1.0)
	G_position = max([phase_0_G, phase_1_G, phase_2_G]) / (min([phase_0_G, phase_1_G, phase_2_G]) + 1.0)
	T_position = max([phase_0_T, phase_1_T, phase_2_T]) / (min([phase_0_T, phase_1_T, phase_2_T]) + 1.0)

	fickett_score += look_up_content_prob(A_content, 'A', content_para, content_prob, content_weight)
	fickett_score += look_up_content_prob(C_content, 'C', content_para, content_prob, content_weight)
	fickett_score += look_up_content_prob(G_content, 'G', content_para, content_prob, content_weight)
	fickett_score += look_up_content_prob(T_content, 'T', content_para, content_prob, content_weight)

	fickett_score += look_up_position_prob(A_position, 'A', position_para, position_prob, position_weight)
	fickett_score += look_up_position_prob(C_position, 'C', position_para, position_prob, position_weight)
	fickett_score += look_up_position_prob(G_position, 'G', position_para, position_prob, position_weight)
	fickett_score += look_up_position_prob(T_position, 'T', position_para, position_prob, position_weight)

	return fickett_score


## Accumulated Nucleotide Frequency for A

In [12]:
import numpy as np

def accumulated_nucleotide_frequency(sequence):
    sequence = sequence.upper()
    mapping = {'A': [], 'C': [], 'T': [], 'G': []}
    counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0}

    for i in range(len(sequence)):
        nucleotide = sequence[i]
        counts[nucleotide] += 1
        for nt, count in counts.items():
            mapping[nt].append(count / (i + 1))

    return mapping

def accumulated_frequency_A(sequence):
    return accumulated_nucleotide_frequency(sequence)['A'][-1]

# def accumulated_frequency_C(sequence):
#     return accumulated_nucleotide_frequency(sequence)['C']

# def accumulated_frequency_T(sequence):
#     return accumulated_nucleotide_frequency(sequence)['T']

# def accumulated_frequency_G(sequence):
#     return accumulated_nucleotide_frequency(sequence)['G']


## Accumulated Nucleotide Frequency for G

In [13]:
import numpy as np

def accumulated_nucleotide_frequency(sequence):
    sequence = sequence.upper()
    mapping = {'A': [], 'C': [], 'T': [], 'G': []}
    counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0}

    for i in range(len(sequence)):
        nucleotide = sequence[i]
        counts[nucleotide] += 1
        for nt, count in counts.items():
            mapping[nt].append(count / (i + 1))

    return mapping

def accumulated_frequency_G(sequence):
    return accumulated_nucleotide_frequency(sequence)['G'][-1]

# def accumulated_frequency_C(sequence):
#     return accumulated_nucleotide_frequency(sequence)['C']

# def accumulated_frequency_T(sequence):
#     return accumulated_nucleotide_frequency(sequence)['T']

# def accumulated_frequency_G(sequence):
#     return accumulated_nucleotide_frequency(sequence)['G']


## Accumulated Nucleotide Frequency for T

In [14]:
import numpy as np

def accumulated_nucleotide_frequency(sequence):
    sequence = sequence.upper()
    mapping = {'A': [], 'C': [], 'T': [], 'G': []}
    counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0}

    for i in range(len(sequence)):
        nucleotide = sequence[i]
        counts[nucleotide] += 1
        for nt, count in counts.items():
            mapping[nt].append(count / (i + 1))

    return mapping

def accumulated_frequency_T(sequence):
    return accumulated_nucleotide_frequency(sequence)['T'][-1]

# def accumulated_frequency_C(sequence):
#     return accumulated_nucleotide_frequency(sequence)['C']

# def accumulated_frequency_T(sequence):
#     return accumulated_nucleotide_frequency(sequence)['T']

# def accumulated_frequency_G(sequence):
#     return accumulated_nucleotide_frequency(sequence)['G']


## Accumulated Nucleotide Frequency for C

In [15]:
import numpy as np

def accumulated_nucleotide_frequency(sequence):
    sequence = sequence.upper()
    mapping = {'A': [], 'C': [], 'T': [], 'G': []}
    counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0}

    for i in range(len(sequence)):
        nucleotide = sequence[i]
        counts[nucleotide] += 1
        for nt, count in counts.items():
            mapping[nt].append(count / (i + 1))

    return mapping

def accumulated_frequency_C(sequence):
    return accumulated_nucleotide_frequency(sequence)['C'][-1]

# def accumulated_frequency_C(sequence):
#     return accumulated_nucleotide_frequency(sequence)['C']

# def accumulated_frequency_T(sequence):
#     return accumulated_nucleotide_frequency(sequence)['T']

# def accumulated_frequency_G(sequence):
#     return accumulated_nucleotide_frequency(sequence)['G']


## GC Content

In [16]:
def gcContent(sequence):
    '''
    :param sequence: primary sequence of a transcript
    :return: the GC content of the sequence
    '''
    Gcontent = 0
    Ccontent = 0
    for base in sequence:
        if base == 'G':
            Gcontent += 1
        if base == 'C':
            Ccontent += 1
    GCcontent = (Gcontent+Ccontent)/len(sequence)
    return GCcontent

## Relative Codon Bias

In [17]:
    
def RCB_score(seq):
    condon_dict = {}         # a dictionary contain the counts of each condon
    condon_list = []         # a list contain the condons in a sequence
    Frequency_1 = {'A': 0,
                'T': 0,
                'G' : 0,
                'C' : 0}
    Frequency_2 = {'A': 0,
                'T': 0,
                'G' : 0,
                'C' : 0}
    Frequency_3 = {'A': 0,
                'T': 0,
                'G' : 0,
                'C' : 0}

    # First position
    for i in range(0,len(seq),3):
        if seq[i] == 'A':
            Frequency_1['A'] += 1
        elif seq[i] == 'T':
            Frequency_1['T'] += 1
        elif seq[i] == 'G':
            Frequency_1['G'] += 1
        elif seq[i] == 'C':
            Frequency_1['C'] += 1

        if len(seq[i:i+3]) != 3:
            continue
        elif seq[i:i+3] not in condon_dict:
            condon_dict[seq[i:i+3]] = 1
        else:
            condon_dict[seq[i:i+3]] += 1

        if len(seq[i:i+3]) == 3:
            condon_list.append(seq[i:i+3])

    # Second position
    for i in range(1,len(seq),3):
        if seq[i] == 'A':
            Frequency_2['A'] += 1
        elif seq[i] == 'T':
            Frequency_2['T'] += 1
        elif seq[i] == 'G':
            Frequency_2['G'] += 1
        elif seq[i] == 'C':
            Frequency_2['C'] += 1

    # Third position
    for i in range(2,len(seq),3):
        if seq[i] == 'A':
            Frequency_3['A'] += 1
        elif seq[i] == 'T':
            Frequency_3['T'] += 1
        elif seq[i] == 'G':
            Frequency_3['G'] += 1
        elif seq[i] == 'C':
            Frequency_3['C'] += 1

    total_condons = len(condon_list)
    condon_sum = 0
    for k in condon_list:
        d = np.log(np.absolute((condon_dict[k]/total_condons) - (Frequency_1[k[0]]/total_condons) *
             (Frequency_2[k[1]]/total_condons) * (Frequency_3[k[2]]/total_condons)) / ((Frequency_1[k[0]]/total_condons) *
             (Frequency_2[k[1]]/total_condons) * (Frequency_3[k[2]]/total_condons)) + 1)
        condon_sum = condon_sum + d

    RCB = np.exp(condon_sum/len(condon_list))-1
    return RCB



## Translation function

In [18]:
def translation(sequence):
    '''
    :param sequence: input primary sequence of a transcript
    :return: protein sequence
    '''
    trans_dic = {'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'CTT': 'L',
                 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'ATT': 'I', 'ATC': 'I',
                 'ATA': 'I', 'ATG': 'M', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
                 'GTG': 'V', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
                 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'ACT': 'T',
                 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'GCT': 'A', 'GCC': 'A',
                 'GCA': 'A', 'GCG': 'A', 'TAT': 'Y', 'TAC': 'Y', 'TAA': '*',
                 'TAG': '*', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
                 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'GAT': 'D',
                 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'TGT': 'C', 'TGC': 'C',
                 'TGA': '*', 'TGG': 'W', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
                 'CGG': 'R', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
                 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}
    protein = ''
    for i in range(0, len(sequence)-2, 3):
        protein += trans_dic[sequence[i:i+3]]
    return protein

## ORF Length

In [19]:
def ORFfinder_T0(sequence):
    # 3 frame translation
    trans1 = translation(sequence)
    trans2 = translation(sequence[1:])
    trans3 = translation(sequence[2:])

    orf1 = re.finditer(r'M.*?\*', trans1)
    orf2 = re.finditer(r'M.*?\*', trans2)
    orf3 = re.finditer(r'M.*?\*', trans3)
    orf1_seqs = [(m.start(), m.end(), m.group(), 'frame1') for m in orf1]
    orf2_seqs = [(m.start(), m.end(), m.group(), 'frame2') for m in orf2]
    orf3_seqs = [(m.start(), m.end(), m.group(), 'frame3') for m in orf3]

    orfs = orf1_seqs + orf2_seqs + orf3_seqs
    if len(orfs) == 0:
#         return ('NaN', 'NaN'), 'NaN', 0
        return 0
    else:
        orf_sorted = sorted(orfs, key=lambda t: len(t[2]), reverse=True)
        longest_orf_protein_seq = orf_sorted[0][2]
        longest_orf_frame = orf_sorted[0][3]
        longest_orf_pro_position = (int(orf_sorted[0][0]), int(orf_sorted[0][1]))
        if longest_orf_frame == 'frame1':
            longest_orf_rna_seq_position = (longest_orf_pro_position[0] * 3, longest_orf_pro_position[1] * 3)
            longest_orf_rna_seq = sequence[longest_orf_rna_seq_position[0]:longest_orf_rna_seq_position[1]]
            longest_orf_rna_length = longest_orf_rna_seq_position[1] - longest_orf_rna_seq_position[0]
            return longest_orf_rna_length
#             return longest_orf_rna_seq_position, longest_orf_rna_seq, longest_orf_rna_length
        elif longest_orf_frame == 'frame2':
            longest_orf_rna_seq_position = (longest_orf_pro_position[0] * 3, longest_orf_pro_position[1] * 3)
            longest_orf_rna_seq = sequence[1:][longest_orf_rna_seq_position[0]:longest_orf_rna_seq_position[1]]
            longest_orf_rna_seq_original_position = (
            longest_orf_pro_position[0] * 3 + 1, longest_orf_pro_position[1] * 3 + 1)
            longest_orf_rna_length = longest_orf_rna_seq_original_position[1] - longest_orf_rna_seq_original_position[0]
            return longest_orf_rna_length
#             return longest_orf_rna_seq_original_position, longest_orf_rna_seq, longest_orf_rna_length
        elif longest_orf_frame == 'frame3':
            longest_orf_rna_seq_position = (longest_orf_pro_position[0] * 3, longest_orf_pro_position[1] * 3)
            longest_orf_rna_seq = sequence[2:][longest_orf_rna_seq_position[0]:longest_orf_rna_seq_position[1]]
            longest_orf_rna_seq_original_position = (
            longest_orf_pro_position[0] * 3 + 2, longest_orf_pro_position[1] * 3 + 2)
            longest_orf_rna_length = longest_orf_rna_seq_original_position[1] - longest_orf_rna_seq_original_position[0]
            return longest_orf_rna_length
#             return longest_orf_rna_seq_original_position, longest_orf_rna_seq, longest_orf_rna_length



## Isoelectric Point

In [20]:
def cal_protein_features_pi(seq):
    '''
    :param seq: input primary sequence of a transcript
    :return: pi: (protein isoelectric),
             mw: (molecular weight)
             # gravy: (grand average of hydropathy)
             aromaticity: (relative frequency of Phe+Trp+Tyr)
             instability: (instability index --
                          Any value above 40 means the protein is unstable (=has a short half life)).
    '''
    protein_seq = translation(seq).strip('*')
    protein_object = ProtParam.ProteinAnalysis(protein_seq)
    pi = protein_object.isoelectric_point()
    return pi

## Aromaticity

In [21]:
def cal_protein_features_aromaticity(seq):
    '''
    :param seq: input primary sequence of a transcript
    :return: pi: (protein isoelectric),
             mw: (molecular weight)
             # gravy: (grand average of hydropathy)
             aromaticity: (relative frequency of Phe+Trp+Tyr)
             instability: (instability index --
                          Any value above 40 means the protein is unstable (=has a short half life)).
    '''

    protein_seq = translation(seq).strip('*')
    protein_object = ProtParam.ProteinAnalysis(protein_seq)
    aromaticity = protein_object.aromaticity()
    return aromaticity

## Transcript Length

In [22]:
def transcript_length(seq):
    return len(seq)

## CpG Islands

In [23]:
def calculate_cpg_islands(seq):
    cpg_islands = []
    cpg_count = 0
    in_island = False
    
    for i in range(len(seq)-1):
        if seq[i:i+2].upper() == 'CG':
            cpg_count += 1
            if not in_island:
                in_island = True
        else:
            if cpg_count > 0:
                cpg_islands.append(cpg_count)
                cpg_count = 0
                in_island = False
    
    if cpg_count > 0:
        cpg_islands.append(cpg_count)
    
    return cpg_islands

## Stop Codon Frequency

In [24]:
def get_stop_codon_num(seq):
    translate_prot = Seq(seq).translate()
    stop_num = translate_prot.count("*")
    return stop_num

def get_stop_codon_frequency(seq):
    stop_num = get_stop_codon_num(seq)
    transript_length = np.log(len(seq)+1)
    stop_freq = float(stop_num) / transript_length
    return stop_freq
    

## Molecular Weight

In [25]:
def get_Mw(seq):
    seqprot = Seq(seq).translate()
    strinfoAmbiguous = re.compile("X|B|Z|J|U", re.I)
    newseqprot = strinfoAmbiguous.sub("", str(seqprot))
    protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot).replace("*",""))
    mw = protparam_obj.molecular_weight()
    return mw

## Gravy

In [26]:
def get_gravy(seq):
    seqprot = Seq(seq).translate()
    strinfoAmbiguous = re.compile("X|B|Z|J|U", re.I)
    newseqprot = strinfoAmbiguous.sub("", str(seqprot))
    protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot).replace("*", ""))
    Gravy = protparam_obj.gravy()
    return Gravy

## Insatbility Index

In [27]:
def get_instablility_index(seq):
    seqprot = Seq(seq).translate()
    strinfoAmbiguous = re.compile("X|B|Z|J|U", re.I)
    newseqprot = strinfoAmbiguous.sub("", str(seqprot))
    protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot).replace("*", ""))
    instablility_index = protparam_obj.instability_index()
    return instablility_index

## Stop Frame Score 

In [28]:
def get_stop_frame_score(seq):
    stop_num_in_frame1 = get_stop_codon_num(seq)
    stop_num_in_frame2 = get_stop_codon_num(seq[1:])
    stop_num_in_frame3 = get_stop_codon_num(seq[2:])
    stop_num_all = [stop_num_in_frame1, stop_num_in_frame2, stop_num_in_frame3]
    stop_num_frame = ((stop_num_all[0] - stop_num_all[1]) ** 2 + (stop_num_all[0] - stop_num_all[2]) ** 2 + (
            stop_num_all[1] - stop_num_all[2]) ** 2) / 2
    return stop_num_frame

In [1]:
####################################################################################################################

## Generating the Feature Arrays

In [30]:
def generate_feature(list_name):
    
    
    for header, sequence in list_name:        
        #calculate fickett score
        type_seq = 1
        fickett_score = fickett_value_full_sequence(sequence, type_seq)
        
        
        #Nucleotide Frequencies
        nucleiA = accumulated_frequency_A(sequence)
        nucleiG = accumulated_frequency_G(sequence)
        nucleiT = accumulated_frequency_T(sequence)
        nucleiC = accumulated_frequency_C(sequence)
        
        #calculate GC content
        gc = gcContent(sequence)
        
        # Calculate the number of CpG islands for this sequence
        cpg_islands = calculate_cpg_islands(sequence)
        num_islands = len(cpg_islands)
    
        #calculate transcript length
        length = transcript_length(sequence)
        
        #calculate ORF length
        orf_length = ORFfinder_T0(sequence)
        
        #stop codon freqeuncy
        scf = get_stop_codon_frequency(sequence)
        
        #molecular weight
        mw = get_Mw(sequence)
        
        #gravy
        gv = get_gravy(sequence)
        
        #instability index
        ii = get_instablility_index(sequence)
        
        # stop frame score
        sframesc = get_stop_frame_score(sequence)
        
        #calculate Relative Codon Bias , isoelectric point, aromaticity
        if orf_length != 0:
            rcb = RCB_score(sequence)
            pi = cal_protein_features_pi(sequence)
            arom = cal_protein_features_aromaticity(sequence)
        else:
            rcb = 0
            pi = 0
            arom = 0
        
        fickett_array.append(fickett_score)
        nucleiA_array.append(nucleiA)
        nucleiG_array.append(nucleiG)
        nucleiC_array.append(nucleiC)
        nucleiT_array.append(nucleiT)
        gc_array.append(gc)
        cpg_array.append(num_islands)
        transcript_array.append(length)
        orf_array.append(orf_length)
        rcb_array.append(rcb)
        ip_array.append(pi)
        scf_array.append(scf)
        mw_array.append(mw)
        gv_array.append(gv)
        ii_array.append(ii)
        sframesc_array.append(sframesc)
        aromaticity_array.append(arom)   
        
    

## Generating Coding Dataset

In [31]:
fickett_array = []
nucleiA_array = []
nucleiG_array = []
nucleiT_array = []
nucleiC_array = []
gc_array = []
orf_array = []
rcb_array=[]
ip_array = []
aromaticity_array = []
transcript_array = []
cpg_array = []
scf_array = []
mw_array = []
gv_array = []
ii_array = []
sframesc_array = []
generate_feature(coding_list) # generate feature set for coding dataset



In [32]:
names_dict = {'Fickett Score':fickett_array, 'NucleiA':nucleiA_array, 'NucleiG':nucleiG_array, 'NucleiT':nucleiT_array, 'NucleiC':nucleiC_array, 'GC content':gc_array, 'ORF Length':orf_array, 'Relative Codon Bias':rcb_array, 'Isoelectric Potential':ip_array, 'Aromaticity':aromaticity_array, 'Transcript Length':transcript_array, 'CpG Islands':cpg_array, 'SCF':scf_array, 'MW': mw_array, 'Gravy': gv_array, 'Instability Index': ii_array, 'Stop Frame': sframesc_array}
features_df = pd.DataFrame(names_dict)

In [33]:
features_df.head()

Unnamed: 0,Fickett Score,NucleiA,NucleiG,NucleiT,NucleiC,GC content,ORF Length,Relative Codon Bias,Isoelectric Potential,Aromaticity,Transcript Length,CpG Islands,SCF,MW,Gravy,Instability Index,Stop Frame
0,0.37617,0.321238,0.175707,0.319328,0.183728,0.359435,981,0.284229,9.588915,0.112385,2618,18,6.225742,95048.7649,0.173633,39.158603,112.0
1,0.4672,0.209798,0.202343,0.330138,0.257721,0.460064,939,0.46047,9.04261,0.137821,939,8,0.146073,35073.4005,0.755449,37.706122,172.0
2,0.4672,0.209798,0.202343,0.330138,0.257721,0.460064,939,0.46047,9.04261,0.137821,939,8,0.146073,35073.4005,0.755449,37.706122,172.0
3,0.41608,0.161616,0.341703,0.139105,0.357576,0.699278,2535,0.270346,11.87,0.051948,3465,302,2.821824,120496.2512,-0.734364,77.579329,252.0
4,0.41608,0.161765,0.341696,0.138985,0.357555,0.69925,2538,0.2715,11.87,0.051903,3468,302,2.821524,120624.3804,-0.736805,77.4406,252.0


In [34]:
features_df

Unnamed: 0,Fickett Score,NucleiA,NucleiG,NucleiT,NucleiC,GC content,ORF Length,Relative Codon Bias,Isoelectric Potential,Aromaticity,Transcript Length,CpG Islands,SCF,MW,Gravy,Instability Index,Stop Frame
0,0.37617,0.321238,0.175707,0.319328,0.183728,0.359435,981,0.284229,9.588915,0.112385,2618,18,6.225742,95048.7649,0.173633,39.158603,112.0
1,0.46720,0.209798,0.202343,0.330138,0.257721,0.460064,939,0.460470,9.042610,0.137821,939,8,0.146073,35073.4005,0.755449,37.706122,172.0
2,0.46720,0.209798,0.202343,0.330138,0.257721,0.460064,939,0.460470,9.042610,0.137821,939,8,0.146073,35073.4005,0.755449,37.706122,172.0
3,0.41608,0.161616,0.341703,0.139105,0.357576,0.699278,2535,0.270346,11.870000,0.051948,3465,302,2.821824,120496.2512,-0.734364,77.579329,252.0
4,0.41608,0.161765,0.341696,0.138985,0.357555,0.699250,2538,0.271500,11.870000,0.051903,3468,302,2.821524,120624.3804,-0.736805,77.440600,252.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111270,0.42861,0.294798,0.106936,0.303468,0.294798,0.401734,54,0.647228,4.454040,0.095652,346,9,0.683840,12314.8142,1.224324,49.757658,211.0
111271,0.42696,0.282828,0.121212,0.286195,0.309764,0.430976,297,0.843020,5.727063,0.071429,297,6,0.175528,10578.7250,1.526531,42.487755,169.0
111272,0.41737,0.301887,0.099419,0.255443,0.343251,0.442671,300,0.423575,9.402150,0.074074,1378,31,1.659954,48912.9066,0.910515,36.584609,1237.0
111273,0.41520,0.304084,0.105960,0.246689,0.343267,0.449227,201,0.358543,9.144276,0.091211,1812,40,1.599416,64562.5371,0.717568,31.892111,1597.0


### Adding a label for coding sequences

In [35]:
features_df["coding/noncoding"] = 1    #1 for coding sequences
features_df.head()

Unnamed: 0,Fickett Score,NucleiA,NucleiG,NucleiT,NucleiC,GC content,ORF Length,Relative Codon Bias,Isoelectric Potential,Aromaticity,Transcript Length,CpG Islands,SCF,MW,Gravy,Instability Index,Stop Frame,coding/noncoding
0,0.37617,0.321238,0.175707,0.319328,0.183728,0.359435,981,0.284229,9.588915,0.112385,2618,18,6.225742,95048.7649,0.173633,39.158603,112.0,1
1,0.4672,0.209798,0.202343,0.330138,0.257721,0.460064,939,0.46047,9.04261,0.137821,939,8,0.146073,35073.4005,0.755449,37.706122,172.0,1
2,0.4672,0.209798,0.202343,0.330138,0.257721,0.460064,939,0.46047,9.04261,0.137821,939,8,0.146073,35073.4005,0.755449,37.706122,172.0,1
3,0.41608,0.161616,0.341703,0.139105,0.357576,0.699278,2535,0.270346,11.87,0.051948,3465,302,2.821824,120496.2512,-0.734364,77.579329,252.0,1
4,0.41608,0.161765,0.341696,0.138985,0.357555,0.69925,2538,0.2715,11.87,0.051903,3468,302,2.821524,120624.3804,-0.736805,77.4406,252.0,1


In [36]:
features_df.shape

(111275, 18)

### To csv file

In [37]:
features_df.to_csv('coding_17_features.csv', index=False)

## Generating Non-Coding Dataset

In [38]:
fickett_array = []
nucleiA_array = []
nucleiG_array = []
nucleiT_array = []
nucleiC_array = []
gc_array = []
orf_array = []
rcb_array=[]
ip_array = []
aromaticity_array = []
transcript_array = []
cpg_array = []
scf_array = []
mw_array = []
gv_array = []
ii_array = []
sframesc_array = []
generate_feature(noncoding_list) # generate feature set for coding dataset



In [39]:
names_dict = {'Fickett Score':fickett_array, 'NucleiA':nucleiA_array, 'NucleiG':nucleiG_array, 'NucleiT':nucleiT_array, 'NucleiC':nucleiC_array, 'GC content':gc_array, 'ORF Length':orf_array, 'Relative Codon Bias':rcb_array, 'Isoelectric Potential':ip_array, 'Aromaticity':aromaticity_array, 'Transcript Length':transcript_array, 'CpG Islands':cpg_array, 'SCF':scf_array, 'MW': mw_array, 'Gravy': gv_array, 'Instability Index': ii_array, 'Stop Frame': sframesc_array}
features_df = pd.DataFrame(names_dict)

In [40]:
features_df.head()

Unnamed: 0,Fickett Score,NucleiA,NucleiG,NucleiT,NucleiC,GC content,ORF Length,Relative Codon Bias,Isoelectric Potential,Aromaticity,Transcript Length,CpG Islands,SCF,MW,Gravy,Instability Index,Stop Frame
0,0.33022,0.192982,0.245009,0.253479,0.30853,0.553539,390,0.46601,8.557163,0.079855,1653,23,1.754161,58803.5325,0.013569,69.028104,7.0
1,0.33326,0.193527,0.241403,0.25354,0.311531,0.552933,402,0.428935,8.525573,0.05668,1483,21,2.46491,50958.2792,-0.037395,61.60105,57.0
2,0.3485,0.213608,0.299051,0.205696,0.281646,0.580696,180,0.591729,8.961508,0.057143,632,10,0.930165,21749.5998,-0.254902,72.012255,3.0
3,0.34765,0.218935,0.333333,0.163708,0.284024,0.617357,165,0.659118,11.371079,0.059172,507,18,0.321002,18519.8711,-0.794012,70.119162,9.0
4,0.34143,0.20033,0.281121,0.201979,0.31657,0.597692,165,0.529159,10.728845,0.059406,1213,34,1.689742,41313.5081,-0.344133,75.329872,3.0


In [41]:
features_df

Unnamed: 0,Fickett Score,NucleiA,NucleiG,NucleiT,NucleiC,GC content,ORF Length,Relative Codon Bias,Isoelectric Potential,Aromaticity,Transcript Length,CpG Islands,SCF,MW,Gravy,Instability Index,Stop Frame
0,0.33022,0.192982,0.245009,0.253479,0.308530,0.553539,390,0.466010,8.557163,0.079855,1653,23,1.754161,58803.5325,0.013569,69.028104,7.0
1,0.33326,0.193527,0.241403,0.253540,0.311531,0.552933,402,0.428935,8.525573,0.056680,1483,21,2.464910,50958.2792,-0.037395,61.601050,57.0
2,0.34850,0.213608,0.299051,0.205696,0.281646,0.580696,180,0.591729,8.961508,0.057143,632,10,0.930165,21749.5998,-0.254902,72.012255,3.0
3,0.34765,0.218935,0.333333,0.163708,0.284024,0.617357,165,0.659118,11.371079,0.059172,507,18,0.321002,18519.8711,-0.794012,70.119162,9.0
4,0.34143,0.200330,0.281121,0.201979,0.316570,0.597692,165,0.529159,10.728845,0.059406,1213,34,1.689742,41313.5081,-0.344133,75.329872,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173106,0.34933,0.279215,0.243533,0.214095,0.263158,0.506690,165,0.404011,9.778000,0.050938,1121,29,1.993488,39898.6091,-0.465181,56.477437,7.0
173107,0.30073,0.267338,0.275168,0.197987,0.259508,0.534676,93,0.439534,9.988038,0.077441,894,38,2.059786,31774.3037,-0.603873,48.504577,7.0
173108,0.31353,0.262545,0.210125,0.218190,0.309140,0.519265,405,0.308874,9.880441,0.084677,2232,89,2.463980,80774.0628,-0.507172,53.483448,37.0
173109,0.30200,0.300735,0.219853,0.269853,0.209559,0.429412,207,0.469681,8.829928,0.092715,1360,8,2.771628,49559.5029,-0.301848,56.339076,31.0


### Adding a label for non coding sequences 

In [42]:
features_df["coding/noncoding"] = 0    #1 for non-coding sequences
features_df.head()

Unnamed: 0,Fickett Score,NucleiA,NucleiG,NucleiT,NucleiC,GC content,ORF Length,Relative Codon Bias,Isoelectric Potential,Aromaticity,Transcript Length,CpG Islands,SCF,MW,Gravy,Instability Index,Stop Frame,coding/noncoding
0,0.33022,0.192982,0.245009,0.253479,0.30853,0.553539,390,0.46601,8.557163,0.079855,1653,23,1.754161,58803.5325,0.013569,69.028104,7.0,0
1,0.33326,0.193527,0.241403,0.25354,0.311531,0.552933,402,0.428935,8.525573,0.05668,1483,21,2.46491,50958.2792,-0.037395,61.60105,57.0,0
2,0.3485,0.213608,0.299051,0.205696,0.281646,0.580696,180,0.591729,8.961508,0.057143,632,10,0.930165,21749.5998,-0.254902,72.012255,3.0,0
3,0.34765,0.218935,0.333333,0.163708,0.284024,0.617357,165,0.659118,11.371079,0.059172,507,18,0.321002,18519.8711,-0.794012,70.119162,9.0,0
4,0.34143,0.20033,0.281121,0.201979,0.31657,0.597692,165,0.529159,10.728845,0.059406,1213,34,1.689742,41313.5081,-0.344133,75.329872,3.0,0


In [43]:
features_df.shape

(173111, 18)

### To CSV

In [44]:
features_df.to_csv('noncoding_17_features.csv', index=False)