In [1]:
import pandas as pd
from os import chdir



In [2]:

### Working directory - point this at your own DIR.
### Every relative path used below is resolved from here.

directory = r'../inputs'
chdir(directory)

### Input files (looked up inside the working directory)

gene_length_file = r'GeneCart24267_23-jan-2020.csv'
reference_file = r'bio_features.csv'

### Chromosome lengths from NCBI - https://www.ncbi.nlm.nih.gov/genome/?term=Saccharomyces%20cerevisiae[Organism]&cmd=DetailsSearch
### Keyed by roman-numeral chromosome label; the values are multiplied by 10**6
### where they are used further down, and are given here to two decimals only.

chromosome_length_dict = {
    'I': 0.23,
    'II': 0.81,
    'III': 0.32,
    'IV': 1.53,
    'V': 0.58,
    'VI': 0.27,
    'VII': 1.09,
    'VIII': 0.56,
    'IX': 0.44,
    'X': 0.75,
    'XI': 0.67,
    'XII': 1.08,
    'XIII': 0.92,
    'XIV': 0.78,
    'XV': 1.09,
    'XVI': 0.95,
}

### import file

# Gene coordinate table: read only the columns used below and give them short names.
df_gene_length = pd.read_csv(gene_length_file, usecols = ['Locus Tag', 'Gene Symbol', 'Chromosome', 'Start Coord', 'End Coord', 'Strand'])
df_gene_length = df_gene_length.rename(columns = {'Locus Tag':'ORF', 'Gene Symbol':'Gene', 'Chromosome':'chromosome', 'Start Coord':'start_coord', 'End Coord':'end_coord', 'Strand':'strand'})

# Reference list of ORFs; the (inner) merge keeps only genes whose ORF appears in it.
df_reference = pd.read_csv(reference_file, usecols = ['ORF'])

df_gene_length_merged = df_gene_length.merge(df_reference, on = 'ORF')
# Encode the strand symbol as a signed integer.
df_gene_length_merged.loc[:, 'strand'] = df_gene_length_merged.loc[:, 'strand'].replace({'+':1, '-':-1})

### calculating the gene's length
# Use the merged frame's own columns. merge() returns a fresh 0..n-1 index, so
# subtracting columns of the un-merged df_gene_length and assigning the result here
# would align on index labels and attach lengths to the wrong genes whenever the
# merge dropped, duplicated or reordered rows.
# NOTE(review): the result is negative if start_coord > end_coord for some rows - confirm the coordinate convention of the input file.
df_gene_length_merged.loc[:, 'gene_length'] = df_gene_length_merged['end_coord'] - df_gene_length_merged['start_coord']


In [3]:
# Split the chromosome label on whitespace into separate columns and keep the second
# token as the chromosome identifier (the key used by chromosome_length_dict).
_chromosome_tokens = df_gene_length_merged['chromosome'].str.split(expand=True)
df_gene_length_merged['chromosome_num'] = _chromosome_tokens[1]


In [4]:
### chromosome length for each gene, scaled by 10**6 to match the coordinate columns
# .map() instead of .replace(): a label missing from chromosome_length_dict becomes NaN.
# With .replace() such a label stays a string, '* 10**6' repeats that string a million
# times, and the divisions below then fail with a TypeError.
df_gene_length_merged.loc[:, 'chromosome_length'] = df_gene_length_merged['chromosome_num'].map(chromosome_length_dict)*(10**6)

### calculating where the start and the end coordinates are in relation to the chromosome length
df_gene_length_merged.loc[:, 'start_coord_perc'] = df_gene_length_merged['start_coord']/df_gene_length_merged['chromosome_length']
df_gene_length_merged.loc[:, 'end_coord_perc'] = df_gene_length_merged['end_coord']/df_gene_length_merged['chromosome_length']
# Complements: the same positions measured from the other end of the chromosome.
# NOTE(review): chromosome_length_dict holds two-decimal values, so a fraction can slightly
# exceed 1 (and its complement go slightly negative) for genes close to the chromosome end.
df_gene_length_merged.loc[:, 'end_coord_perc_complement'] = 1-df_gene_length_merged['end_coord_perc']
df_gene_length_merged.loc[:, 'start_coord_perc_complement'] = 1-df_gene_length_merged['start_coord_perc']


# Row-wise minimum of (start fraction, complement of end fraction).
df_gene_length_merged.loc[:, 'gene_distance_to_end_of_chromosome_near_end'] = df_gene_length_merged[['start_coord_perc','end_coord_perc_complement']].min(axis=1)
# Row-wise minimum of (end fraction, complement of start fraction).
df_gene_length_merged.loc[:, 'gene_distance_to_end_of_chromosome_far_end'] = df_gene_length_merged[['end_coord_perc','start_coord_perc_complement']].min(axis=1)

# Export the feature columns; the path is relative to the working directory set by chdir() earlier,
# and the frame's index is written as the first (unnamed) CSV column.
df_gene_length_merged[['ORF', 'gene_length', 'gene_distance_to_end_of_chromosome_near_end', 'gene_distance_to_end_of_chromosome_far_end']].to_csv('../inputs/length_and_loc_features.csv')


