In [1]:
import os
import pandas as pd
from pandas import Series, DataFrame

Data sources

- https://www.genome.gov/about-genomics/fact-sheets/DNA-Sequencing-Costs-Data

Get file location

In [2]:
## Capture the working directory once so every derived path uses the same base
notebook_dir = os.getcwd()
print("Current working directory:", notebook_dir)

## Resolve the sibling folders of the working directory (one level up):
##   raw_data     -> raw input files
##   cleaned_data -> destination for the cleaned csv
##   inflation    -> inflation data
raw_data_path, cleaned_data_path, inflation_data_path = (
    os.path.abspath(os.path.join(notebook_dir, os.pardir, folder))
    for folder in ('raw_data', 'cleaned_data', 'inflation')
)

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [3]:
# Locate the sequencing-cost workbook (an Excel .xls file, not a CSV) inside
# the raw_data folder.
target_file = 'cost data/Cummulative Sum by Year_Full Data_data.xls'
target_file_path = os.path.join(raw_data_path, target_file)

# Load only the first two columns of the sheet, then display the frame.
dna = pd.read_excel(io=target_file_path, usecols=[0, 1])
dna

Unnamed: 0,Date,Cost per Mb
0,2001-09-30,5292.392885
1,2002-03-31,3898.635412
2,2002-09-30,3413.801195
3,2003-03-31,2986.204671
4,2003-10-31,2230.975235
...,...,...
73,2021-05-31,0.005045
74,2021-08-31,0.006246
75,2021-11-30,0.006136
76,2022-02-28,0.005829


In [4]:
# Collapse the dated observations into one mean value per calendar year.
#
# NOTE: this cell consumes the 'Date' column and rebinds `dna`, so it must be
# run exactly once after the load cell (re-running it raises KeyError: 'Date').

# Derive the calendar year of every row in one vectorised step instead of
# looping over rows and slicing the string form of each date. pd.to_datetime
# leaves real timestamps untouched and also parses ISO date strings; the cast
# to int64 keeps the year labels as plain integers.
year = pd.to_datetime(dna['Date']).dt.year.astype('int64').rename('Year')

# Drop the original 'Date' column and average the remaining column(s) within
# each year. Grouping by the named Series yields an index called 'Year',
# sorted ascending.
dna = dna.drop(columns=['Date']).groupby(year).mean()

In [5]:
# Descriptive columns appended after the year columns, in this exact order.
metadata = {
    'Data Source': 'NIH',
    'Spatial Scale': 'Global',
    'Country Code': 'World',
    'Country Name': 'World',
    'Metric': 'Price',
    'Unit': 'USD/megabase',
    'Technology Name': 'DNA Sequencing (All Methods)',
}

# Row identifier: "<Technology Name>_<Metric>_<Country Code>".
row_id = '_'.join(metadata[field] for field in ('Technology Name', 'Metric', 'Country Code'))

# Pivot to wide layout (years become columns), attach the metadata columns,
# and use the identifier as the row index.
dna = dna.T.assign(**metadata, ID=row_id).set_index('ID')

# Clear the leftover column-axis label inherited from the transposed index.
dna.columns.name = None
dna

Unnamed: 0_level_0,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,...,2020,2021,2022,Data Source,Spatial Scale,Country Code,Country Name,Metric,Unit,Technology Name
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DNA Sequencing (All Methods)_Price_World,5292.392885,3656.218303,2608.589953,1217.730698,884.389129,642.334236,479.592436,32.330623,1.571799,0.384731,...,0.007074,0.00672,0.005829,NIH,Global,World,World,Price,USD/megabase,DNA Sequencing (All Methods)


Save file

In [6]:
# Build the destination path inside the cleaned_data folder.
output_file = 'dna_sequencing.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

# Write the wide-format table (the 'ID' index is written as the first column)
# and report where it was saved.
dna.to_csv(path_or_buf=output_file_path)
print("Data saved to:", output_file_path)


Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/dna_sequencing.csv
