In [1]:
import pandas as pd
from pandas import Series, DataFrame
import country_converter as coco
import os

Get current and desired file paths

In [2]:
## Get current working directory
print("Current working directory:", os.getcwd())

## Find path to raw data (for the raw files)
raw_data_path_pcdb= os.path.abspath(os.path.join(os.getcwd(), '..', 'raw_data/pcdb'))

## Find path to folder for saving cleaned csv
cleaned_data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'cleaned_data'))


## Find path to folder for inflation
inflation_data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'inflation'))

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [3]:

target_inflation_file = "/A001RG3A086NBEA.xls"
target_inflation_filepath = inflation_data_path + target_inflation_file

#target_file = '/UndergroundCables_Data.xlsx'
#target_filepath = raw_data_path + target_file

In [4]:
## adjusting for inflation from 1966 USD to 2022 USD
nipa = pd.read_excel(target_inflation_filepath, header=10)

year_list = []
for x in nipa['observation_date']:
    x = int(str(x)[:4])
    year_list.append(x)
nipa['Year'] = year_list
nipa.set_index('Year', drop=True, inplace=True)
nipa.drop(columns='observation_date', inplace=True)
nipa = nipa.transpose()

infl_1966_2022 = float(nipa[2022]/nipa[1966])
print(infl_1966_2022)

infl_1958_2022 = float(nipa[2022]/nipa[1958])
print(infl_1958_2022)

6.968934911242604
7.873537604456824


  infl_1966_2022 = float(nipa[2022]/nipa[1966])
  infl_1958_2022 = float(nipa[2022]/nipa[1958])


In [5]:
def read_pcdb(file, tech_name, place='World'):
    # Construct the full file path
    file_name = raw_data_path_pcdb + "/" + file
    
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_name, usecols=[0, 1, 2])
    
    # Check for non-numeric values in the DataFrame columns
    for col in df.columns:
        for val in df[col]:
            if type(val) != float and type(val) != int:
                print(type(val), val)
                break
    
    # Set the index to the second column and transpose the DataFrame
    df.set_index(df.columns[1], inplace=True)
    df = df.transpose()
    
    # Adjust values for inflation
    for idx in df.index:
        if '1966' in idx:
            df.loc[idx] = df.loc[idx] * infl_1966_2022
        if '1958' in idx:
            df.loc[idx] = df.loc[idx] * infl_1958_2022
    
    # Add metadata columns based on the specified place
    df['Data Source'] = 'PCDB'
    df['Metric'] = ['Price', 'Annual production']
    df['Technology Name'] = tech_name
    if place=='World':
        df['Spatial Scale'] = 'Global'
        df['Country Name'] = 'World'
        df['Country Code'] = 'World'
    elif place=='US':
        df['Spatial Scale'] = 'National'
        df['Country Name'] = 'United States'
        df['Country Code'] = 'US'
    elif place=='Japan':
        df['Spatial Scale'] = 'National'
        df['Country Name'] = 'Japan'
        df['Country Code'] = 'JP'
    
    # Set the ID column and define the units
    df['ID'] = df['Technology Name'] + '_' + df['Metric'] + '_' + df['Country Code']
    df['Unit'] = [df.index[0], df.index[1]]
    df.set_index('ID', drop=True, inplace=True)
    df.columns.name = None
    
    # Rename and standardize column names
    df.replace({'Yearly Production (Mil. lbs)':'million pounds', 'Price (1966 USD/lbs)':'2022 USD/lb',
               'Yearly  Production (Billion Pounds)': 'billion pounds', 
                'Yearly Production (Million Gallons)': 'million gallons', 'Price (1958 USD)':'2022 USD',
               'Price (1958 USD/lbs)':'2022 USD/lb','Price per Gallon (1958 USD)':'2022 USD/gal',
               'Price per Pound (1958 USD)':'2022 USD/lb','Quoted Price per Pound (1958 USD)':'2022 USD/lb',
               'Yearly Production (Net Tons)':'net tons','Yearly Production (Billion Barrels)':'billion barrels',
               'Yearly Production (Millon Pounds)':'million pounds',
                'Yearly Production (Million Pounds)':'million pounds',
                'Yearly Production (Short Tons of Primary Production)':'short tons', 
                'Cost (USD/Kilobase)':'USD/kilobase','Yearly Production (Gbase)':'Gbase'}, 
               inplace=True)

    # Define the file path for the cleaned data
    file_path = cleaned_data_path + "/" + tech_name + '.csv'
    
    # Save the DataFrame to a CSV file
    df.to_csv(file_path)
    
    return df

Acrylic Fiber

In [6]:
# https://pcdb.santafe.edu/graph.php?curve=119
acrylic_fiber = read_pcdb('AcrylicFiber.csv', 'Acrylic Fiber')

Acrylonitrile

In [7]:
# https://pcdb.santafe.edu/graph.php?curve=113
acrylonitrile = read_pcdb('Acrylonitrile.csv', 'Acrylonitrile')

Aniline

In [8]:
# https://pcdb.santafe.edu/graph.php?curve=108
aniline = read_pcdb('Aniline.csv', 'Aniline')

Benzene

In [9]:
# https://pcdb.santafe.edu/graph.php?curve=100
benzene = read_pcdb('Benzene.csv', 'Benzene')

Bisphenol A

In [10]:
# https://pcdb.santafe.edu/graph.php?curve=111
bisphenol_a = read_pcdb('BisphenolA.csv', 'BisphenolA')

Caprolactam

In [11]:
# https://pcdb.santafe.edu/graph.php?curve=112
caprolactam = read_pcdb('Caprolactam.csv', 'Caprolactam')

Crude Oil

In [12]:
# https://pcdb.santafe.edu/graph.php?curve=97
# drop price, BP has longer time series
# drop altogether due to duplication from Mitchell
# crude_oil = read_pcdb('Crude_Oil.csv', 'Crude Oil')
# crude_oil.drop(index='Crude Oil_Price_World', inplace=True)
# crude_oil.to_csv('cleaned data/pcdb_Crude_Oil.csv')

Cyclohexane

In [13]:
# https://pcdb.santafe.edu/graph.php?curve=116
cyclohexane = read_pcdb('Cyclohexane.csv', 'Cyclohexane')

Ethanolamine

In [14]:
# https://pcdb.santafe.edu/graph.php?curve=118
ethanolamine = read_pcdb('Ethanolamine.csv', 'Ethanolamine')

Ethyl Alcohol

In [15]:
# https://pcdb.santafe.edu/graph.php?curve=120
ethyl_alcohol = read_pcdb('EthylAlcohol.csv', 'Ethanol')

Ethylene

In [16]:
# https://pcdb.santafe.edu/graph.php?curve=99
ethylene = read_pcdb('Ethylene.csv', 'Ethylene')

Ethylene Glycol

In [17]:
# https://pcdb.santafe.edu/graph.php?curve=122
# EthyleneGlycol
ethylene_glycol = read_pcdb('EthyleneGlycol.csv', 'Ethylene Glycol')

Formaldehyde

In [18]:
# https://pcdb.santafe.edu/graph.php?curve=123
formaldehyde = read_pcdb('Formaldehyde.csv', 'Formaldehyde')

Hydroflouric Acid


In [19]:
# https://pcdb.santafe.edu/graph.php?curve=124
hydrofluoric_acid = read_pcdb('HydrofluoricAcid.csv', 'Hydrofluoric Acid')

Magnesium

In [20]:
# https://pcdb.santafe.edu/graph.php?curve=126
magnesium = read_pcdb('Magnesium.csv', 'Magnesium')

Maleic Anhydride

In [21]:
# https://pcdb.santafe.edu/graph.php?curve=127
maleic_anhydride = read_pcdb('MaleicAnhydride.csv', 'Maleic Anhydride')

Methanol

In [22]:
# https://pcdb.santafe.edu/graph.php?curve=128
methanol = read_pcdb('Methanol.csv', 'Methanol')

Motor Gasoline

In [23]:
# https://pcdb.santafe.edu/graph.php?curve=98
motor_gasoline = read_pcdb('Motor_Gasoline.csv', 'Motor Gasoline')

Neoprene Rubber

In [24]:
# https://pcdb.santafe.edu/graph.php?curve=129
neoprene_rubber = read_pcdb('NeopreneRubber.csv', 'Neoprene Rubber')

Praxylene

In [25]:
# https://pcdb.santafe.edu/graph.php?curve=101
paraxylene = read_pcdb('Paraxylene.csv', 'Paraxylene')

Pentaerythritol

In [26]:
# https://pcdb.santafe.edu/graph.php?curve=130
pentaerythritol = read_pcdb('Pentaerythritol.csv', 'Pentaerythritol')

Phenol

In [27]:
# https://pcdb.santafe.edu/graph.php?curve=131
phenol = read_pcdb('Phenol.csv', 'Phenol')

Phthalic Anhydride

In [28]:
# https://pcdb.santafe.edu/graph.php?curve=132
phthalic_anhydride = read_pcdb('PhthalicAnhydride.csv', 'Phthalic Anhydride')

Polyester Fiber

In [29]:
# https://pcdb.santafe.edu/graph.php?curve=133
polyester_fiber = read_pcdb('PolyesterFiber.csv', 'Polyester Fiber')

Polyethylene HD

In [30]:
# https://pcdb.santafe.edu/graph.php?curve=134
polyethylene_hd = read_pcdb('PolyethyleneHD.csv', 'High-Density Polyethylene')

Polyethylene LD

In [31]:
# https://pcdb.santafe.edu/graph.php?curve=135
polyethylene_ld = read_pcdb('PolyethyleneLD.csv', 'Low-Density Polyethylene')

Polystyrene

In [32]:
# https://pcdb.santafe.edu/graph.php?curve=147
polystyrene = read_pcdb('Polystyrene.csv', 'Polystyrene')

Polyvinylchloride

In [33]:
# https://pcdb.santafe.edu/graph.php?curve=148
polyvinylchloride = read_pcdb('Polyvinylchloride.csv', 'Polyvinylchloride')

Primary Magnesium

In [34]:
# https://pcdb.santafe.edu/graph.php?curve=150
primary_magnesium = read_pcdb('Primary_Magnesium.csv', 'Primary Magnesium')

Capillary DNA

In [35]:
# https://pcdb.santafe.edu/graph.php?curve=30
capillary_dna = read_pcdb('Capillary_DNA_Sequencing.csv', 'Capillary DNA Sequencing')

Shotgun Sanger DNA Sequencing

In [36]:
# https://pcdb.santafe.edu/graph.php?curve=34
# Shotgun_Sanger_DNA_Sequencing

# due to the uneven structure of the data,
# we are using the average cost of all quarters where data is provided as the yearly average cost
# and the yearly production value of the latest quarter in the year

dna_sequencing = pd.read_csv((raw_data_path_pcdb + '/Shotgun_Sanger_DNA_Sequencing.csv'), usecols=[0, 1, 2])
year_list = []
for year in dna_sequencing['Time (Year)']:
    year_list.append(str(year)[:4])
dna_sequencing['Year'] = year_list

year_set = set(year_list)
year_dict = {}

for year in year_set:
    year_dict[year] = []
    
year_dict = dict(sorted(year_dict.items(), key = lambda item: item[0]))

for year in dna_sequencing['Time (Year)']:
    year_4_char = str(year)[:4]
    year = str(year)
    year_dict[year_4_char].append(year)

greatest_qtrs = []
for year in year_dict:
    greatest_qtr = None
    for qtr in year_dict[year]:
        if greatest_qtr == None:
            greatest_qtr = qtr
        else:
            if qtr > greatest_qtr:
                greatest_qtr = qtr
    greatest_qtrs.append(qtr)

yearly_production = []
for idx in dna_sequencing.index:
    qtr_year = str(dna_sequencing['Time (Year)'].iloc[idx])
    yrly_production = dna_sequencing['Yearly Production (Kilobase)'].iloc[idx]
    if qtr_year in greatest_qtrs:
        yearly_production.append(yrly_production)
        
year_list = sorted(list(year_set))

cost_dict = {}
for year in year_set:
    cost_dict[year] = []
cost_dict = dict(sorted(cost_dict.items(), key = lambda item: item[0]))

for idx in dna_sequencing.index:
    cost = dna_sequencing['Cost (USD/Kilobase)'].iloc[idx]
    year = str(dna_sequencing['Year'].iloc[idx])
    cost_dict[year].append(cost)

avg_cost = []
for year in cost_dict:
    avg_cost.append(sum(cost_dict[year]) / len(cost_dict[year]))
    
new_df = pd.DataFrame(avg_cost, year_list)
new_df['Annual Production'] = yearly_production
new_df = new_df.transpose()
new_df['Data Source'] = 'Santa Fe Institute'
new_df['Spatial Scale'] = 'Global'
new_df['Country Name'] = 'World'
new_df['Country Code'] = 'World'
new_df['Metric'] = ['Price', 'Total Number']
new_df['Unit'] = ['USD/kilobase', 'kilobase']
new_df['Technology Name'] = 'Shotgun Sanger DNA Sequencing'
new_df['ID'] = new_df['Technology Name'] + '_' + new_df['Metric'] + '_' + new_df['Country Code']
new_df.set_index('ID', drop=True, inplace=True)

# Define the file path
file_path = cleaned_data_path + "/pcdb_Shotgun_Sanger_DNA_Sequencing.csv"
new_df.to_csv(file_path)   


Sodium 

In [37]:
# https://pcdb.santafe.edu/graph.php?curve=136
sodium = read_pcdb('Sodium.csv', 'Sodium')

Sodium Chlorate

In [38]:
# https://pcdb.santafe.edu/graph.php?curve=137
sodium_chlorate = read_pcdb('SodiumChlorate.csv', 'Sodium Chlorate')

Styrene

In [39]:
# https://pcdb.santafe.edu/graph.php?curve=140
styrene = read_pcdb('Styrene.csv', 'Styrene')

Titanium Sponge

In [40]:
# https://pcdb.santafe.edu/graph.php?curve=151
titanium_sponge = read_pcdb('Titanium_Sponge.csv', 'Titanium Sponge')

Urea

In [41]:
# https://pcdb.santafe.edu/graph.php?curve=143
urea = read_pcdb('Urea.csv', 'Urea')

Vinyl Acetate

In [42]:
# https://pcdb.santafe.edu/graph.php?curve=144
vinyl_acetate = read_pcdb('VinylAcetate.csv', 'Vinyl Acetate')

Vinyl Chloride

In [43]:
# https://pcdb.santafe.edu/graph.php?curve=145
vinyl_chloride = read_pcdb('VinylChloride.csv', 'Vinyl Chloride')