In [1]:
import os
import pandas as pd
from pandas import Series, DataFrame
from openpyxl import load_workbook
import country_converter as coco

Reading files

In [2]:
## Get current working directory
print("Current working directory:", os.getcwd())

## Find path to raw data (for the raw files)
raw_data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'raw_data'))

## Find path to folder for saving cleaned csv
cleaned_data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'cleaned_data'))

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [3]:
mitchell_folder = "/Mitchell_InternationalHistoricalStatistics"
mitchell_filepath = raw_data_path + mitchell_folder

In [4]:
mitchell_files = os.listdir(mitchell_filepath)

reading_files = []
for file in mitchell_files:
     if '.xlsx' in file or '.xls' in file:
        reading_files.append(file)
reading_files.sort()
reading_files

['Acids_Africa.xlsx',
 'Acids_Americas.xlsx',
 'Acids_Asia.xlsx',
 'Acids_Europe.xlsx',
 'Acids_Oceania.xlsx',
 'Beer_Africa.xls',
 'Beer_Asia.xls',
 'Beer_Europe.xls',
 'Beer_North America.xls',
 'Beer_Oceania.xls',
 'Beer_South America.xls',
 'Crude Petroleum_Africa.xls',
 'Crude Petroleum_Asia.xlsx',
 'Crude Petroleum_Europe.xls',
 'Crude Petroleum_North America.xls',
 'Crude Petroleum_Oceania.xls',
 'Crude Petroleum_South America.xls',
 'GoldSilver_Africa.xlsx',
 'GoldSilver_Asia.xlsx',
 'GoldSilver_Oceania.xlsx',
 'Milk_Africa.xls',
 'Milk_Asia.xls',
 'Milk_Europe.xls',
 'Milk_North America.xls',
 'Milk_Oceania.xls',
 'Milk_South America.xls',
 'Nickel_All.xlsx',
 'Refining_Africa.xlsx',
 'Refining_Asia.xlsx',
 'Refining_NorthAmerica.xlsx',
 'Refining_Oceania.xlsx',
 'Refining_SouthAmerica.xlsx',
 'Sugar_Africa.xls',
 'Sugar_Asia.xls',
 'Sugar_Europe.xls',
 'Sugar_North America.xls',
 'SyntheticFilaments_Americas.xlsx',
 'SyntheticFilaments_AsiaAfricaOceania.xlsx',
 'SyntheticFila

In [5]:
def read_mitchell(file_relpath):
    # Construct the full file path
    file_name = mitchell_filepath + "/" +file_relpath
    print(file_name)
    # Read the Excel file and get sheet names
    df = pd.ExcelFile(file_name)
    df_sheets = df.sheet_names
    
    # Sheets to drop from processing
    drop_sheets = ['Sheet1', 'Sheet2', 'Sheet3', 'Sheet7', 'All', 'Copy All', 'All Copy']
    
    # Remove unwanted sheets
    for item in drop_sheets:
        if item in df_sheets:
            df_sheets.remove(item)
    
    # List to store DataFrames
    df_list = [] 
    
    # Iterate over remaining sheets
    for sheet in df_sheets:

        # Read data from sheet, specifying columns and handling missing values
        df = pd.read_excel(file_name, sheet_name=sheet, usecols=[0,2,3], na_values = ['—', '...', '- -','…'])
        
        # Drop rows with missing values in the second column
        df.dropna(subset=df.columns[1], inplace=True)
        
        # Convert the second column to integers and set it as the index
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        df.set_index(df.columns[1], inplace=True)
        
        # Extract country name and technology details
        country_name = df[df.columns[0]].iloc[0]
        technology_unit = df.columns[-1]
        cutoff = technology_unit.index('(')
        technology_name = technology_unit[:cutoff].strip()
        
        # Skip specific technologies
        if technology_name == 'Copper' or technology_name == 'Lead':
            continue
        
        # Extract and format unit
        unit = technology_unit[cutoff:]
        unit = unit.strip('()').title().lower()
        
        # Drop the first column, transpose DataFrame, and adjust column names
        df.drop(columns=df.columns[0], inplace=True)
        df = df.transpose()
        df.columns.name = None
        
        # Add metadata columns
        df['Unit'] = unit
        df['Technology Name'] = technology_name
        df['Country Name'] = country_name
        
        # Map country names to ISO codes
        country_code = ''
        if country_name in ['Phillipines', 'Phillipine']:
            country_code = 'PH'
        elif country_name == 'Yugoslavia':
            country_code = 'YU'
        elif country_name == 'East Germany':
            country_code = 'DEU'
        elif country_name == 'West Germany':
            country_code = 'DD'
        elif country_name == 'UAE':
            country_code = 'AE'
        elif country_name == 'Southern Vietnam':
            country_code = 'VNM'
        elif country_name == 'Kyrgistan':
            country_code = 'KG'
        elif country_name == 'USSR':
            country_code = 'SU'
        elif country_name == 'Czechoslovakia':
            country_code = 'CSK'
        elif country_name == 'Korea':
            country_code = 'KOR'
        else:
            country_code = coco.convert(names=country_name, to='iso2')
        df['Country Code'] = country_code
        df['Spatial Scale'] = 'National'
        df['Data Source'] = 'Mitchell'
        df['Metric'] = 'Annual Production'
        df['ID'] = df['Technology Name'] + '_' + df['Metric'] + '_' + df['Country Code']
        df.set_index('ID', drop=True, inplace=True)
        raw_mitchell_path = raw_data_path + '/mitchell_processing'
        # Save processed DataFrame to a CSV file
        output_file_name = sheet + '.csv'
        output_file_path = os.path.join(raw_mitchell_path, output_file_name)
        df.to_csv(output_file_path)

        print("Data saved to:", output_file_path)



In [6]:

for file in reading_files:
    read_mitchell(file)

/Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/Mitchell_InternationalHistoricalStatistics/Acids_Africa.xlsx
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/SulphuricAcid_Algeria.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/SulphuricAcid_Egypt.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/SulphuricAcid_Morocco.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/SulphuricAcid_Tunisia.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/SulphuricAcid_Zaire.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/SulphuricAcid_Zambia.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_data/mitchell_processing/HydrochloricAcid_Algeria.csv
Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/raw_

In [7]:
raw_mitchell_path = raw_data_path + '/mitchell_processing'

mitchell_processing = os.listdir(raw_mitchell_path)
len(mitchell_processing)

492

In [8]:
for file in mitchell_processing:
    if '.csv' not in file:
        mitchell_processing.remove(file)

In [9]:
len(mitchell_processing)

492

In [10]:
frame_path = cleaned_data_path + '/frame.csv'
frame = pd.read_csv(frame_path)

In [11]:
# don't use default na, this erroneously converts 'NA' (iso code for Namibia) to a missing value
# I manually input all the default na vals except for 'NA' to prevent this
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
df_list = [frame]
for file in mitchell_processing:
    file = raw_mitchell_path + "/" + file
    file = pd.read_csv(file, keep_default_na=False, na_values=['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', 
                                                               '#N/A N/A', '#N/A', 'N/A', 'n/a', '<NA>', '#NA', 
                                                               'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 
                                                               'None', '','—', '...', '- -','…','···','··· ',
                                                              '--', '... ', '… '])
    df_list.append(file)

In [12]:
all_mitchell = pd.concat(df_list)
all_mitchell.replace({'Caustic Soda':'Caustic Soda Acid', 'Synthetic Filaments':'Artificial and Synthetic Fibers',
                      'Beer': 'Beer Production',
                      "in thousands of hectolitres": "thousand hectolitres",
                     "in thousand hectolitres": "thousand hectolitres",
                     "thousands metric tons": "thousand metric tons",
                     'in million tons':'million tons','in thousand metric tons':'thousand metric tons',
                       'in millions of metric tons':'million metric tons',
                      'in thousands of metric tons':'thousand metric tons','in thousands of tons':'thousand tons'},
                     inplace=True)
all_mitchell


  all_mitchell = pd.concat(df_list)


Unnamed: 0,ID,Spatial Scale,Country Code,Country Name,Technology Name,Metric,Unit,Data Source,1700,1701,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Tin_Annual Production_ID,National,ID,Indonesia,Tin,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Sulphuric Acid_Annual Production_CA,National,CA,Canada,Sulphuric Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Beer_Annual Production_TM,National,TM,Turkmenistan,Beer Production,Annual Production,thousand hectolitres,Mitchell,,,...,,,,,,,,,,
0,Crude Petroleum_Annual Production_MM,National,MM,Myanmar,Crude Petroleum,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Beer_Annual Production_CL,National,CL,Chile,Beer Production,Annual Production,thousand hectolitres,Mitchell,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Beer_Annual Production_BO,National,BO,Bolivia,Beer Production,Annual Production,thousand hectolitres,Mitchell,,,...,,,,,,,,,,
0,Sugar Output_Annual Production_MX,National,MX,Mexico,Sugar Output,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Milk_Annual Production_SY,National,SY,Syria,Milk,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Milk_Annual Production_IR,National,IR,Iran,Milk,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,


In [13]:
all_mitchell[all_mitchell['Technology Name']=='Caustic Soda Acid']

Unnamed: 0,ID,Spatial Scale,Country Code,Country Name,Technology Name,Metric,Unit,Data Source,1700,1701,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Caustic Soda_Annual Production_AU,National,AU,Australia,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_PK,National,PK,Pakistan,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_IN,National,IN,India,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_AR,National,AR,Argentina,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_BR,National,BR,Brazil,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_JP,National,JP,Japan,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda Acid_Annual Production_US,National,US,United States of America,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_EG,National,EG,Egypt,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_TR,National,TR,Turkey,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,
0,Caustic Soda_Annual Production_CN,National,CN,China,Caustic Soda Acid,Annual Production,thousand metric tons,Mitchell,,,...,,,,,,,,,,


In [14]:
set(all_mitchell['Unit'])

{'metric tons',
 'million metric tons',
 'million tons',
 'thousand hectolitres',
 'thousand metric tons',
 'thousand tons'}

In [15]:

output_file = 'mitchell.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

all_mitchell.to_csv(output_file_path, index = False)
print("Data saved to:", output_file_path)


Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/mitchell.csv
