In [1]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sqlalchemy
import _functions_sql as fs
import _functions_data_files as fdf

In [3]:
# Load the FAOSTAT detailed trade matrix (normalized CSV export)
source_dir = 'faostat_trade_matrix'
file_name = 'Trade_DetailedTradeMatrix_E_All_Data_(Normalized).csv'

# Route the 'Note' column through `str` so pandas does not raise a
# DtypeWarning for it (used here instead of 'low_memory=False'); with a
# converter, missing values are read as empty strings ('').
# NOTE(review): the loaded frame shows no 'Note' column - confirm this
# converter is still needed.
conv = {'Note': str}

trade_matrix = pd.read_csv(
    fdf.get_path(file_name, source_dir),
    converters=conv,
    encoding='latin-1',
)

In [4]:
# Preview the first five rows of the raw trade matrix
trade_matrix.head(n=5)

Unnamed: 0,Reporter Country Code,Reporter Country Code (M49),Reporter Countries,Partner Country Code,Partner Country Code (M49),Partner Countries,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,4,'012,Algeria,230,'21424,"Cashew nuts, shelled",5910,Export Quantity,2016,2016,t,3.0,X
1,2,'004,Afghanistan,4,'012,Algeria,230,'21424,"Cashew nuts, shelled",5922,Export Value,2016,2016,1000 USD,23.0,X
2,3,'008,Albania,3,'008,Albania,828,'25020.01,Cigarettes,5610,Import Quantity,2004,2004,t,12.0,A
3,3,'008,Albania,3,'008,Albania,828,'25020.01,Cigarettes,5622,Import Value,2004,2004,1000 USD,104.0,A
4,4,'012,Algeria,2,'004,Afghanistan,231,'21422,"Almonds, shelled",5610,Import Quantity,2005,2005,t,3.0,A


In [5]:
# Summarise the raw frame: row count, column dtypes and memory footprint
trade_matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159659 entries, 0 to 49159658
Data columns (total 16 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Reporter Country Code        int64  
 1   Reporter Country Code (M49)  object 
 2   Reporter Countries           object 
 3   Partner Country Code         int64  
 4   Partner Country Code (M49)   object 
 5   Partner Countries            object 
 6   Item Code                    int64  
 7   Item Code (CPC)              object 
 8   Item                         object 
 9   Element Code                 int64  
 10  Element                      object 
 11  Year Code                    int64  
 12  Year                         int64  
 13  Unit                         object 
 14  Value                        float64
 15  Flag                         object 
dtypes: float64(1), int64(6), object(9)
memory usage: 5.9+ GB


In [27]:
# Count rows that are exact copies of an earlier row (True) versus
# rows that are not (False)
is_full_duplicate = trade_matrix.duplicated()
is_full_duplicate.value_counts()

False    49159659
Name: count, dtype: int64

In [6]:
# Remove the columns that are not needed for the rest of the notebook
unneeded_columns = [
    'Reporter Country Code',
    'Reporter Country Code (M49)',
    'Partner Country Code',
    'Partner Country Code (M49)',
    'Item Code (CPC)',
    'Year Code',
    'Element Code',
    'Flag',
]
trade_matrix.drop(columns=unneeded_columns, inplace=True)

In [7]:
# Switch the remaining column labels to snake_case
snake_case_names = {
    'Reporter Countries': 'reporting_country',
    'Partner Countries': 'partner_country',
    'Item Code': 'item_code',
    'Item': 'item',
    'Element': 'element',
    'Year': 'year',
    'Unit': 'unit',
    'Value': 'value',
}
trade_matrix.rename(columns=snake_case_names, inplace=True)

In [8]:
# List the distinct values found in the 'element' column
element_column = trade_matrix['element']
element_column.unique()

array(['Export Quantity', 'Export Value', 'Import Quantity',
       'Import Value'], dtype=object)

In [9]:
# List the distinct values found in the 'unit' column
unit_column = trade_matrix['unit']
unit_column.unique()

array(['t', '1000 USD', 'An', '1000 An', 'No'], dtype=object)

In [10]:
# Reshape from long to wide: one row per reporter/partner/item/year/unit key,
# one column per 'element' value, summing the values that share a key.
pivot_keys = ['reporting_country', 'partner_country', 'item_code', 'item', 'year', 'unit']
trade_matrix_new = pd.pivot_table(
    trade_matrix,
    values='value',
    index=pivot_keys,
    columns='element',
    aggfunc='sum',
).reset_index()

In [64]:
# Summarise the pivoted frame: row count, column dtypes and memory footprint.
# NOTE(review): the saved output lists the snake_case measure columns although
# the rename cell appears further down - re-run top to bottom to refresh it.
trade_matrix_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41201826 entries, 0 to 41201825
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   reporting_country  object 
 1   partner_country    object 
 2   item_code          int64  
 3   item               object 
 4   year               int64  
 5   unit               object 
 6   export_quantity    float64
 7   export_value       float64
 8   import_quantity    float64
 9   import_value       float64
dtypes: float64(4), int64(2), object(4)
memory usage: 3.1+ GB


In [69]:
# Preview the first five rows of the pivoted table
trade_matrix_new.head(n=5)

Unnamed: 0,reporting_country,partner_country,item_code,item,year,unit,export_quantity,export_value,import_quantity,import_value
0,Afghanistan,Algeria,230,"Cashew nuts, shelled",2016,1000 USD,,23000.0,,
1,Afghanistan,Algeria,230,"Cashew nuts, shelled",2016,t,3000.0,,,
2,Afghanistan,Algeria,561,Raisins,2014,1000 USD,,27000.0,,
3,Afghanistan,Algeria,561,Raisins,2014,t,12460.0,,,
4,Afghanistan,Algeria,723,"Other stimulant, spice and aromatic crops, n.e.c.",2014,1000 USD,,0.0,,


In [16]:
# Collect every row that shares its trade key and export measures with at
# least one other row (keep=False marks all members of a duplicate group)
duplicate_check_columns = [
    'reporting_country',
    'partner_country',
    'year',
    'item',
    'unit',
    'Export Quantity',
    'Export Value',
]
duplicate_mask = trade_matrix_new[duplicate_check_columns].duplicated(keep=False)
duplicates = trade_matrix_new[duplicate_mask]
duplicates.head()

element,reporting_country,partner_country,item_code,item,year,unit,Export Quantity,Export Value,Import Quantity,Import Value


In [66]:
# Switch the pivoted measure columns to snake_case
measure_names = {
    'Export Quantity': 'export_quantity',
    'Export Value': 'export_value',
    'Import Value': 'import_value',
    'Import Quantity': 'import_quantity',
}
trade_matrix_new.rename(columns=measure_names, inplace=True)

In [40]:
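# NOTE(review): disabled scaling step. The saved head() output of the pivoted
# table shows values 1000x the raw data (e.g. 23.0 -> 23000.0) while 'unit'
# still reads '1000 USD', so this was presumably applied in an earlier
# session - either re-enable it (and update 'unit') or refresh the outputs.
#trade_matrix_new.loc[:, 'export_value'] *= 1000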

In [41]:
#trade_matrix_new.loc[:, 'import_value'] *= 1000

In [44]:
#trade_matrix_new.loc[:, 'export_quantity'] *= 1000

In [46]:
#trade_matrix_new.loc[:, 'import_quantity'] *= 1000

In [68]:
# Clear the name of the column index left behind by the pivot
trade_matrix_new.columns.name = None

In [17]:
# Save the reshaped table as a comma-separated file without the row index
trade_matrix_new.to_csv(
    "trade_matrix.csv",
    sep=',',
    index=False,
    encoding='latin-1',
)


In [71]:
# Export the reshaped table to the database.
# `fs.get_engine()` is a project helper; this cell only relies on it returning
# either None or an object that `DataFrame.to_sql` accepts as `con`.
engine = fs.get_engine()
schema = 'capstone_envirolytics'
table_name = 'fao_trade_matrix_new'

if engine is not None:
    try:
        trade_matrix_new.to_sql(
            name=table_name,      # Name of the SQL table
            con=engine,           # Engine or connection
            if_exists='replace',  # Drop the existing table before inserting new values
            schema=schema,        # Target schema defined above
            index=False,          # Do not write the DataFrame index as a column
            chunksize=5000,       # Number of rows written per batch
            method='multi',       # Pass multiple rows in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so a single clause
        # catches everything the previous (Exception, DatabaseError) tuple did.
        # NOTE(review): the saved output ("Can't reconnect until invalid
        # transaction is rolled back") suggests the connection was lost
        # mid-transaction - confirm whether get_engine() hands out a shared
        # connection that needs an explicit rollback before reuse.
        print(error)
        # Discard the handle so it is not reused after a failed export.
        engine = None
else:
    # Previously a missing engine skipped the export without any message.
    print(f"No database engine available; the {table_name} table was not exported.")

Can't reconnect until invalid transaction is rolled back. (Background on this error at: https://sqlalche.me/e/14/8s2b)
