<a href="https://colab.research.google.com/github/jmelendezgeo/Exploratory-analysis-/blob/main/ImportExportColombia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Note: This notebook uses datasets saved in my repository. The data is also available on the source pages.

In this notebook we will work mainly with a historical record of Colombia's imports and exports, and we will associate it with the respective products.

- Loading the data sets
- Cleaning, joining and preparation
- Data Visualization
- Storytelling and insights

# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.io import output_notebook

# Data Load

In [3]:
# Every raw table is a pipe-delimited CSV file hosted on GitHub.
# The order of this list matches the order of the names unpacked below.
table_urls = [
    'https://raw.githubusercontent.com/jmelendezgeo/Data-Triathlon/main/colombia_exports.csv',   # Colombia exports
    'https://raw.githubusercontent.com/jmelendezgeo/Data-Triathlon/main/colombia_imports.csv',   # Colombia imports
    'https://raw.githubusercontent.com/jmelendezgeo/Data-Triathlon/main/country_names.csv',      # country names
    'https://raw.githubusercontent.com/jmelendezgeo/Data-Triathlon/main/groups_sitc_rev2.csv',   # SITC groups
    'https://raw.githubusercontent.com/jmelendezgeo/Data-Triathlon/main/products_sitc_rev2.csv', # SITC products
]

# Download the tables one after another, in the order listed above.
tables = []
for url in table_urls:
    tables.append(pd.read_csv(url, sep = '|'))

colombia_exports, colombia_imports, country_names, groups_sitc, products_sitc = tables
del tables  # the individual names above are the only handles we need



# Basic dataset info
**Exports**
```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389596 entries, 0 to 389595
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    389596 non-null  int64  
 1   Unnamed: 0.1  389596 non-null  int64  
 2   year          389596 non-null  int64  
 3   origin        389596 non-null  object 
 4   dest          389596 non-null  object 
 5   sitc4         389596 non-null  int64  
 6   export_val    389596 non-null  float64
dtypes: float64(1), int64(4), object(2)
memory usage: 20.8+ MB
```
**Imports**

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560492 entries, 0 to 560491
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    560492 non-null  int64  
 1   Unnamed: 0.1  560492 non-null  int64  
 2   year          560492 non-null  int64  
 3   origin        560492 non-null  object 
 4   dest          560492 non-null  object 
 5   sitc4         560492 non-null  int64  
 6   export_val    560492 non-null  float64
dtypes: float64(1), int64(4), object(2)
memory usage: 29.9+ MB
```
**Products**
```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988 entries, 0 to 987
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  988 non-null    int64 
 1   id          988 non-null    object
 2   sitc        988 non-null    int64 
 3   name        988 non-null    object
dtypes: int64(2), object(2)
memory usage: 31.0+ KB
```
**Groups**
```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  65 non-null     int64 
 1   id          65 non-null     int64 
 2   category    65 non-null     object
dtypes: int64(2), object(1)
memory usage: 1.6+ KB
```

**Countries**

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  263 non-null    int64 
 1   id          263 non-null    object
 2   id_3char    263 non-null    object
 3   name        263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.3+ KB
```














In [None]:
def create_continent(acr):
  """Map a country id to the name of its continent.

  The continent is identified by the first two characters of ``acr``
  ('af', 'as', 'eu', 'na', 'oc' or 'sa').

  Parameters
  ----------
  acr : str
      Country id whose first two letters are the continent code.

  Returns
  -------
  str
      The continent name, or 'Other' when the two-letter prefix is not
      recognised or ``acr`` is not a string (for example ``None`` or a
      float ``NaN`` standing in for a missing value).
  """
  continents = {
      'af': 'Africa',
      'as': 'Asia',
      'eu': 'Europe',
      'na': 'North America',
      'oc': 'Oceania',
      'sa': 'South America',
  }

  # Missing values arrive as None or float NaN, which cannot be sliced;
  # report them as 'Other' instead of raising a TypeError.
  if not isinstance(acr, str):
    return 'Other'

  return continents.get(acr[:2], 'Other')

In [None]:
# We want to drop unnecessary columns and give the remaining ones consistent names.
# Each trade frame is renamed first and then sliced by its final column names:
# `rename` ignores labels that are already gone, so this cell can be re-run
# without the KeyError on 'sitc4' that selecting before renaming would raise.
columns_to_keep = ['year','origin','dest','sitc4','export_val']
export_renames = {'sitc4':'sitc'}
import_renames = {'sitc4':'sitc', 'export_val':'import_val'}
export_columns = [export_renames.get(col, col) for col in columns_to_keep]
import_columns = [import_renames.get(col, col) for col in columns_to_keep]

colombia_exports = colombia_exports.rename(columns=export_renames)[export_columns]
colombia_imports = colombia_imports.rename(columns=import_renames)[import_columns]
country_names = country_names[['id','id_3char','name']]
# `assign` builds a new frame, so the id column is converted without writing
# into a slice of the original frame (which emits SettingWithCopyWarning).
groups_sitc = (groups_sitc[['id','category']]
               .assign(id=lambda groups: groups['id'].astype(str).str.zfill(2))) # group code has 2 characters
products_sitc = products_sitc[['id','sitc','name']]

In [None]:
# Joining information
def _join_trade_info(trade, continent_column):
  """Attach country, continent, product and SITC group details to trade rows.

  Parameters
  ----------
  trade : pandas.DataFrame
      Trade records with at least the 'dest' and 'sitc' columns.
  continent_column : str
      Name of the column that receives the continent returned by
      `create_continent` for the matched country id.

  Returns
  -------
  pandas.DataFrame
      `trade` inner-joined with `country_names`, `products_sitc` and
      `groups_sitc`. The helper key columns ('id_3char', 'id' and 'sitc')
      are dropped; because every join is an inner join, rows without a
      match in any of the three lookup tables are dropped as well.
  """
  # Country lookup: 'dest' holds the 3-character code stored in 'id_3char'.
  with_country = (pd.merge(trade, country_names, how = 'inner',
                           left_on = 'dest', right_on = 'id_3char')
                    .drop(columns='id_3char'))
  with_country[continent_column] = with_country['id'].map(create_continent)
  with_country = (with_country
                  .rename(columns={'name':'country name'})
                  .drop(columns='id'))

  # Product lookup on the product-level sitc code.
  with_product = (pd.merge(with_country, products_sitc[['sitc','name']], how='inner', on = 'sitc')
                    .rename(columns={'name':'product name'}))

  # Pad the code to 4 characters, then keep the first 2: the group sitc code.
  # expand=False returns a Series rather than a one-column DataFrame.
  with_product['sitc'] = (with_product['sitc']
                          .astype(str)
                          .str.zfill(4)
                          .str.extract(r'(^\d{2})', expand=False))

  return (pd.merge(with_product, groups_sitc, how='inner', left_on='sitc', right_on='id')
            .drop(columns=['sitc','id']))


# -------------------------- EXPORTS DF-------------------------------------
exports_df = _join_trade_info(colombia_exports, 'to continent')

# --------------------------- IMPORTS DF ---------------------------------------
# NOTE(review): imports are matched to countries through 'dest', exactly like
# exports. If 'dest' is Colombia in the imports table, the partner country is in
# 'origin' and 'from continent' would hold a single value - confirm against the data.
imports_df = _join_trade_info(colombia_imports, 'from continent')



In [None]:
# Number of export records per destination continent, largest first.
(exports_df
 .groupby('to continent')['year']
 .count()
 .sort_values(ascending=False))