In [None]:
# Libraries
import numpy as np # Matrix calculations
import pandas as pd # Data structures
import matplotlib.pyplot as plt # Graphics
import re # regular expressions

In [None]:
"""
value: The conversion rate is calculated using the average for the year in question, which has
been supplied by Oanda.

"""

In [None]:
# Read the semicolon-delimited weapons code table into a DataFrame
path = "./prio_weapons_code.txt"
weapons_df = pd.read_csv(filepath_or_buffer=path, delimiter=';')

# Report how many rows were loaded
print("Nbr of rows : " + str(weapons_df.shape[0]))

In [None]:
# Location of the arms trade dataset
path = "./arms_trades_imports.csv"
# Comma-separated column names; not referenced again in the visible cells
# (presumably the full column layout of the CSV -- TODO confirm)
header = 'Reporter_Code,Reporter_Name,Partner_Code,Partner_Name,ImportOrExport,Year,Period_Start,Period_End,Weapons_Type,Units,Value,Currency,Licenses_Issued,Weight,Licenses_Refused,AuthOrDel,GovtOrInd,Data_Source,Reliability,Accuracy,SmallArmsOnly,Comment,GlobalComment'

# Load only the columns that the following cells work with
raw_df = pd.read_csv(
    path,
    sep=',',
    usecols=[
        'Reporter_Code',
        'Partner_Code',
        'ImportOrExport',
        'Year',
        'Weapons_Type',
        'Value',
        'Reliability',
        'Accuracy',
    ],
)

# Report how many rows were loaded
print("Nbr of rows : " + str(raw_df.shape[0]))

In [None]:
def formatYear(val):
    """Expand a split "YYYY-YY" year range into two four-digit year strings.

    Parameters
    ----------
    val : list of str, or any other value
        Typically the result of splitting the raw year field on "-".
        Only a two-element list whose second element has exactly two
        characters (e.g. ``['1999', '00']``) is treated as a range.

    Returns
    -------
    list of str
        ``[start, end]`` where ``end`` is the two-digit suffix prefixed with
        the century of ``start``. When ``start`` ends in "99" and the suffix
        is smaller than 99, the range crosses a century boundary and the next
        century is used (``['1999', '00']`` -> ``['1999', '2000']``).
    float
        ``np.nan`` when the suffix is more than one year after the start, or
        when the year tokens are not numeric.
    object
        ``val`` itself, unchanged, when it is not a two-element range.
    """
    is_range = isinstance(val, list) and len(val) == 2 and len(val[1]) == 2
    if not is_range:
        return val

    start, end_suffix = val

    try:
        start_yy = int(start[2:4])
        end_yy = int(end_suffix)
    except (TypeError, ValueError):
        # Malformed tokens are treated like any other unusable year
        return np.nan

    # Ranges that span more than one year are rejected
    if end_yy - start_yy > 1:
        return np.nan

    century = start[0:2]

    # Century rollover (..99 -> ..00): the end year belongs to the next century.
    # Other "backwards" suffixes are kept in the start's century, as before.
    if start_yy == 99 and end_yy < 99:
        try:
            century = str(int(century) + 1).zfill(2)
        except ValueError:
            return np.nan

    return [start, century + end_suffix]

In [None]:
# Encode the categorical columns as integers (values missing from a mapping become NaN)
raw_df['ImportOrExport'] = raw_df['ImportOrExport'].map({'Import':0, 'Export':1})
raw_df['Accuracy'] = raw_df['Accuracy'].map({'Low':0, 'Medium':1, 'High':2})

# Reliability: keep only the part before the first "/" and encode it
raw_df['Reliability'] = (
    raw_df['Reliability']
    .str.split("/")
    .str[0]
    .map({'Pri':0, 'Sec':1})
)

# Year: split on "-" and normalise the resulting pieces with formatYear
raw_df['Year'] = raw_df['Year'].str.split("-").apply(formatYear)

# Weapon type labels in lower case
raw_df['Weapons_Type'] = raw_df['Weapons_Type'].str.lower()

# Report the number of rows
print("Nbr of rows : " + str(raw_df.shape[0]))

In [None]:
def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Parameters
    ----------
    x : object
        Any value, typically a cell from a DataFrame column.

    Returns
    -------
    bool
        True if ``x`` can be converted with ``float()``. False if the
        conversion raises ``ValueError`` (e.g. non-numeric text),
        ``TypeError`` (e.g. ``None`` or a list) or ``OverflowError``
        (an integer too large for a float).
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Only relevant columns
clean_df = raw_df[['Reporter_Code', 'Partner_Code', 'ImportOrExport', 'Value', 'Year']]

# Remove rows with a missing value in any column used downstream.
# 'ImportOrExport' is included because values outside its mapping become NaN,
# and a NaN cannot be cast to int afterwards.
clean_df = clean_df.dropna(subset=['Reporter_Code', 'Partner_Code', 'ImportOrExport', 'Value', 'Year'])

# Remove rows whose codes or value cannot be parsed as a number
numeric_mask = np.ones(len(clean_df), dtype=bool)
for column in ['Reporter_Code', 'Partner_Code', 'Value']:
    numeric_mask &= clean_df[column].apply(is_float).to_numpy(dtype=bool)

# .copy() yields an independent frame, so assigning columns to clean_df later
# does not raise SettingWithCopyWarning
clean_df = clean_df[numeric_mask].copy()

# Print nbr of rows
print("Nbr of rows : " + str(len(clean_df.index)))

In [None]:
# Cast to type.
# astype with a column->dtype mapping returns a new frame, which avoids
# assigning into a filtered slice (SettingWithCopyWarning).
# The codes go through float first so that values such as "840.0", which
# parse as a float but not directly as an int, are still converted.
clean_df = clean_df.astype({
    'Reporter_Code': float,
    'Partner_Code': float,
    'ImportOrExport': int,
    'Value': float,
}).astype({
    'Reporter_Code': int,
    'Partner_Code': int,
})

In [None]:
# Select the rows flagged as imports (ImportOrExport == 0);
# the flag column is constant afterwards, so it is removed
is_import = clean_df['ImportOrExport'] == 0
imports_df = clean_df[is_import].drop(columns=['ImportOrExport'])

In [None]:
def joinYear(year):
    """Join the parts of a year entry into one comma-separated string.

    Parameters
    ----------
    year : str or iterable of str
        Year parts such as ``['1999', '2000']``. A plain string is treated
        as already joined.

    Returns
    -------
    str
        The parts joined with ",". A string input is returned unchanged, so
        applying the function twice (e.g. when the cell is re-run) does not
        split the text into single characters.
    """
    if isinstance(year, str):
        return year
    return ",".join(year)

# Replace each Year entry by the output of joinYear
joined_years = imports_df['Year'].apply(joinYear)
imports_df['Year'] = joined_years

In [None]:
# Report the number of import rows
print("Nbr of rows : " + str(imports_df.shape[0]))

# Show the first ten rows as the cell output
imports_df.head(n=10)

In [None]:
# Write the preprocessed imports to a semicolon-separated file.
# The index option is left at its default, so the row labels are written too.
output_path = "preproc_arms_import.csv"
imports_df.to_csv(path_or_buf=output_path, sep=';')