# Scraping the NISAT Database

The Norwegian Initiative on Small Arms Transfers (NISAT) is a database of the international authorised trade in small arms and light weapons.

In [None]:
# Libs
import urllib3
import csv
import re

In [None]:
# --- NISAT Constants ---

# A query URL is assembled as <prefix> + <country code> + <suffix>
exportsPrefixURL = "http://nisatapps.prio.org/Results_SQL.aspx?C1="
exportsSuffixURL = (
    "&C2=-2&p=Exports&Dep1=0&Dep2=False&r=True&W=100&dtl=2"
    "&Y=All%20Years&d=99&t=3&dls=True&csv=True&EY=All%20Years&scp=3"
)

importsPrefixURL = "http://nisatapps.prio.org/Results_SQL.aspx?C1="
importsSuffixURL = (
    "&C2=-2&p=Imports&Dep1=0&Dep2=False&r=True&W=100&dtl=2"
    "&Y=All%20Years&d=99&t=3&dls=True&csv=True&EY=All%20Years&scp=3"
)


# CSV header row that marks the start of the data inside the html page returned by NISAT
header = ",".join((
    'Reporter_Code', 'Reporter_Name', 'Partner_Code', 'Partner_Name',
    'ImportOrExport', 'Year', 'Period_Start', 'Period_End',
    'Weapons_Type', 'Units', 'Value', 'Currency',
    'Licenses_Issued', 'Weight', 'Licenses_Refused', 'AuthOrDel',
    'GovtOrInd', 'Data_Source', 'Reliability', 'Accuracy',
    'SmallArmsOnly', 'Comment', 'GlobalComment',
))
# Number of columns in the header (one more than its number of commas)
nbrCols = header.count(",") + 1

# Path of the csv file listing the country codes
countryCodesFile = "data/world/COW country codes.csv"

In [None]:
# Shared urllib3 pool manager; httpGET sends every request through it
http = urllib3.PoolManager()

def httpGET(url):
    """
        Returns the content of a webpage decoded as UTF-8, or None on failure

        A failure is a urllib3 transport error (raised by the request itself),
        a status other than 200, or a body that is not valid UTF-8.
        Every failure is reported with an "ERROR: ..." message on stdout.
    """

    try:
        r = http.request('GET', url)
    except urllib3.exceptions.HTTPError as err:
        # A single unreachable page must not abort a scrape that loops over
        # all countries: report it and let the caller handle the None result
        print("ERROR: " + str(err))
        return None

    if r.status != 200:
        print("ERROR: " + str(r.status))
        return None

    try:
        # Convert the raw bytes to UTF-8 text
        return r.data.decode('utf-8')
    except UnicodeDecodeError as err:
        # Same contract as the other failures: report and return None
        print("ERROR: " + str(err))
        return None
        
        
def parseContent(html):
    """
        Extracts the csv rows from a NISAT result page

        Returns None when the csv header is not found in the page.
        Otherwise returns the data rows joined with newlines; only the rows
        with as many comma separated fields as the header are kept.
    """

    # everything before the first header is dropped
    pieces = html.split(header)
    if len(pieces) <= 1:
        return None

    # further headers and <BR> tags both become line breaks
    text = "<BR>".join(pieces[1:]).replace("<BR>", "\n")

    # drop the remaining html tags
    text = re.sub('<[^<]+?>', '', text)

    # trim each line, then keep the non-empty ones whose column count matches the header
    trimmed = (line.strip() for line in text.split("\n"))
    kept = [line for line in trimmed if line and line.count(",") + 1 == nbrCols]

    return "\n".join(kept)


def getCountryExports(countryCode):
    """
        Downloads and parses the NISAT exports result page of a country code

        Returns None when the download failed, otherwise the result of parseContent.
    """

    # the country code goes between the prefix and the suffix
    page = httpGET(exportsPrefixURL + str(countryCode) + exportsSuffixURL)

    # nothing to parse if the request failed
    if page is None:
        return None

    return parseContent(page)
    
    
def getCountryImports(countryCode):
    """
        Downloads and parses the NISAT imports result page of a country code

        Returns None when the download failed, otherwise the result of parseContent.
    """

    # the country code goes between the prefix and the suffix
    page = httpGET(importsPrefixURL + str(countryCode) + importsSuffixURL)

    # nothing to parse if the request failed
    if page is None:
        return None

    return parseContent(page)

In [None]:
def buildCountryCodesDict(path=None):
    """
        Reads all the country codes from a csv file and builds a dictionary
        mapping each code (column 'CCode') to its name (column 'StateNme')

        path: csv file to read, defaults to the module-level countryCodesFile

        Blank lines are skipped. If a code appears on several rows, the last row wins.
        Raises ValueError if one of the two columns is missing from the header row.
    """

    if path is None:
        path = countryCodesFile

    with open(path, 'r', newline='', encoding="utf-8") as csvfile:

        # init reader
        reader = csv.reader(csvfile, delimiter=',')

        # Locate the useful columns from the first row of the file
        columns = next(reader)
        ind_cc = columns.index('CCode')
        ind_name = columns.index('StateNme')

        # csv.reader yields an empty list for a blank line: skip those
        # instead of failing on the column index
        return {row[ind_cc]: row[ind_name] for row in reader if row}

In [None]:
# Country code dict: maps each 'CCode' to its 'StateNme', read from countryCodesFile
ccDictionary = buildCountryCodesDict()

In [None]:
# Set the output path
output_path = "data/nisat/arms_trades_exports.csv"

# Scrape the exports of every known country into a single csv file
with open(output_path, 'w+', newline='', encoding="utf-8") as outfile:

    # write header
    outfile.write(header + "\n")

    # go through countries (code -> name)
    for key, cName in ccDictionary.items():

        # get data for that country
        cContent = getCountryExports(key)

        # if failed
        if cContent is None:
            print("ERROR: " + str(cName))
            continue

        # write to file; a page without any data row parses to an empty
        # string, which must not add a blank line to the csv
        if cContent:
            outfile.write(cContent + "\n")

        # success
        print("DONE: " + str(cName))
            


In [None]:
# Set the output path
output_path = "data/nisat/arms_trades_imports.csv"

# Scrape the imports of every known country into a single csv file
with open(output_path, 'w+', newline='', encoding="utf-8") as outfile:

    # write header
    outfile.write(header + "\n")

    # go through countries (code -> name)
    for key, cName in ccDictionary.items():

        # get data for that country
        cContent = getCountryImports(key)

        # if failed
        if cContent is None:
            print("ERROR: " + str(cName))
            continue

        # write to file; a page without any data row parses to an empty
        # string, which must not add a blank line to the csv
        if cContent:
            outfile.write(cContent + "\n")

        # success
        print("DONE: " + str(cName))