In [None]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import os
import numpy as np
import pandas as pd


In [None]:
# Set up logging through the HDX helper before any HDX call is made.
setup_logging()
# Create the HDX configuration for this session: 'prod' site, read-only
# access, and a user agent string identifying this notebook.
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

In [None]:
#Check if the dataset has at least 1 resource of the required file type(s).

def check_type(dataset, file_types=None):
    """Tell whether an HDX dataset has a resource of one of the given types.

    Args:
        dataset: Identifier handed to ``Dataset.read_from_hdx``.
        file_types: File type names to look for. ``None`` or an empty
            collection means "any type". A single string is treated as one
            file type (not as a collection of characters).

    Returns:
        bool: ``True`` when the dataset has at least one resource and, if
        ``file_types`` is non-empty, at least one resource type is listed in
        it. ``False`` otherwise, including when the dataset cannot be read.
    """
    # Normalise the filter: avoid a mutable default argument, and stop a bare
    # string such as 'CSV' from being compared character by character.
    if file_types is None:
        file_types = ()
    elif isinstance(file_types, str):
        file_types = (file_types,)

    temp_dataset = Dataset.read_from_hdx(dataset)
    if temp_dataset is None:
        # Nothing was read for this identifier, so there are no resources.
        return False
    # Kept from the original: refresh the dataset's resource list before
    # inspecting it.
    temp_dataset.separate_resources()

    if len(temp_dataset.resources) == 0:
        return False
    if len(file_types) == 0:
        # No filter requested: having any resource at all is enough.
        return True
    return not set(temp_dataset.get_filetypes()).isdisjoint(file_types)

In [None]:
# Check that the dataset's organization is not the HXL organization itself (True = not provided by HXL)

def check_organization(dataset):
    """Report whether the dataset comes from an organization other than HXL.

    Reads the 'title' entry of ``dataset.get_organization()`` and compares it
    with the literal title 'Humanitarian Exchange Language(HXL)'.

    Returns:
        bool: ``True`` when the titles differ, ``False`` when they match.
    """
    hxl_org_title = 'Humanitarian Exchange Language(HXL)'
    owner_title = dataset.get_organization()['title']
    return True if owner_title != hxl_org_title else False

In [None]:
# Download one dataset with certain type(s), read it into Dataframe, 
# add all headers, tags and dataset names to our DataFrame,
# and delete the dataset

def process_dataset(dataset, file_type, dataframe, download_path, index):
    """Download one resource of ``dataset``, record its headers, delete the file.

    The resource is read as CSV. For every column, one row holding the column
    header, the value found in the first data row (presumably the HXL hashtag
    row - confirm), the downloaded file name and ``index`` is appended to the
    module-level ``headers_and_tags`` DataFrame. The module-level ``count`` is
    incremented once per dataset whose preview has more than two rows.

    Args:
        dataset: Object exposing ``resources`` and ``get_filetypes()``.
        file_type: File type to download (e.g. 'CSV'), or ``None`` to take
            the first resource whatever its type.
        dataframe: Currently unused; rows always go to the global
            ``headers_and_tags``. Kept so existing calls keep working.
        download_path: Folder passed to the resource's ``download`` method.
        index: Value stored in the 'Index' column of every appended row.

    Returns:
        ``None`` when the dataset was processed or skipped; an error message
        string when no suitable resource exists or the download/read failed.
    """
    global count

    # Download one resource and read it into a DataFrame.
    if file_type is None:
        if not dataset.resources:
            return 'Error: Required file type not in dataset OR dataset does not contain any resources.'
        # Fixed: this branch used to reference an undefined name `dataset_1`.
        url, path = dataset.resources[0].download(download_path)
        pandas_dataset = pd.read_csv(path)
    else:
        filetypes = dataset.get_filetypes()
        if file_type not in filetypes:
            return 'Error: Required file type not in dataset OR dataset does not contain any resources.'
        try:
            # The position of the type in get_filetypes() is used as the
            # position of the resource in dataset.resources.
            url, path = dataset.resources[filetypes.index(file_type)].download(download_path)
            print('Resource URL %s downloaded to %s' % (url, path))
            pandas_dataset = pd.read_csv(path, encoding='latin-1')
        except Exception as error:
            # Best effort: one bad dataset must not stop a batch, but the
            # cause is reported instead of being silently discarded.
            print('Error while downloading or reading resource: %s' % error)
            return 'Unknown error.'

    file_name = os.path.basename(path)
    # Files whose name mentions HXL are skipped.
    # NOTE(review): the downloaded file is left on disk in this case
    # (unchanged behavior) - confirm whether it should be removed as well.
    if "HXL" in file_name or "hxl" in file_name:
        return
    dataset_df = pandas_dataset.head()

    # Add headers and tags to the global DataFrame.
    if len(dataset_df) > 2:
        headers = list(dataset_df.columns.values)
        tags = list(dataset_df.iloc[0, :])
        for header, tag in zip(headers, tags):
            try:
                dic = {'Header': header, 'Tag': tag, 'Dataset_name': file_name, 'Index': index}
                headers_and_tags.loc[len(headers_and_tags)] = dic
            except Exception:
                print("Error: different number of headers and tags")
        count += 1
    os.remove(path)
    print("File Removed!")
    return

In [None]:
# Search for all datasets with HXL tags

# Query HDX with the search string 'HXL' and keep the returned datasets.
# NOTE(review): this passes a plain query string; presumably it matches more
# than just the tag field - confirm against the HDX search behavior.
datasets_HXL = Dataset.search_in_hdx('HXL')
# Bare last expression: shows how many datasets the search returned.
len(datasets_HXL)

In [None]:
# Create a DataFrame for all headers and tags

# Column layout shared by every row that process_dataset appends.
col_names = [
    'Header',
    'Tag',
    'Dataset_name',
    'Index',
]
# Start with an empty frame; rows are added one at a time later on.
headers_and_tags = pd.DataFrame(data=None, columns=col_names)

In [None]:
# Create a temp DataFrame

# Same four columns as headers_and_tags.
col_names = [
    'Header',
    'Tag',
    'Dataset_name',
    'Index',
]
# Empty scratch frame; it is handed to process_dataset as its `dataframe` argument.
temp = pd.DataFrame(data=None, columns=col_names)

In [None]:
#This is the template for replicating the processing function multiple times
#for i in range(1000):
    #process_dataset(datasets_HXL[i], 'CSV', headers_and_tags, './datasets', count)


In [None]:
# Take the search results at positions 1000 to 1499 (the slice end is exclusive).
# NOTE(review): the bounds are hard-coded; presumably other batches were
# processed in separate runs - confirm before re-running from a fresh kernel.
datasets_HXL_2 = datasets_HXL[1000:1500]

In [None]:
# Starting value of the global counter that process_dataset increments; the
# current value is also stored as the 'Index' of each row it appends.
# NOTE(review): 665 is a hard-coded starting point, presumably carried over
# from an earlier batch - confirm.
count = 665
for i, hdx_dataset in enumerate(datasets_HXL_2):
    # `count` is re-read on every iteration, so each call sees the latest value.
    process_dataset(hdx_dataset, 'CSV', temp, 'HDX File Holder', count)

In [None]:
# Save the collected rows to an Excel workbook in the working directory.
# The row index is written as well, which is why the cells that read the
# file back drop an 'Unnamed: 0' column.
headers_and_tags.to_excel("headerandtag.xlsx")

In [None]:
# Load the saved workbook back into a DataFrame; the grouping cells that
# follow work from this copy.
hxl_file = pd.read_excel("headerandtag.xlsx")


In [None]:
# Number of headers recorded for each full tag (attributes included),
# most frequent tag first.
hxl_group1 = (
    hxl_file
    .groupby(by=['Tag'])
    .count()
    .drop(columns=['Unnamed: 0', 'Dataset_name', 'Index'])
    .sort_values(by=['Header'], ascending=False)
)
# For each full tag, the headers it appeared with, joined by commas.
hxl_group2 = (
    hxl_file
    .groupby(['Tag'])['Header']
    .apply(lambda header_values: ','.join(header_values))
)


In [None]:
##Splits HXL Tags into Tags and additional attributes
def splitter_tag(string):
    """Cut a tag string at every '+' and return the pieces as a list.

    The first piece is the text before the first '+'; the remaining pieces
    are whatever followed each '+'. A string without '+' comes back as a
    one-element list.
    """
    delimiter = '+'
    pieces = string.split(delimiter)
    return pieces

In [None]:
# Work on the full tag column of the reloaded data.
tag_column = hxl_file['Tag']
# Turn every cell into text first so that non-string cells can be split too.
stringed = tag_column.map(str)
# Each entry becomes a list: the part before the first '+', then the rest.
splitted_tag = stringed.map(splitter_tag)

In [None]:
# Keep only the first piece of every split tag, i.e. the text before any '+'.
tags_only = [tag_parts[0] for tag_parts in splitted_tag]

In [None]:
#Gives a count in descending order of only the HXL Tags in the Dataset
# Adds a 'Tags' column (tag without attributes) to hxl_file in place.
hxl_file['Tags'] = tags_only
# Count the headers per bare tag, largest count first.
hxl_tag_count = (
    hxl_file
    .drop(columns=['Unnamed: 0', 'Tag', 'Dataset_name', 'Index'])
    .groupby(['Tags'])
    .count()
    .sort_values(by='Header', ascending=False)
)


In [None]:
#Groups the Headers by the Tags they are associated with
# One comma-separated string of headers per bare tag (needs the 'Tags' column).
headers_by_tag = (
    hxl_file
    .groupby(['Tags'])['Header']
    .apply(lambda header_values: ','.join(header_values))
)

In [None]:
#Saves the above as a CSV
# Write the per-tag header lists to 'headers_by_tag.csv' in the working directory.
headers_by_tag.to_csv('headers_by_tag.csv')