In [None]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import numpy as np
import pandas as pd

In [None]:
# Configure logging through the HDX utilities helper before any HDX call is made.
setup_logging()

In [None]:
# Create the HDX configuration: target the 'prod' site in read-only mode (hdx_read_only=True),
# identifying this notebook with the 'A_Quick_Example' user agent.
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

In [None]:
# Read one dataset from HDX by its name, then show the Python type and the content of the result.
dataset = Dataset.read_from_hdx('acled-data-for-south-africa')
print(type(dataset))
print(dataset)

In [None]:
from hdx.facades.simple import facade

In [None]:
# Ask the dataset object read above for its tags.
tags = dataset.get_tags()

In [None]:
# Show the tags retrieved from the dataset.
print(tags)

In [None]:
# Ask the dataset object read above for its organization.
organization = dataset.get_organization()

In [None]:
# Show the organization retrieved from the dataset.
print(organization)

In [None]:
#uploading a new dataset
from fastText import load_model
from os import path, getcwd
import re, string
import pickle

'''
This program reads an untagged dataset and tags it using a trained classifier.

Input: Raw .xlsx file without tags from the HDX.
NOTE: This PoC has been written for .xlsx files but could easily be rewritten to handle other formats

Output: The same .xlsx but with an additional row containing the predicted hashtags
'''


def split_punctuation(value):
    """Return ``value`` with every ASCII punctuation character turned into a space."""
    punctuation_to_space = {ord(mark): " " for mark in string.punctuation}
    return value.translate(punctuation_to_space)


def lower_case_cond(value):
    """Lower-case only the whitespace-separated words that are entirely upper case.

    The words are re-joined with single spaces, so runs of whitespace collapse.
    """
    return ' '.join(
        token.lower() if token.isupper() else token
        for token in value.split()
    )


def split_uppercase(value):
    """Return ``str(value)`` with a space inserted in front of every capital letter A-Z."""
    text = str(value)
    return re.sub(pattern=r'([A-Z])', repl=r' \1', string=text)


def remove_excess_whitespace(value):
    """Collapse each run of whitespace to one space and drop leading/trailing whitespace."""
    words = value.split()
    return ' '.join(words)


def format_header(header):
    """Normalise a column header into lower-case, space-separated words.

    The header is converted to ``str`` and passed, in order, through
    punctuation splitting, lower-casing of all-caps words, splitting on capital
    letters and whitespace collapsing; the result is finally lower-cased.
    """
    cleaned = str(header)
    pipeline = (
        split_punctuation,
        lower_case_cond,
        split_uppercase,
        remove_excess_whitespace,
    )
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned.lower()


pretrained_fasttext_model = 'wiki.en.bin'   # https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip
input_file = "ao-airports.xlsx"

# dirname of the literal string '__file__' is '', so paths below are relative to the working directory
d = path.dirname('__file__')
untagged_path = path.join(d, "Unlabeled Test Data", input_file)    # Path to untagged dataset
df = pd.read_excel(untagged_path)

# Preprocessing: clean every column label of the untagged sheet
headers = list(map(format_header, list(df)))

# Load word embedding model for feature generation
fastText_model = load_model(pretrained_fasttext_model)
print("Pre-trained model loaded successfully!\n")

# Convert dataset headers into word embeddings (one plain list per header)
headers = [fastText_model.get_sentence_vector(header_text).tolist() for header_text in headers]

In [None]:
def split_uppercase(value):
    """Return ``str(value)`` with a space inserted in front of every capital letter A-Z."""
    text = str(value)
    return re.sub(pattern=r'([A-Z])', repl=r' \1', string=text)


def lower_case_cond(value):
    """Lower-case only the whitespace-separated words that are entirely upper case.

    The words are re-joined with single spaces, so runs of whitespace collapse.
    """
    return ' '.join(
        token.lower() if token.isupper() else token
        for token in value.split()
    )


def split_punctuation(value):
    """Return ``value`` with every ASCII punctuation character turned into a space."""
    punctuation_to_space = {ord(mark): " " for mark in string.punctuation}
    return value.translate(punctuation_to_space)


def remove_excess_whitespace(value):
    """Collapse each run of whitespace to one space and drop leading/trailing whitespace."""
    words = value.split()
    return ' '.join(words)

input_file = "ao-airports_tagged.xls"
#input_file = "Unlabeled Test Data" + "/" + input_file
output_file = 'cleaned_hxl_data.csv'

# Load the tagged workbook; the first sheet row becomes the column labels.
df2 = pd.read_excel(input_file)

# First data row of the sheet, indexed by the column labels.
# NOTE(review): presumably this is the HXL hashtag row of the tagged workbook - confirm.
label = df2.iloc[0]

# Preview the loaded sheet. This has to be the last expression of the cell:
# a bare `df2.head()` in the middle of a cell is evaluated and silently discarded.
df2.head()

In [None]:
# Build one (label, cleaned header text) pair per spreadsheet column.
#
# `label` is row 0 of df2, i.e. a Series indexed by the COLUMN labels. The header text
# therefore has to be derived from the column labels and indexed by them as well.
# The previous version used df2.index (the row numbers), so pd.concat(axis=1) joined
# two objects with unrelated indexes and never paired a header with its label.
#
# format_header applies the same steps the inline .map() chain used, in the same order:
# str -> split_punctuation -> lower_case_cond -> split_uppercase
# -> remove_excess_whitespace -> lower.
header = pd.DataFrame(
    {'Text header': [format_header(column) for column in df2.columns]},
    index=df2.columns,
)

# Both operands are indexed by column label, so rows line up one-to-one.
training_data = pd.concat([label, header], axis=1)

# NOTE(review): quotechar=' ' appears to make the writer wrap/double spaces in every field
# that contains a space (including the 'Text header' column name) - confirm this is intended.
training_data.to_csv(path.join(d, output_file), index=False, sep=',', encoding='utf-8', quotechar=' ')

In [None]:
# Read the cleaned HXL data.
# The frame must be bound to `df`, the name every following line uses; binding it to
# `input_file` left `df` pointing at the raw untagged workbook loaded earlier.
input_file = output_file
df = pd.read_csv(input_file, delimiter=',', encoding='utf-8')

# The cleaning cell writes the header column as 'Text header' (and its quotechar=' ' may pad
# or double the spaces), while this cell addresses it as 'Text_header'. Joining the
# whitespace-separated parts of every column label with '_' reconciles both spellings.
df.columns = ['_'.join(str(column).split()) for column in df.columns]

# Collapse runs of spaces inside each header string
df["Text_header"] = df["Text_header"].map(lambda x: re.sub(' +', ' ', str(x)))

# Get a vector representation of each header
df['Word_embedding'] = df['Text_header'].map(lambda x: fastText_model.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

# Save the vectorized data
# NOTE(review): this overwrites the cleaned CSV in place, while the training cell reads
# 'wordembedding_data.csv' - confirm which file is meant to receive the embeddings.
df.to_csv(output_file, sep=',', encoding='utf-8', index=False)

In [None]:
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns

'''
This program trains an MLP classifier to predict HXL hashtags.
If you are training a classifier on a new dataset, it is advised to first tune the parameters of the model.

Input: .csv file containing hashtags, header strings and their corresponding word embeddings from extract_features.py

Output: A pickled MLP classifier. Also the model is tested on a test set, the classification accuracy is printed 
along with the confusion matrix.
'''


def format_embeddings(embedding):
    """Parse a stringified embedding back into a list of floats.

    Strips the '\\r\\n' line breaks and the square brackets left over from
    feature extraction, then reads the remaining space-separated numbers.
    """
    cleaned = embedding
    for leftover in ('\r\n', '[', ']'):
        cleaned = cleaned.replace(leftover, '')
    vector = np.fromstring(cleaned, dtype=float, sep=' ')
    return vector.tolist()


input_file = 'wordembedding_data.csv'
output_file = 'MLPclassifier.pkl'

# Read data
df = pd.read_csv(input_file, delimiter=',', encoding='utf-8')

df['Class'] = df['Hashtag']
# Embeddings are stored as text in the CSV; turn each one back into a list of floats
df['Word_embedding'] = df['Word_embedding'].map(format_embeddings)

# Remove infrequent classes
threshold = 5   # classes with this many rows or fewer are dropped (note the `<=` below)
class_count = df['Class'].value_counts()
removal = class_count[class_count <= threshold].index
df['Class'] = df['Class'].replace(removal, np.nan)
df = df.dropna()   # drops every row with a missing value, including the classes blanked above

df = df[['Class', 'Word_embedding']].copy()
# Sorted unique class names; used as the label order of the confusion matrix
df_labels = df.Class.unique()
df_labels = np.sort(df_labels, axis=-1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Word_embedding'], df['Class'], test_size=0.33, random_state=0)


# Train the classifier with the parameters as specified.
# random_state pins weight initialisation and shuffling so a re-run reproduces the same model.
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam',
                    random_state=0)
clf.fit(X_train.values.tolist(), y_train.values.tolist())
test_score = clf.score(X_test.values.tolist(), y_test.values.tolist())
print("Classification accuracy on test set: %s" %test_score)

# Confusion matrix
y_pred = clf.predict(X_test.values.tolist())
# `labels` must be passed by keyword: current scikit-learn rejects it as a positional argument
confmatrix = confusion_matrix(y_test.values.tolist(), y_pred, labels=df_labels)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(confmatrix, annot=True, fmt='d', xticklabels=df_labels, yticklabels=df_labels, vmax=80, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Save the trained classifier for later use; the with-block closes the file handle
with open(output_file, 'wb') as model_file:
    pickle.dump(clf, model_file)


In [None]:
# Load the pre-trained classifier.
# NOTE: unpickling can execute arbitrary code - only load a classifier file you created yourself.
with open("MLPclassifier.pkl", 'rb') as model_file:
    clf = pickle.load(model_file)

# Predict one tag per header embedding computed in the tagging cell
tags = clf.predict(headers)

# Reload the untagged workbook. The names `df` and `input_file` were reused by the
# training cell (training frame / 'wordembedding_data.csv'), so without this reload the
# tag row would be inserted into the training data and the output file would be misnamed.
input_file = "ao-airports.xlsx"
df = pd.read_excel(path.join(d, "Unlabeled Test Data", input_file))

# Insert row of tags into the dataset, directly below the header row
df.loc[-1] = tags
df.index = df.index + 1  # shifting index
df = df.sort_index()

# NOTE(review): the workbook is read from "Unlabeled Test Data" but written to
# "../Unlabeled Test Data" - confirm which directory is intended.
# The context manager saves and closes the workbook; ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter(path.join(d,"..","Unlabeled Test Data","Tagged-"+input_file), engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False)