In [None]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

In [None]:
# Initialise logging with the helper imported from hdx.utilities.easy_logging (called with its defaults).
setup_logging()

In [None]:
# Create the HDX configuration: target the 'prod' site, request read-only access and
# identify this client with the user agent 'A_Quick_Example'.
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

In [None]:
# Read a single dataset from HDX by its identifier and print whatever get_dataset_date() returns for it.
dataset = Dataset.read_from_hdx('acled-conflict-data-for-africa-1997-lastyear')
print(dataset.get_dataset_date())

In [None]:
# Search HDX for datasets matching 'ACLED' (rows=10 is passed to the search call) and print the result.
datasets = Dataset.search_in_hdx('ACLED', rows=10)
print(datasets)
# Collect the resources belonging to the datasets found above and print them.
resources = Dataset.get_all_resources(datasets)
print(resources)

In [None]:
# Download the first resource returned by the search.
# The local file location is bound to `download_path` instead of `path`: other cells in this
# notebook use `path` for the module imported with `from os import path`, and rebinding that
# name to a string here breaks them whenever cells are executed out of order.
url, download_path = resources[0].download()
print('Resource URL %s downloaded to %s' % (url, download_path))

In [None]:
import pandas as pd
from os import path
import re, string


'''
This program pre-processes the data consisting of the headers of HDX datasets.
The idea is to prepare the strings for the word embedding model which works best if words are separated by 
blank space. Thus the main pre-processing steps are to split the strings on punctuation characters, split on
single capital letters, lowercase everything and remove excess whitespace.
Input: .xlsx file containing at least the columns 'Hashtag' and 'Text header'. It is recommended that the input file
be deduplicated so as not to include repetitions of identical file structures.
Output: .csv file where each row contains a hashtag and a cleaned header string
'''


def split_uppercase(value):
    """Return str(value) with a single space inserted before every ASCII capital letter."""
    pieces = []
    for char in str(value):
        # Only the ASCII range A-Z counts as "uppercase" here
        if 'A' <= char <= 'Z':
            pieces.append(' ')
        pieces.append(char)
    return ''.join(pieces)


def lower_case_cond(value):
    """Lowercase only the whitespace-separated words that are entirely uppercase.

    Words are re-joined with single spaces, so runs of whitespace collapse as a side effect.
    """
    return ' '.join(
        token.lower() if token.isupper() else token
        for token in value.split()
    )


def split_punctuation(value):
    """Replace every character listed in string.punctuation with a single space."""
    to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    return value.translate(to_space)


def remove_excess_whitespace(value):
    """Trim the string and collapse every run of whitespace into one space."""
    tokens = value.split()
    return ' '.join(tokens)


input_file = 'hxl-hashtags-and-headers-DEDUPLICATED-20180807.xlsx'
output_file = 'cleaned_hxl_data.csv'


def clean_header(value):
    """Run one raw header value through every pre-processing step, in order.

    The order matters: all-uppercase words are lowercased BEFORE splitting on capitals,
    otherwise an acronym such as 'GDP' would be split into single letters.
    """
    text = split_punctuation(str(value))
    text = lower_case_cond(text)
    text = split_uppercase(text)
    text = remove_excess_whitespace(text)
    return text.lower()


# Both files are resolved relative to the working directory. The previous version joined the
# output path with an undefined variable `d` (its assignment was commented out because
# `__file__` does not exist in a notebook), which raised NameError before anything was written.
df = pd.read_excel(input_file)
label = df[['Hashtag']]

df['Text header'] = df['Text header'].map(clean_header)

# The feature-extraction step reads the column as 'Text_header', so export it under that name.
header = df[['Text header']].rename(columns={'Text header': 'Text_header'})

training_data = pd.concat([label, header], axis=1)
# Default quoting is used on purpose: with quotechar=' ' the CSV writer treats the space as a
# quote character, so any field containing a space is wrapped in, and padded with, extra spaces.
training_data.to_csv(output_file, index=False, sep=',', encoding='utf-8')

In [None]:
from fastText import load_model
import pandas as pd
import re


'''
This program extracts features from the cleaned HXL headers by converting them to word embeddings.
The word embeddings used here are 300-dim fastText embeddings. They are loaded from a large (~10GB) fastText model
which can be downloaded here: https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip
Note that the .zip contains a .bin file and a .vec file. These are different formats for storing the fastText model,
use the .bin file whenever possible.
Input: .csv file where each row contains a hashtag and a cleaned header string
Output: .csv file containing hashtags, header strings and their corresponding word embeddings
NOTE: this output formatting is not ideal and currently has to be handled ad hoc in the program which trains the ML 
model. It should be changed to something more suitable for storing large vectors, e.g. .xml, .pickle, etc.
'''


# File locations used by this step
input_file = 'cleaned_hxl_data.csv'
output_file = 'wordembedding_data.csv'
pretrained_fasttext_model = 'wiki.en.bin'   # https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip


def _single_spaced(text):
    """Collapse every run of spaces in str(text) into one space."""
    return re.sub(' +', ' ', str(text))


def _embed(text):
    """Return the fastText sentence vector for str(text)."""
    return fastText_model.get_sentence_vector(str(text))


# The pre-trained fastText model provides the sentence vectors
fastText_model = load_model(pretrained_fasttext_model)
print("Pre-trained model loaded successfully!\n")

# Load the cleaned HXL headers and normalise their spacing
df = pd.read_csv(input_file, delimiter=',', encoding='utf-8')
df["Text_header"] = df["Text_header"].map(_single_spaced)

# One embedding vector per header
df['Word_embedding'] = df['Text_header'].map(_embed)
print("Word embeddings extracted!\n")

# Persist hashtags, headers and embeddings together
df.to_csv(output_file, sep=',', encoding='utf-8', index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import pickle

'''
This program tunes the parameters of the Multilayer Perceptron model through a crossvalidated gridsearch.
Input: .csv file containing hashtags, header strings and their corresponding word embeddings from extract_features.py
Output: A pickled MLP classifier. Also the optimal parameter values are printed.
'''


def format_embeddings(embedding):
    """Turn the textual form of an embedding stored in the CSV into a list of floats.

    Windows line breaks and the surrounding square brackets are stripped before the
    remaining space-separated numbers are parsed.
    """
    for unwanted in ('\r\n', '[', ']'):
        embedding = embedding.replace(unwanted, '')
    vector = np.fromstring(embedding, dtype=float, sep=' ')
    return vector.tolist()


input_file = 'wordembedding_data.csv'
output_file = 'MLPclassifier.pkl'


# Read and process data
df = pd.read_csv(input_file, delimiter=',', encoding='utf-8')

df['Class'] = df['Hashtag']
# Embeddings were stored as text in the CSV; turn them back into lists of floats
df['Word_embedding'] = df['Word_embedding'].map(format_embeddings)

# Remove infrequent classes: a class is kept only if it has MORE than `threshold` rows
# (classes whose count is <= threshold are blanked out and dropped).
threshold = 5
class_count = df['Class'].value_counts()
removal = class_count[class_count <= threshold].index
df['Class'] = df['Class'].replace(removal, np.nan)
# NOTE: dropna() looks at every column, so rows with a missing value elsewhere are dropped too
df = df.dropna()

df = df[['Class', 'Word_embedding']].copy()

X = df['Word_embedding'].values.tolist()
y = df['Class'].values.tolist()


# Parameter grid to search through
param_grid = [
    {
        'solver' : ['adam', 'lbfgs'],
        'alpha' : [0.001, 0.01, 0.1],
        'hidden_layer_sizes' : [50, 75, 100, 150, 200],
        'activation' : ['tanh', 'relu']
    }
]

# Tune parameters with a 3-fold cross-validated grid search scored on accuracy
clf = GridSearchCV(MLPClassifier(), param_grid, cv=3, scoring='accuracy', verbose=10)
clf.fit(X,y)
print("Best parameters set found on development set:")
print(clf.best_params_)

# Save trained classifier; the context manager guarantees the file is flushed and closed
with open(output_file, 'wb') as model_file:
    pickle.dump(clf, model_file)


# 2018-08-20: Good parameter choices found to be:
# 'activation': 'relu', 'alpha': 0.001, 'epsilon': 1e-08, 'hidden_layer_sizes': 150, 'solver': 'adam'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pickle

'''
This program trains an MLP classifier to predict HXL hashtags.
If you are training a classifier on a new dataset, it is advised to first tune the parameters of the model.
Input: .csv file containing hashtags, header strings and their corresponding word embeddings from extract_features.py
Output: A pickled MLP classifier. Also the model is tested on a test set, the classification accuracy is printed 
along with the confusion matrix.
'''


def format_embeddings(embedding):
    """Parse an embedding that was written to CSV as text back into a list of floats."""
    # Drop Windows line breaks first, then the enclosing square brackets
    without_breaks = embedding.replace('\r\n', '')
    numbers_only = without_breaks.translate({ord('['): None, ord(']'): None})
    return np.fromstring(numbers_only, dtype=float, sep=' ').tolist()


input_file = 'wordembedding_data.csv'
output_file = 'MLPclassifier.pkl'

# Read data
df = pd.read_csv(input_file, delimiter=',', encoding='utf-8')

df['Class'] = df['Hashtag']
# Embeddings were stored as text in the CSV; turn them back into lists of floats
df['Word_embedding'] = df['Word_embedding'].map(format_embeddings)

# Remove infrequent classes: a class is kept only if it has MORE than `threshold` rows
# (classes whose count is <= threshold are blanked out and dropped).
threshold = 5
class_count = df['Class'].value_counts()
removal = class_count[class_count <= threshold].index
df['Class'] = df['Class'].replace(removal, np.nan)
# NOTE: dropna() looks at every column, so rows with a missing value elsewhere are dropped too
df = df.dropna()

df = df[['Class', 'Word_embedding']].copy()
# Sorted class labels fix the row/column order of the confusion matrix
df_labels = df.Class.unique()
df_labels = np.sort(df_labels, axis=-1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Word_embedding'], df['Class'], test_size=0.33, random_state=0)


# Train the classifier with the parameters as specified
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')
clf.fit(X_train.values.tolist(), y_train.values.tolist())
test_score = clf.score(X_test.values.tolist(), y_test.values.tolist())
print("Classification accuracy on test set: %s" %test_score)

# Confusion matrix
y_pred = clf.predict(X_test.values.tolist())
# `labels` must be passed by keyword: recent scikit-learn releases reject it as a positional argument
confmatrix = confusion_matrix(y_test.values.tolist(), y_pred, labels=df_labels)
fig, ax = plt.subplots(figsize=(10,10))
# Draw on the axes created above instead of relying on pyplot's implicit "current axes"
sns.heatmap(confmatrix, annot=True, fmt='d', xticklabels=df_labels, yticklabels=df_labels, vmax=80, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Save the trained classifier for later use; the context manager guarantees the file is closed
with open(output_file, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
import os
from os import path

import matplotlib.pyplot as plt
import pandas as pd
import wordcloud

'''
This program generates wordclouds of the table headers associated with different HXL hashtags.
Input: .csv file containing at least the columns 'Hashtag' and 'Text header'
Output: A set of .png figures of the word clouds for the hashtags
'''


input_file = "hdx-hashtags-list.csv"    # This is the raw HXL csv I got from David Megginson


# `__file__` only exists when this code runs as a script. Inside a notebook kernel it is
# undefined, so fall back to '' which makes every path below relative to the working directory.
d = path.dirname(__file__) if '__file__' in globals() else ''

# savefig() does not create missing directories, so make sure the target folder exists
output_dir = path.join(d, "wordcloud")
os.makedirs(output_dir, exist_ok=True)

# Read and process data: normalise column names to lower_snake_case, lowercase the headers
# and turn underscores inside the headers into spaces
df = pd.read_csv(input_file)
df.columns = df.columns.str.lower()
cols = df.columns
cols = cols.map(lambda x: x.replace(' ', '_') if isinstance(x, (str, bytes)) else x)
df.columns = cols
df["text_header"] = df["text_header"].str.lower()
df["text_header"] = df["text_header"].replace('_', ' ', regex=True)

tagList = df.hashtag.unique()   # List all unique hashtags in the dataset

# Compute per-hashtag statistics. Rows are collected in a list and turned into a DataFrame
# once, instead of growing the DataFrame one row at a time.
stats = []
for tag in tagList:
    df_tag = df.loc[df['hashtag'] == tag]
    count = df_tag.shape[0]
    if count == 0:
        # A missing (NaN) hashtag never compares equal to itself, so nothing matches;
        # skip it rather than dividing by zero below.
        continue
    unique = len(df_tag['text_header'].unique())
    stats.append([tag, count, unique, unique/count])

output = pd.DataFrame(stats, columns=['Hashtag','Count','Unique headers','Score'])

output_top100 = output.loc[output["Count"]>100]  # Create word clouds only for the tags with >100 occurrences

for index, row in output_top100.iterrows():
    # Create wordclouds: each distinct header is weighted by how often it occurs for this hashtag
    hashtag = row['Hashtag']
    df_wc = df.loc[df['hashtag'] == hashtag]
    frequencies = df_wc.text_header.value_counts().to_dict()
    if not frequencies:
        # Every header for this hashtag is missing, so there is nothing to draw
        continue
    cloud = wordcloud.WordCloud(background_color="white", max_font_size=40)
    cloud.generate_from_frequencies(frequencies)
    fig = plt.figure()
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.text(70,230,"Hashtag Occurrence: %s, Unique Headers: %s" %(row['Count'], row['Unique headers']))
    plt.title(hashtag, fontsize=18)
    plt.savefig(path.join(output_dir, "wordcloud%s.png" %hashtag))
    plt.close(fig)  # release the figure; many are created in this loop


In [None]:
'TAG NEW DATASET'

In [None]:
import pandas as pd
from fastText import load_model
from os import path
import re, string
import pickle

'''
This program reads an untagged dataset and tags it using a trained classifier.
Input: Raw .xlsx file without tags from the HDX.
NOTE: This PoC has been written for .xlsx files but could easily be rewritten to handle other formats
Output: The same .xlsx but with an additional row containing the predicted hashtags
'''


def split_punctuation(value):
    """Swap each punctuation character (as defined by string.punctuation) for a space."""
    marks = string.punctuation
    blanks = " " * len(marks)
    return value.translate(str.maketrans(marks, blanks))


def lower_case_cond(value):
    """Lowercase the words that consist solely of capitals; leave mixed-case words untouched.

    The result is re-joined with single spaces.
    """
    normalised = []
    for token in value.split():
        normalised.append(token.lower() if token.isupper() else token)
    return ' '.join(normalised)


def split_uppercase(value):
    """Insert a space in front of each ASCII capital letter of str(value)."""
    text = str(value)
    return re.sub(r'([A-Z])', lambda match: ' ' + match.group(1), text)


def remove_excess_whitespace(value):
    """Strip leading/trailing whitespace and reduce inner whitespace runs to one space."""
    return ' '.join(piece for piece in value.split())


def format_header(header):
    """Clean one column header so it can be fed to the word-embedding model.

    The header is converted to a string, passed through the cleaning steps in a fixed
    order (punctuation -> all-caps words -> capital-letter split -> whitespace) and
    finally lowercased as a whole.
    """
    cleaning_steps = (
        split_punctuation,
        lower_case_cond,
        split_uppercase,
        remove_excess_whitespace,
    )
    cleaned = str(header)
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned.lower()


input_file = "data.xlsx"
pretrained_fasttext_model = 'wiki.en.bin'   # https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip

# `__file__` only exists when this code runs as a script. Inside a notebook kernel it is
# undefined, so fall back to '' which makes the paths below relative to the working directory.
d = path.dirname(__file__) if '__file__' in globals() else ''
data_dir = path.join(d, "..", "Unlabeled Test Data")
df = pd.read_excel(path.join(data_dir, input_file))    # Path to untagged dataset

# Preprocessing: the column names of the sheet are the headers to classify
headers = list(df)
headers = [format_header(x) for x in headers]

# Load word embedding model for feature generation
fastText_model = load_model(pretrained_fasttext_model)
print("Pre-trained model loaded successfully!\n")

# Convert dataset headers into word embeddings
headers = [fastText_model.get_sentence_vector(x).tolist() for x in headers]

# Load the pre-trained classifier.
# SECURITY: unpickling executes arbitrary code; only load a .pkl file you produced yourself.
with open("MLPclassifier.pkl", 'rb') as model_file:
    clf = pickle.load(model_file)

# Predict tags (one per column)
tags = clf.predict(headers)

# Insert row of tags into the dataset: add it under index -1, shift every index up by one
# and sort, so the tag row ends up as the first data row
df.loc[-1] = tags
df.index = df.index + 1  # shifting index
df = df.sort_index()

# The context manager saves and closes the workbook; ExcelWriter.save() no longer exists
# in current pandas releases.
with pd.ExcelWriter(path.join(data_dir, "Tagged-"+input_file), engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False)
