In [1]:
import io
import os
import pickle
import random as rn
import warnings
from datetime import datetime
from importlib import reload
from multiprocessing import cpu_count

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText
from IPython.display import display
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, ndcg_score, \
        label_ranking_average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

import data
import models
import preprocessing

# Single seed shared by reset_seed() and by the shuffle / k-fold cells further down.
seed = 42
# Apply seaborn's default theme to the matplotlib figures drawn by this notebook.
sns.set()

def reset_seed():
    """Re-seed every random number source this notebook relies on.

    Reads the module-level ``seed`` and applies it to NumPy, Python's
    ``random`` module and TensorFlow, and exports ``PYTHONHASHSEED`` and
    ``TF_DETERMINISTIC_OPS`` into the process environment.

    NOTE(review): ``PYTHONHASHSEED`` is only consulted when an interpreter
    starts, so setting it here can influence child processes but not the
    string hashing of the already-running kernel.
    """
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        "TF_DETERMINISTIC_OPS": "1",
    })
    # Same order as before: NumPy, then stdlib random, then TensorFlow.
    for seed_fn in (np.random.seed, rn.seed, tf.random.set_seed):
        seed_fn(seed)

In [3]:
# Name of the active text variant; the commented lines are the alternatives
# that can be switched in.
# NOTE(review): `version` is not read by any cell visible here. Presumably it
# selects the preprocessing output / result-file naming (the prediction file
# loaded below has "tokenized_cased" in its name) -- confirm against later cells.
version = "tokenized_cased"
# version = "tokenized_no_sw_no_punct"
# version = "tokenized_lemmatized_no_sw_no_punct"

In [2]:
# Provenance: the cached pickle read below was produced once with the
# commented-out steps (extract the corpus archive, parse it, pickle the frame).
# Re-enable them only to rebuild the cache.
# data.extract_data(extraction_dir="train",
#                   data_dir="data",
#                   data_zip_name="reuters-training-corpus.zip")
# train_df = data.get_docs_labels("train/REUTERS_CORPUS_2")
# train_df.to_pickle("train/data.pkl")

# NOTE: unpickling can execute arbitrary code; only load a cache you built yourself.
train_df = pd.read_pickle("train/data.pkl")

# Documents as a 1-D array, labels stacked into one 2-D array (one row per document).
train_docs = train_df["doc"].values
train_labels = np.array(train_df["labels"].tolist())
n_train = train_docs.shape[0]
n_labels = len(data.CODEMAP)

# Sanity check: array shapes followed by one sample document and its label row.
print(train_docs.shape, train_labels.shape, train_docs[2], train_labels[2], sep="\n")

(299773,)
(299773, 126)
Toronto stocks end higher after volatile session. CHANGE				    CHANGE TSE	  5900.37    +50.15   HI 5900.37	    LO  5840.29 DJI	  6611.05    +27.57   GOLD (LONDON)   US$350.00 +1.90 FTSE100    4248.10    -64.80   GOLD (NY-COMEX) US$354.80 +0.70 NIKKEI    17869.59   -133.81   LME CASH NICKEL US$7659   +99.0 CANDLR	1.3883		 LME CASH ALUM   US$1602.0  -4.0 CAN 30-YR   107.41     -0.15   BRENT CRUDE     US$19.09  -0.27 --------------------MARKET COMMENT---------------------------- * Toronto stocks ended higher on Tuesday, buoyed by strength in golds and banking * Computer problems due to heavy trading in Bre-X Minerals hampered session * 84 million shares traded Toronto's key stock index ended higher on Tuesday as the saga of Bre-X Minerals Ltd and its Indonesian gold find continued to dominate Canada's biggest stock market. The TSE 300 Index climbed 50.15 points to close at 5900.37 in heavy turnover of 84.07 million shares worth C$1.4 billion. But the overall marke

In [5]:
# With n_samples=None sklearn's shuffle returns every row; set an int to
# work on a random subsample instead.
n_samples = None
# Shuffle documents and label rows in unison with the fixed seed so the
# resulting order is the same on every run.
x_train, y_train = shuffle(train_docs[:n_train],
                           train_labels,
                           random_state=seed,
                           n_samples=n_samples)
# NOTE(review): n_train is train_docs.shape[0], so train_docs[:n_train] is the
# whole array and this slice is always empty. Looks like a leftover from a
# layout where test documents were appended after the training ones -- confirm.
x_test = train_docs[n_train:]

In [23]:
# Rebuild the first of five multilabel-stratified folds and score previously
# saved predictions against its validation part.
# NOTE(review): random_state is passed while shuffle keeps its default;
# recent scikit-learn releases may reject that combination for KFold-style
# splitters -- check before upgrading, and do not change it casually because
# the saved predictions below depend on this exact split.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=seed)
# Only the first (train, val) index pair produced by the splitter is used.
train, val = next(mskf.split(x_train, y_train))

x_val = x_train[val]
y_val = y_train[val]
# Predictions are read from disk rather than computed here.
# NOTE(review): they only line up row-for-row with y_val if they were written
# for this same shuffle + split -- confirm against the code that produced the file.
y_pred = np.loadtxt("val_results/cnn_bi_lstm_2_tokenized_cased_1590709063_0.894295.txt").astype(int)

print(x_val.shape)
print(y_val.shape)
print(y_pred.shape)

# Micro-averaged F1 over the label matrix.
print(f1_score(y_val, y_pred, average="micro"))

(59953,)
(59953, 126)
(59953, 126)
0.8942945693502178


In [25]:
# Dump the topic-code -> integer mapping from the data module.
# NOTE(review): presumably the integers are the column positions in the label
# matrix (n_labels is len(data.CODEMAP)) -- confirm in data.py.
print(data.CODEMAP)

{'1POL': 0, '2ECO': 1, '3SPO': 2, '4GEN': 3, '6INS': 4, '7RSK': 5, '8YDB': 6, '9BNX': 7, 'ADS10': 8, 'BNW14': 9, 'BRP11': 10, 'C11': 11, 'C12': 12, 'C13': 13, 'C14': 14, 'C15': 15, 'C151': 16, 'C1511': 17, 'C152': 18, 'C16': 19, 'C17': 20, 'C171': 21, 'C172': 22, 'C173': 23, 'C174': 24, 'C18': 25, 'C181': 26, 'C182': 27, 'C183': 28, 'C21': 29, 'C22': 30, 'C23': 31, 'C24': 32, 'C31': 33, 'C311': 34, 'C312': 35, 'C313': 36, 'C32': 37, 'C33': 38, 'C331': 39, 'C34': 40, 'C41': 41, 'C411': 42, 'C42': 43, 'CCAT': 44, 'E11': 45, 'E12': 46, 'E121': 47, 'E13': 48, 'E131': 49, 'E132': 50, 'E14': 51, 'E141': 52, 'E142': 53, 'E143': 54, 'E21': 55, 'E211': 56, 'E212': 57, 'E31': 58, 'E311': 59, 'E312': 60, 'E313': 61, 'E41': 62, 'E411': 63, 'E51': 64, 'E511': 65, 'E512': 66, 'E513': 67, 'E61': 68, 'E71': 69, 'ECAT': 70, 'ENT12': 71, 'G11': 72, 'G111': 73, 'G112': 74, 'G113': 75, 'G12': 76, 'G13': 77, 'G131': 78, 'G14': 79, 'G15': 80, 'G151': 81, 'G152': 82, 'G153': 83, 'G154': 84, 'G155': 85, 'G156

In [27]:
# Walk the validation set and dump every document whose predicted label row
# differs from the reference row.
for i, (text, true_row, pred_row) in enumerate(zip(x_val, y_val, y_pred)):
    if np.array_equal(true_row, pred_row):
        continue  # exact match, nothing to inspect

    print(i)
    print(text)
    # Column indices set to 1 in the reference row, then in the predicted row.
    print(np.flatnonzero(true_row == 1).tolist())
    print(np.flatnonzero(pred_row == 1).tolist())
    print(true_row)
    print(pred_row)
    print()

    # The cut-off is only checked after a mismatch has been printed, so the
    # first mismatch past index 300 is still shown before the loop stops.
    if i > 300:
        break

2
Libya's Gaddafi arrives by air for Niger prayers. Libyan leader Muammar Gaddafi arrived by Libyan airlines plane in the capital of the impoverished West African country of Niger on Thursday to lead Moslem prayers, witnesses said. The United Nations imposed a ban on flights from Libya in 1992 over the Lockerbie bombing of an airliner in 1988. The control tower at Niamey's international airport declined to comment on whether the ban had been broken. Gaddafi and his entourage arrived in four Boeing 727 jets and were met by Niger's president. "I came to bring support and sympathy to President Ibrahim Bare Mainassara, to the government and Nigerien people," Gaddafi told reporters on arrival. The Libyan news agency JANA on Wednesday said Gaddafi would lead prayers in both Niger and Nigeria during a visit to West Africa as leader of the Islamic Popular Leadership, an Islamic organisation he created in 1989. Gaddafi's last trip abroad was by land to Tunisia in October 1996. He earlier violat