In [1]:
import re
import os
import csv
import math
import tensorflow as tf
import collections
import numpy as np
from time import time, mktime
from datetime import datetime, timedelta
from random import randint
from uuid import UUID
from tqdm.notebook import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    TFFlaubertForSequenceClassification,
    FlaubertTokenizer,
    FlaubertConfig
)

In [2]:
# Execute the sibling script inside this kernel so the names it defines become
# available to later cells.
# NOTE(review): presumably this is what provides TFFlaubertForTokenClassification,
# which is used below but is not part of the imports above — confirm.
%run ../flaubert_token_classification.py

# Import dataset

In [3]:
# The tokenizer is loaded from the "jplu/tf-flaubert-base-cased" identifier, while
# the token-classification weights are read from a local directory.
# NOTE(review): TFFlaubertForTokenClassification is not imported in the first
# cell; it is presumably defined by the %run'd script — confirm.
tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")
model = TFFlaubertForTokenClassification.from_pretrained("../models/ner")

In [4]:
# Number of token positions requested from the tokenizer for every article
# (passed as max_length when encoding).
SEQUENCE_LENGTH: int = 64

In [5]:
class Article:
    """One dataset row together with the features computed for it.

    Attributes:
        cluster: Cluster assigned to the article; ``None`` until
            ``set_cluster`` is called.
        raw: The value passed as ``raw`` (the original article text in this
            notebook).
        token: The value passed as ``token``, stored unchanged.
        date: ``date`` converted with ``int()``.
        entities: The value passed as ``entities``, stored unchanged.
    """

    def __init__(self, raw, token, date, entities):
        """Store the given fields; ``date`` is coerced to ``int``."""
        self.cluster = None
        self.raw, self.token = raw, token
        self.date = int(date)
        self.entities = entities

    def set_cluster(self, cluster):
        """Record ``cluster`` as the cluster this article belongs to."""
        self.cluster = cluster

In [6]:
import denstream

# Highest 0-based row index (header excluded) that is still processed.
MAX_ROW_INDEX = 1001
# Predicted label id that is NOT collected as an entity.
# NOTE(review): presumably the "outside" (O) tag of the NER model — confirm
# against the model's label map.
NON_ENTITY_LABEL_ID = 8

sentences = []

# Use a context manager so the file handle is always closed, and newline=""
# as the csv module documents for correct handling of quoted line breaks.
with open("../dataset/custom_dataset/since_january.csv", newline="") as dataset_:
    reader = csv.reader(dataset_, delimiter=',', quotechar='"')

    next(reader, None)  # Skip header (tolerates an empty file)

    for row_index, line in tqdm(enumerate(reader)):
        if row_index > MAX_ROW_INDEX:
            break
        # Skip rows with fewer than three columns or an empty date column.
        if len(line) < 3 or line[1] == "":
            continue

        article = line[0]
        # pad_to_max_length is a boolean switch; the target length itself is
        # given through max_length.
        tokens = tokenizer.encode(
            article,
            max_length=SEQUENCE_LENGTH,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_tensors='tf',
        )
        # First output of the underlying transformer for the single item in
        # the batch.
        transformer_outputs = model.transformer(tokens)[0][0]
        # NOTE(review): model(tokens) presumably runs the transformer a second
        # time; reusing its output would avoid the duplicate pass — confirm.
        predicted_labels = np.argmax(model(tokens)[0], axis=2)[0]

        # Keep the input token ids at every position whose predicted label is
        # not the non-entity label. A comprehension keeps the position variable
        # from clobbering the row counter of the outer loop.
        entities = [
            tokens[0][position].numpy()
            for position, label_id in enumerate(predicted_labels)
            if label_id != NON_ENTITY_LABEL_ID
        ]

        sentences.append(
            Article(
                raw=article,
                # Vector at sequence position 0 is used as the article embedding.
                token=transformer_outputs[0],
                date=datetime.strptime(line[1], "%Y-%m-%d %H:%M:%S").timestamp(),
                entities=entities
            )
        )

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [7]:
# Hold out the first article (it seeds the clustering below) and keep the
# remaining ones for streaming.
# Caveat: `sentences` is rebound here, so re-running this cell alone drops one
# more article each time.
document, sentences = sentences[0], sentences[1:]

# Denstream algorithm

In [8]:
from denstream import Sample, OutlierDenStream
import pandas as pd

In [9]:
# Create the DenStream-based outlier detector, seeding its starting buffer with
# the embedding of the single held-out article.
# NOTE(review): with lamb=2 the micro-cluster weights printed further down are
# as small as ~1e-76, which suggests a very aggressive decay — confirm that this
# value is intended.
algo = OutlierDenStream(lamb=2, startingBuffer=[document.token.numpy()])

In [10]:
# Run the detector's initialization step before any sample is streamed.
# Presumably this builds the initial micro-clusters from startingBuffer —
# confirm in the denstream module.
algo.runInitialization()

In [11]:
# Feed the remaining articles to the detector one at a time.
# The enumeration index is what is handed to Sample as its second argument;
# Article.date is not used here.
# NOTE(review): confirm that ordering by position rather than by publication
# date is intended.
for order, entry in enumerate(sentences):
    embedding = entry.token.numpy()
    algo.runOnNewSample(Sample(embedding, order))

  maxRad = np.nanmax(np.sqrt(SSd.astype(float)-LSd.astype(float)))


In [12]:
# Print a summary of the detector's pMicroCluster collection: number of
# clusters, then samples, weight, creation time and last-edit time per cluster.
algo.pMicroCluster.show()

Number of Clusters: 15
-----
Cluster #0
Samples: 3
Weight: 5.527486543369371e-76
Creation Time: 280
LastEdit Time: 291
Cluster #1
Samples: 19
Weight: 4.280599731007093e-50
Creation Time: 193
LastEdit Time: 334
Cluster #2
Samples: 2
Weight: 1.2622544855513944e-29
Creation Time: 361
LastEdit Time: 368
Cluster #3
Samples: 4
Weight: 5.364273414875793e-29
Creation Time: 351
LastEdit Time: 369
Cluster #4
Samples: 6
Weight: 8.411152611240296e-25
Creation Time: 363
LastEdit Time: 376
Cluster #5
Samples: 4
Weight: 8.503965996670648e-22
Creation Time: 372
LastEdit Time: 381
Cluster #6
Samples: 6
Weight: 1.3877787807814457e-17
Creation Time: 301
LastEdit Time: 388
Cluster #7
Samples: 2
Weight: 5.55247037584139e-17
Creation Time: 383
LastEdit Time: 389
Cluster #8
Samples: 11
Weight: 5.684341886080802e-14
Creation Time: 242
LastEdit Time: 394
Cluster #9
Samples: 5
Weight: 3.814697265625e-06
Creation Time: 296
LastEdit Time: 407
Cluster #10
Samples: 6
Weight: 0.00024414062500089186
Creation Time: 37

In [13]:
# For every cluster held in oMicroCluster, print the length of its center
# vector, its radius, and its cluster number followed by a blank line.
for outlier_cluster in algo.oMicroCluster.clusters:
    center_size = len(outlier_cluster.getCenter())
    print(center_size)
    print(outlier_cluster.getRadius())
    print(outlier_cluster.clusterNumber, "\n")

768
0.0005787647715686867
53 

