In [1]:
import re
import os
import csv
import math
import tensorflow as tf
import collections
import numpy as np
import pandas as pd
from time import time, mktime
from datetime import datetime, timedelta
from random import randint
from uuid import UUID
from tqdm.notebook import tqdm

from scipy.spatial.distance import cosine
from uuid import uuid4
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity

from transformers import (
    TFFlaubertForSequenceClassification,
    FlaubertTokenizer,
    FlaubertConfig
)

In [2]:
%run ../flaubert_token_classification.py

# Import dataset

In [3]:
tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")
model = TFFlaubertForTokenClassification.from_pretrained("../models/ner")

In [4]:
SEQUENCE_LENGTH=64

In [5]:
class Article():
    def __init__(self, raw, token, date, entities):
        self.cluster = None
        self.raw = raw
        self.token = token
        self.date = int(date)
        self.entities = entities
        
    def set_cluster(self, cluster):
        self.cluster = cluster

In [6]:
dataset_ = open("../dataset/custom_dataset/since_january.csv")

sentences = []
reader = csv.reader(dataset_, delimiter=',', quotechar='"')

next(reader) # Skip header

for idx, line in tqdm(enumerate(reader)):
    if idx > 1001:
        break
    if len(line) < 3 or line[1] == "" or line[0] == "":
        continue

    article = line[0]
    tokens = tokenizer.encode(article, max_length=SEQUENCE_LENGTH, pad_to_max_length=SEQUENCE_LENGTH, add_special_tokens=True, return_tensors='tf')
    transformer_outputs = model.transformer(tokens)[0][0]
    token_classification_outputs = model(tokens)[0]
    token_classification_outputs = np.argmax(token_classification_outputs, axis=2)[0]

    # Get entities tokens
    entities = []
    for idx, entity in enumerate(token_classification_outputs):
        if entity != 8:
            entities.append(tokens[0][idx].numpy())

    sentences.append(
        Article(
            raw=article,
            token=transformer_outputs[0],
            date=datetime.strptime(line[1], "%Y-%m-%d %H:%M:%S").timestamp(),
            entities=entities
        )
    )    

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: 

In [7]:
document = sentences[0]
sentences = sentences[1:]

# Tests distances

## Euclidean distances

In [8]:
t1 = time()

def l1_normalize(x):
    norm = np.sum(x)
    return x / norm


def l2_normalize(x):
    norm = np.sqrt(np.sum(np.square(x)))
    return x / norm


def euclidean_distance(X, Y):
    dists = -2 * np.dot(Y, X.T) + np.sum(X**2, axis=1) + np.sum(Y**2, axis=1)[:, np.newaxis]
    return dists


matrix = []
for s in sentences:
    vec = euclidean_distance(
        np.array(s.token).reshape(1, -1),
        np.array(document.token).reshape(1, -1)
    )
    l1_vec_norm = euclidean_distance(
        l1_normalize(np.array(s.token).reshape(1, -1)),
        l1_normalize(np.array(document.token).reshape(1, -1))
    )
    l2_vec_norm = euclidean_distance(
        l2_normalize(np.array(s.token).reshape(1, -1)),
        l2_normalize(np.array(document.token).reshape(1, -1))
    )
    matrix.append((vec[0][0], l1_vec_norm[0][0], l2_vec_norm[0][0]))

df = pd.DataFrame(matrix)

elapsed = time() - t1
print('Elapsed time is %f seconds.' % elapsed)

Elapsed time is 2.625801 seconds.


In [9]:
print(df.describe(include='all'), "\n")

print(document.raw)
for a in df.idxmax():
    print(sentences[a].raw)

                 0            1            2
count  3218.000000  3218.000000  3218.000000
mean    700.121780     0.081105     0.411446
std     179.988085     0.016685     0.078407
min     146.986084     0.018957     0.094882
25%     568.133850     0.069551     0.359838
50%     671.377441     0.079371     0.406025
75%     812.632935     0.091679     0.458499
max    1335.803955     0.194094     0.839261 

Retraites. L’intersyndicale appelle à de nouvelles actions les 14, 15 et 16 janvier
Donald Trump avait-il le droit d'agir unilatÃ©ralement pour Ã©liminer Soleimani?
En finir avec l’abandon  des  animaux de compagnie
Essai : impeachment et vaincus de l'histoire


## Cosine distance

### Numpy

In [10]:
matrix = []

t1 = time()

matrix = []
for s in sentences:
    vec = cosine_similarity(np.array(document.token).reshape(1, -1), np.array(s.token).reshape(1, -1))[0][0]
    matrix.append(vec)
elapsed = time() - t1
print('Elapsed time is %f seconds.' % elapsed)

Elapsed time is 0.819467 seconds.


In [11]:
serie = pd.Series(matrix)
print(serie.describe(), "\n")

print(document.raw)
print(sentences[serie.idxmax()].raw)

count    3218.000000
mean        0.794277
std         0.039203
min         0.580369
25%         0.770750
50%         0.796988
75%         0.820081
max         0.952559
dtype: float64 

Retraites. L’intersyndicale appelle à de nouvelles actions les 14, 15 et 16 janvier
Retraites : l'intersyndicale appelle à de nouvelles actions les 14, 15 et 16 janvier


### Scipy

In [12]:
matrix = []

t1 = time()

matrix = []
for s in sentences:
    vec = 1 - cosine(np.array(document.token).reshape(1, -1), s.token)
    matrix.append(vec)
elapsed = time() - t1
print('Elapsed time is %f seconds.' % elapsed)

Elapsed time is 0.239053 seconds.


In [13]:
df = pd.DataFrame(matrix)
print(df.describe(include='all'), "\n")

print(document.raw)
for a in df.idxmax():
    print(sentences[a].raw)

                 0
count  3218.000000
mean      0.794277
std       0.039203
min       0.580369
25%       0.770750
50%       0.796988
75%       0.820081
max       0.952559 

Retraites. L’intersyndicale appelle à de nouvelles actions les 14, 15 et 16 janvier
Retraites : l'intersyndicale appelle à de nouvelles actions les 14, 15 et 16 janvier


assert False

In [14]:
print(document.raw, document.date)

Retraites. L’intersyndicale appelle à de nouvelles actions les 14, 15 et 16 janvier 1578611081


# Compute time similarity

https://almeta.io/en/blog/news-stream-clustering-sequential-clustering-in-action/

for a given μ and σ > 0. For each document d and cluster c, we generate the following three-dimensional vector γ(d, c) = (s1 , s2, s3):

- s1 = f(t(d) − n1(c)) where t(dj) is the timestamp for document d and n1(c) is the timestamp for the newest document in cluster c.
- s2 = f(t(d)−n2(c)) where n2(c) is the average timestamp for all documents in cluster c.
- s3 = f(t(d) − n3(c)) where n3(c) is the timestamp for the oldest document in cluster c.
The vector q1 denotes the weights for the timestamp features

These three timestamps features model the time aspect of the online stream of news data and help disambiguate clustering decisions since time is a valuable indicator that a news story has changed, even if a cluster representation has a reasonable match in the textual features with the incoming document.

Regarding the hyper-parameters related to the timestamp features, we can fix μ = 0 and tune σ on the development set.

In [15]:
newest_document_in_cluster = sentences[randint(0, len(sentences))].date
print(newest_document_in_cluster)

1578628500


In [39]:
def time_similarity(t, micro, sigma):
    a = (t - micro)**2
    b = 2*(sigma**2)
    return math.exp(-(a/b))

In [17]:
S1 = datetime.now().timestamp() - (datetime.now() - timedelta(hours = 1)).timestamp()
S2 = datetime.now().timestamp() - (datetime.now() - timedelta(hours = 1)).timestamp()
S3 = datetime.now().timestamp() - (datetime.now() - timedelta(hours = 1)).timestamp()

In [18]:
print("S1 :",  time_similarity(S1, 0, 10000))
print("S2 :",  time_similarity(S2, 0, 10000))
print("S4 :",  time_similarity(S3, 0, 10000))

12959965.504669255
200000000
0.06479982752334627
S1 : 0.9372550572672796
12959999.979400635
200000000
0.06479999989700318
S2 : 0.9372548957092119
12959999.979400635
200000000
0.06479999989700318
S4 : 0.9372548957092119


# Clustering

In [31]:
class Cluster():
    id = ""
    articles = []
    mean = []
    entities = []
    creation_timestamp = 0
    last_update_timestamp = 0
    average_timestamp = 0

    def __init__(self, article):
        self.articles = [article]
        self.mean = [article.token]
        self.id = uuid4()
        if article.entities:
            self.entities = article.entities
        else:
            self.entities = []
        self.creation_timestamp = article.date
        self.last_update_timestamp = article.date
        self.average_timestamp = article.date

    def add_in_cluster(self, article):
        self.articles.append(article)
        self.mean = [np.mean([article.token for article in self.articles], axis=0)]
        if article.entities:
            self.entities.extend(article.entities)
            self.entities = list(set(self.entities))
        self.last_update_timestamp = article.date
        self.average_timestamp = sum([article.date for article in self.articles]) / len(self.articles)
        #[np.mean([article.token for article in self.articles], axis=0)]

    def get_mean(self):
        return self.mean
        
    def delete_cluster():
        raise NotImplemented
    
    def __str__(self):
        return "Cluster " + str(id) + " has len : " + str(len(self.articles))

## Numpy cosine clustering

In [20]:
cls_weight=0.8
ner_weight=0.2
time_weight=0.0

def compute_cluster_distance(document, cluster):
    cls_score = cosine_similarity(np.array(document.token).reshape(1, -1), cluster.get_mean())[0][0]
    time_score = 0

    ml = max(len(document.entities), len(cluster.entities))        
    ner_score = jaccard_score(
        np.concatenate((document.entities , np.zeros(ml - len(document.entities)))).astype(int),
        np.concatenate((cluster.entities , np.zeros(ml - len(cluster.entities)))).astype(int),
        average='macro'
    )
    return cls_score * cls_weight + ner_score * ner_weight + time_score * time_weight


In [21]:
clusters = []
def predict(document, T=0.95, ):
    if len(clusters) == 0:
        clusters.append(Cluster(document))
        return

    scores = []
    for cluster in clusters:
        scores.append(compute_cluster_distance(document, cluster))

    if max(scores) > T:
        max_idx = np.argmax(scores)
        clusters[max_idx].add_in_cluster(document)
    else:
        clusters.append(Cluster(document))

In [22]:
print(clusters)

for a in tqdm(range(0, 100)):
    predict(sentences[a], T=0.98)
#print([cluster ])

[]


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

article : [4552, 1406, 6775, 21087]
cluster : [4552, 1406, 6775, 21087]


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


article : [3798, 1502, 5468, 22648]
cluster : [3798, 1502, 5468, 22648]
article : [4198, 28, 1190]
cluster : [4198, 28, 1190]



In [23]:
print(len(clusters))
for cluster in clusters:
    if len(cluster.articles) > 1:
        print([article.raw for article in cluster.articles])

97
["Fumigènes: Saint-Etienne s'en sort avec un match ferme", "Fumigènes: Saint-Etienne s'en sort avec un match ferme"]
["Projet d'accord en Irlande du Nord pour sortir du blocage politique", 'Projet d’accord en Irlande du Nord pour sortir du blocage politique']
['Brésil: la Cour suprême donne raison à Netflix face à la censure', 'Brésil: la Cour suprême donne raison à Netflix face à la censure']


## Scipy cosine clustering

In [67]:
cls_weight=0.9
ner_weight=0.1
time_weight=0.0

def test_compute_cluster_distance(document, cluster):
    cls_score = 1 - cosine(document.token, cluster.get_mean())
    time_score = 0

    ml = max(len(document.entities), len(cluster.entities))        
    ner_score = jaccard_score(
        np.concatenate((document.entities , np.zeros(ml - len(document.entities)))).astype(int),
        np.concatenate((cluster.entities , np.zeros(ml - len(cluster.entities)))).astype(int),
        average='macro'
    )

    #S1 = time_similarity(document.date, cluster.last_update_timestamp, 10)
    #S2 = time_similarity(document.date, cluster.average_timestamp, 10)
    #S3 = time_similarity(document.date, cluster.creation_timestamp, 10)
    #print(round(sum([S1, S2, S3]), 3))

    return cls_score * cls_weight + ner_score * ner_weight + time_score * time_weight

In [68]:
def test_predict(document, T=0.95):
    if len(clusters) == 0:
        clusters.append(Cluster(document))
        return

    scores = []
    for cluster in clusters:
        scores.append(test_compute_cluster_distance(document, cluster))

    if max(scores) > T:
        max_idx = np.argmax(scores)
        clusters[max_idx].add_in_cluster(document)
    else:
        clusters.append(Cluster(document))

In [69]:
clusters = []

for a in tqdm(range(0, 40)):
    test_predict(sentences[a])
#print([cluster ])

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.25
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.08333333333333333
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2222222222222222
0.2222222222222222
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1111111111111111
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.6666666666666666
0.0
0.16666666666666666
0.0
0.1111111111111111
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [70]:
print(len(clusters))
for cluster in clusters:
    if len(cluster.articles) > 1:
        print([article.raw for article in cluster.articles])

40
