In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import community
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gc

from IPython.display import HTML
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict

%matplotlib inline

In [14]:
from utils import read_article_df, load_label, load_entities, max_cluster_metric, print_result_df, merge_small_clusters

In [None]:
bert = pd.read_csv('../data/embedding/BERT_EMBEDDING_withindex.csv')

In [5]:
bert.shape

(208518, 770)

In [None]:
bert = bert.set_index('article_index')

In [9]:
bert = bert.drop('Unnamed: 0', axis=1)

## Config

In [10]:
SAMPLE_SIZE = 10000
EDGE_PERCENTILE = 3 # 0 - 100

## Prepare Data

In [11]:
article_df = read_article_df(open('../data/raw/2018_07_19_04_59_08/articles.txt', encoding='utf-8'))

article_df['title'] = article_df.title.apply(lambda s: s.strip())
article_df['text'] = article_df.text.apply(lambda s: s.strip())
article_df['title_len'] = article_df.title.apply(len)
article_df['text_len'] = article_df.text.apply(len)

label_df = load_label('../data/raw/labels', 'lower_bound')

clean_df = article_df[article_df.title_len > 0]
clean_df = clean_df[clean_df.text_len > 100]
clean_df = clean_df[clean_df.lang_iso == 'en']
clean_df = clean_df.merge(label_df, on='canonicalUrl', how='left')

article_with_label = clean_df[~clean_df.label.isna()]
article_without_label = clean_df[clean_df.label.isna()]
article_with_label.shape, article_without_label.shape

sample_df = pd.concat([article_with_label, article_without_label.sample(SAMPLE_SIZE)])
sample_df.shape

(10140, 10)

In [12]:
sample_df.head()

Unnamed: 0,pubId,canonicalUrl,firstScrape,title,text,lang_reliability,lang_iso,title_len,text_len,label
62,290,zerohedge.com/news/2018-07-19/fbi-chief-threat...,7/19/2018 8:26:52 AM -04:00,FBI Chief Threatens To Quit If Trump Invites R...,"by Knave Dave - Jul 18, 2018 1:11 pm ### This ...",1,en,78,2858,helsinki
10736,33,washingtonpost.com/news/morning-mix/wp/2018/07...,7/19/2018 11:51:57 PM -04:00,At least 8 reported dead as duck boat sinks ne...,At least 8 reported dead as duck boat sinks ne...,1,en,90,856,duckboat
10841,33,washingtonpost.com/news/posteverything/wp/2018...,7/19/2018 6:27:03 AM -04:00,"Ukraine’s not a country, Putin told Bush. What...",PostEverything Perspective ### Perspective Int...,1,en,79,8487,helsinki
13353,237,hotair.com/archives/2018/07/19/looking-glass-d...,7/19/2018 1:35:59 PM -04:00,"Through the looking glass: Democrats attack ""R...",Through the looking glass: Democrats attack “R...,1,en,86,4440,helsinki
16345,118,philly.com/philly/news/nation_world/20180719_a...,7/19/2018 11:15:42 PM -04:00,Sheriff: 8 people dead after Missouri tourist ...,Sheriff: 8 people dead after Missouri tourist ...,1,en,68,766,duckboat


## Clustering

### Entities
### Generate embedding

In [107]:
entities = load_entities()
vect = TfidfVectorizer()
entity_words = [' '.join(entity) for entity in entities]
entities_tfidf = vect.fit_transform(entity_words)
sample_entities_tfidf = entities_tfidf[sample_df.index]

### Generate Graph

In [15]:
def generate_graph(emb) -> nx.Graph:
    pct = np.percentile(euclidean_distances(emb), EDGE_PERCENTILE)
    nn = NearestNeighbors(radius=pct)
    nn.fit(emb)
    admat = nn.radius_neighbors_graph()
    print(admat.__repr__())
    G = nx.from_scipy_sparse_matrix(admat)
    return G

In [114]:
G = generate_graph(sample_entities_tfidf)
resolution = 1
partition = community.best_partition(G, resolution=resolution)
sample_df['cluster__entity_tfidf'] = pd.Series([v for k,v in partition.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__entity_tfidf', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 2606698 stored elements in Compressed Sparse Row format>


### Doc2Vec

In [115]:
doc2vec = pd.read_csv('../data/embedding/Doc2Vec_embedding_output.csv',index_col=0)
sample_doc2vec = doc2vec.loc[sample_df.index]
G_doc2vec = generate_graph(sample_doc2vec)
resolution = 1
partition_doc2vec = community.best_partition(G_doc2vec, resolution=resolution)
sample_df['cluster__doc2vec'] = pd.Series([v for k,v in partition_doc2vec.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__doc2vec', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074448 stored elements in Compressed Sparse Row format>


### BoW

In [30]:
bow = np.load('../data/embedding/BagOfWord_output.npy', allow_pickle=True)
bow = np.asmatrix(bow)[0,0]
sample_bow = bow[sample_df.index]
G_bow = generate_graph(sample_bow)
resolution = 1
partition_bow = community.best_partition(G_bow, resolution=resolution)
sample_df['cluster__bow'] = pd.Series([v for k,v in partition_bow.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bow', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3107624 stored elements in Compressed Sparse Row format>


## BERT

In [28]:
sample_bert = bert.loc[sample_df.index]
sample_bert = sample_bert.fillna(0)
G_bert = generate_graph(sample_bert)
resolution = 1
partition_bert = community.best_partition(G_bert, resolution=resolution)
sample_df['cluster__bert'] = pd.Series([v for k,v in partition_bert.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bert', threshold=5)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074446 stored elements in Compressed Sparse Row format>


## BoW Lemmatize

In [37]:
def add_embedding(emb, name):
    sample_emb = emb[sample_df.index]
    G = generate_graph(sample_emb)
    resolution = 1
    partition = community.best_partition(G, resolution=resolution)
    sample_df[f'cluster__{name}'] = pd.Series([v for k,v in partition.items()], index=sample_df.index)
    merge_small_clusters(sample_df, f'cluster__{name}', threshold=5)
    
    
bow_lemma = np.load('../data/embedding/BagOfWord2_addlemmatization_output.npy', allow_pickle=True)
bow_lemma = np.asmatrix(bow_lemma)[0,0]
add_embedding(bow_lemma, 'bow_lemma')

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3078888 stored elements in Compressed Sparse Row format>


## Evaluate

In [38]:
labeled_sampled_df = sample_df.dropna(subset=['label'])
cluster_cols = list(filter(lambda s: 'cluster__' in s, labeled_sampled_df.columns))
results = []
for col in cluster_cols:
    name = col.replace('cluster__','')
    results += max_cluster_metric(labeled_sampled_df.label, labeled_sampled_df[col], name)

results_df = pd.DataFrame(results)
print_result_df(results_df)

precision
--------------------------------------------------
name             bert       bow  bow_lemma
label                                     
cave_rescue  0.243902  0.252336   0.250000
duckboat     0.073171  0.168224   0.175926
helsinki     0.307692  0.579439   0.574074
mean         0.208255  0.333333   0.333333


recall
--------------------------------------------------
name             bert       bow  bow_lemma
label                                     
cave_rescue  0.322581  0.870968   0.870968
duckboat     0.142857  0.857143   0.904762
helsinki     0.090909  0.704545   0.704545
mean         0.185449  0.810885   0.826758


f_score
--------------------------------------------------
name             bert       bow  bow_lemma
label                                     
cave_rescue  0.277778  0.391304   0.388489
duckboat     0.096774  0.281250   0.294574
helsinki     0.140351  0.635897   0.632653
mean         0.171634  0.436151   0.438572




In [39]:
results_df.groupby('name').mean()

Unnamed: 0_level_0,precision,recall,f_score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bert,0.208255,0.185449,0.171634
bow,0.333333,0.810885,0.436151
bow_lemma,0.333333,0.826758,0.438572
