In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import json
import community
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gc

from IPython.display import HTML
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict

%matplotlib inline

In [3]:
from utils import read_article_df, load_label, load_entities, max_cluster_metric, print_result_df, merge_small_clusters

## Config

In [7]:
SAMPLE_SIZE = 10000
EDGE_PERCENTILE = 3 # 0 - 100

## Prepare Data

In [5]:
sample_df = pd.read_csv('../data/sample_articles.csv', index_col=0)

In [6]:
sample_df.head()

Unnamed: 0,pubId,canonicalUrl,firstScrape,title,text,lang_reliability,lang_iso,title_len,text_len,label
62,290,zerohedge.com/news/2018-07-19/fbi-chief-threat...,7/19/2018 8:26:52 AM -04:00,FBI Chief Threatens To Quit If Trump Invites R...,"by Knave Dave - Jul 18, 2018 1:11 pm ### This ...",1,en,78,2858,helsinki
10741,33,washingtonpost.com/news/morning-mix/wp/2018/07...,7/19/2018 11:51:57 PM -04:00,At least 8 reported dead as duck boat sinks ne...,At least 8 reported dead as duck boat sinks ne...,1,en,91,856,duckboat
10846,33,washingtonpost.com/news/posteverything/wp/2018...,7/19/2018 6:27:03 AM -04:00,"Ukraine’s not a country, Putin told Bush. What...",PostEverything Perspective ### Perspective Int...,1,en,79,8487,helsinki
13359,237,hotair.com/archives/2018/07/19/looking-glass-d...,7/19/2018 1:35:59 PM -04:00,"Through the looking glass: Democrats attack ""R...",Through the looking glass: Democrats attack “R...,1,en,86,4440,helsinki
16352,118,philly.com/philly/news/nation_world/20180719_a...,7/19/2018 11:15:42 PM -04:00,Sheriff: 8 people dead after Missouri tourist ...,Sheriff: 8 people dead after Missouri tourist ...,1,en,68,766,duckboat


## Clustering

### Generate Graph

In [10]:
def generate_graph(emb) -> nx.Graph:
    pct = np.percentile(euclidean_distances(emb), EDGE_PERCENTILE)
    nn = NearestNeighbors(radius=pct)
    nn.fit(emb)
    admat = nn.radius_neighbors_graph()
    print(admat.__repr__())
    G = nx.from_scipy_sparse_matrix(admat)
    return G

## Entity

In [25]:
entities = load_entities()
vect = CountVectorizer()
entity_words = [' '.join(entity) for entity in entities]
entities_tfidf = vect.fit_transform(entity_words)
sample_entities_tfidf = entities_tfidf[sample_df.index]

G = generate_graph(sample_entities_tfidf)
resolution = 1
partition = community.best_partition(G, resolution=resolution)
sample_df['cluster__entity_tfidf'] = pd.Series([v for k,v in partition.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__entity_tfidf', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3344102 stored elements in Compressed Sparse Row format>


### Doc2Vec

In [23]:
doc2vec = pd.read_csv('../data/embedding/Doc2Vec_embedding_output.csv',index_col=0)
sample_doc2vec = doc2vec.loc[sample_df.index]
G_doc2vec = generate_graph(sample_doc2vec)
resolution = 1
partition_doc2vec = community.best_partition(G_doc2vec, resolution=resolution)
sample_df['cluster__doc2vec'] = pd.Series([v for k,v in partition_doc2vec.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__doc2vec', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074448 stored elements in Compressed Sparse Row format>


### BoW

In [22]:
bow = np.load('../data/embedding/BagOfWord_output.npy', allow_pickle=True)
bow = np.asmatrix(bow)[0,0]
sample_bow = bow[sample_df.index]
G_bow = generate_graph(sample_bow)
resolution = 1
partition_bow = community.best_partition(G_bow, resolution=resolution)
sample_df['cluster__bow'] = pd.Series([v for k,v in partition_bow.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bow', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3043224 stored elements in Compressed Sparse Row format>


## BERT spacy

In [8]:
import pickle
sample_bert = pickle.load(open('../data/embedding/sample_bert.p', 'rb'))

In [11]:
G_bert = generate_graph(sample_bert)
resolution = 1
partition_bert = community.best_partition(G_bert, resolution=resolution)
sample_df['cluster__bert_spacy'] = pd.Series([v for k,v in partition_bert.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bert_spacy', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074448 stored elements in Compressed Sparse Row format>


In [15]:
bert_ = pd.read_csv('../data/embedding/BERT_EMBEDDING_withindex.csv', index_col=0)

In [None]:
bert_ = bert_.set_index('article_index')

In [19]:
sample_bert = bert_.loc[sample_df.index]
sample_bert = sample_bert.fillna(0)
G_bert = generate_graph(sample_bert)
resolution = 1
partition_bert = community.best_partition(G_bert, resolution=resolution)
sample_df['cluster__bert'] = pd.Series([v for k,v in partition_bert.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bert', threshold=5)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074634 stored elements in Compressed Sparse Row format>


## BoW Lemmatize

In [39]:
def add_embedding(emb, name):
    sample_emb = emb[sample_df.index]
    G = generate_graph(sample_emb)
    resolution = 1
    partition = community.best_partition(G, resolution=resolution)
    sample_df[f'cluster__{name}'] = pd.Series([v for k,v in partition.items()], index=sample_df.index)
    merge_small_clusters(sample_df, f'cluster__{name}', threshold=5)
    
    
bow_lemma = np.load('../data/embedding/BagOfWord2_addlemmatization_output.npy', allow_pickle=True)
bow_lemma = np.asmatrix(bow_lemma)[0,0]
add_embedding(bow_lemma, 'bow_lemma')

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3076592 stored elements in Compressed Sparse Row format>


## K-Mean

In [47]:
num_cluster = len(sample_df['cluster__bow'].unique())

In [None]:
kmean = KMeans(n_clusters=num_cluster, n_jobs=-1)
kmean.fit(sample_bow)

In [None]:
sample_df['cluster__kmean_bow'] = kmean.label_

## Evaluate

In [40]:
labeled_sampled_df = sample_df.dropna(subset=['label'])
cluster_cols = list(filter(lambda s: 'cluster__' in s, labeled_sampled_df.columns))
results = []
for col in cluster_cols:
    name = col.replace('cluster__','')
    results += max_cluster_metric(labeled_sampled_df.label, labeled_sampled_df[col], name)

results_df = pd.DataFrame(results)
print_result_df(results_df)

precision
--------------------------------------------------
name             bert     bert2       bow  bow_lemma   doc2vec  entity_tfidf
label                                                                       
cave_rescue  0.148148  0.222222  0.252252   0.252252  0.252525      0.252427
duckboat     0.074074  0.185185  0.153153   0.153153  0.161616      0.165049
helsinki     0.464286  0.592593  0.594595   0.594595  0.585859      0.582524
mean         0.228836  0.333333  0.333333   0.333333  0.333333      0.333333


recall
--------------------------------------------------
name             bert     bert2       bow  bow_lemma   doc2vec  entity_tfidf
label                                                                       
cave_rescue  0.258065  0.387097  0.903226   0.903226  0.806452      0.838710
duckboat     0.190476  0.476190  0.809524   0.809524  0.761905      0.809524
helsinki     0.295455  0.363636  0.750000   0.750000  0.659091      0.681818
mean         0.247998  0.408975 

In [27]:
results_df.groupby('name').mean()

Unnamed: 0_level_0,precision,recall,f_score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bert,0.228836,0.247998,0.218671
bert2,0.333333,0.408975,0.333241
bow,0.333333,0.820917,0.43842
doc2vec,0.333333,0.742482,0.423868
entity_tfidf,0.333333,0.776684,0.430175


In [37]:
results_df

Unnamed: 0,label,precision,recall,f_score,name
0,duckboat,0.074074,0.190476,0.106667,bert_spacy
1,helsinki,0.464286,0.295455,0.361111,bert_spacy
2,cave_rescue,0.148148,0.258065,0.188235,bert_spacy
3,duckboat,0.185185,0.47619,0.266667,bert
4,helsinki,0.592593,0.363636,0.450704,bert
5,cave_rescue,0.222222,0.387097,0.282353,bert
6,duckboat,0.153153,0.809524,0.257576,bow
7,helsinki,0.594595,0.75,0.663317,bow
8,cave_rescue,0.252252,0.903226,0.394366,bow
9,duckboat,0.161616,0.761905,0.266667,doc2vec


In [56]:
results_df.to_csv('../data/clustering/progress_report2_metrics.csv')

In [57]:
sample_df.to_csv('../data/clustering/progress_report2_assignment.csv')