In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import community
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gc

from IPython.display import HTML
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict

%matplotlib inline

In [3]:
from utils import read_article_df, load_label, load_entities, max_cluster_metric, print_result_df, merge_small_clusters

## Config

In [42]:
SAMPLE_SIZE = 10000
MIN_EDGE_PCT = 3
MAX_EDGE_PCT = 6 # 0 - 100

## Prepare Data

In [5]:
sample_df = pd.read_csv('../data/sample_articles.csv', index_col=0)

In [6]:
sample_df.head()

Unnamed: 0,pubId,canonicalUrl,firstScrape,title,text,lang_reliability,lang_iso,title_len,text_len,label
62,290,zerohedge.com/news/2018-07-19/fbi-chief-threat...,7/19/2018 8:26:52 AM -04:00,FBI Chief Threatens To Quit If Trump Invites R...,"by Knave Dave - Jul 18, 2018 1:11 pm ### This ...",1,en,78,2858,helsinki
10741,33,washingtonpost.com/news/morning-mix/wp/2018/07...,7/19/2018 11:51:57 PM -04:00,At least 8 reported dead as duck boat sinks ne...,At least 8 reported dead as duck boat sinks ne...,1,en,91,856,duckboat
10846,33,washingtonpost.com/news/posteverything/wp/2018...,7/19/2018 6:27:03 AM -04:00,"Ukraine’s not a country, Putin told Bush. What...",PostEverything Perspective ### Perspective Int...,1,en,79,8487,helsinki
13359,237,hotair.com/archives/2018/07/19/looking-glass-d...,7/19/2018 1:35:59 PM -04:00,"Through the looking glass: Democrats attack ""R...",Through the looking glass: Democrats attack “R...,1,en,86,4440,helsinki
16352,118,philly.com/philly/news/nation_world/20180719_a...,7/19/2018 11:15:42 PM -04:00,Sheriff: 8 people dead after Missouri tourist ...,Sheriff: 8 people dead after Missouri tourist ...,1,en,68,766,duckboat


## Clustering

### Generate Graph

In [49]:
def generate_graph(emb, min_edge_pct = MIN_EDGE_PCT, max_edge_pct = MAX_EDGE_PCT) -> nx.Graph:
    min_distance = np.percentile(euclidean_distances(emb), min_edge_pct)
    max_distance = np.percentile(euclidean_distances(emb), max_edge_pct)
    min_nn = NearestNeighbors(radius=min_distance)
    min_nn.fit(emb)
    min_admat = min_nn.radius_neighbors_graph()
    
    max_nn = NearestNeighbors(radius=max_distance)
    max_nn.fit(emb)
    max_admat = max_nn.radius_neighbors_graph()
    
    # remove below min edge
    max_admat[min_admat.astype('bool')] = 0
    max_admat.eliminate_zeros()
    print(max_admat.__repr__())
    
    G = nx.from_scipy_sparse_matrix(max_admat)
    return G

## Entity

In [55]:
entities = load_entities()

# for i in range(3, 30, 3):
vect = CountVectorizer()
entity_words = [' '.join(entity) for entity in entities]
entities_tfidf = vect.fit_transform(entity_words)
sample_entities_tfidf = entities_tfidf[sample_df.index]
min_edge_pct = 27
max_edge_pct = 30
G = generate_graph(sample_entities_tfidf, min_edge_pct=min_edge_pct, max_edge_pct=max_edge_pct)
resolution = 1
partition = community.best_partition(G, resolution=resolution)
cname = f'cluster__entity_count_{min_edge_pct}to{max_edge_pct}'
sample_df[cname] = pd.Series([v for k,v in partition.items()], index=sample_df.index)
merge_small_clusters(sample_df, cname, threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 15144270 stored elements in Compressed Sparse Row format>


### Doc2Vec

In [59]:
doc2vec = pd.read_csv('../data/embedding/Doc2Vec_embedding_output.csv',index_col=0)

for i in range(3, 30, 3):
    sample_doc2vec = doc2vec.loc[sample_df.index]
    
    resolution = 1
    min_edge_pct = 27
    max_edge_pct = 30
    G = generate_graph(sample_doc2vec, min_edge_pct=min_edge_pct, max_edge_pct=max_edge_pct)
    partition_doc2vec = community.best_partition(G_doc2vec, resolution=resolution)
    cname = f'cluster__doc2vec_{min_edge_pct}to{max_edge_pct}'
    sample_df['cluster__doc2vec'] = pd.Series([v for k,v in partition_doc2vec.items()], index=sample_df.index)
    merge_small_clusters(sample_df, 'cluster__doc2vec', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3084588 stored elements in Compressed Sparse Row format>
<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3084588 stored elements in Compressed Sparse Row format>
<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3084588 stored elements in Compressed Sparse Row format>
<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3084588 stored elements in Compressed Sparse Row format>


KeyboardInterrupt: 

### BoW

In [10]:
bow = np.load('../data/embedding/BagOfWord_output.npy', allow_pickle=True)
bow = np.asmatrix(bow)[0,0]
sample_bow = bow[sample_df.index]
G_bow = generate_graph(sample_bow)
resolution = 1
partition_bow = community.best_partition(G_bow, resolution=resolution)
sample_df['cluster__bow'] = pd.Series([v for k,v in partition_bow.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bow', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3043224 stored elements in Compressed Sparse Row format>


## BERT spacy

In [34]:
sample_bert_arm = pd.read_csv('../data/sample_bert.csv')
G_bert = generate_graph(sample_bert_arm)
resolution = 1
partition_bert = community.best_partition(G_bert, resolution=resolution)
sample_df['cluster__bert_clean_spacy'] = pd.Series([v for k,v in partition_bert.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bert_clean_spacy', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074448 stored elements in Compressed Sparse Row format>


In [11]:
# import pickle
# sample_bert = pickle.load(open('../data/embedding/sample_bert.p', 'rb'))

In [12]:
# G_bert = generate_graph(sample_bert)
# resolution = 1
# partition_bert = community.best_partition(G_bert, resolution=resolution)
# sample_df['cluster__bert_spacy'] = pd.Series([v for k,v in partition_bert.items()], index=sample_df.index)
# merge_small_clusters(sample_df, 'cluster__bert_spacy', threshold=5)

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074448 stored elements in Compressed Sparse Row format>


In [13]:
bert_ = pd.read_csv('../data/embedding/BERT_EMBEDDING_withindex.csv', index_col=0)

In [14]:
bert_ = bert_.set_index('article_index')

In [15]:
sample_bert = bert_.loc[sample_df.index]
sample_bert = sample_bert.fillna(0)
G_bert = generate_graph(sample_bert)
resolution = 1
partition_bert = community.best_partition(G_bert, resolution=resolution)
sample_df['cluster__bert'] = pd.Series([v for k,v in partition_bert.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__bert', threshold=5)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3074634 stored elements in Compressed Sparse Row format>


## BoW Lemmatize

In [16]:
# def add_embedding(emb, name):
#     sample_emb = emb[sample_df.index]
#     G = generate_graph(sample_emb)
#     resolution = 1
#     partition = community.best_partition(G, resolution=resolution)
#     sample_df[f'cluster__{name}'] = pd.Series([v for k,v in partition.items()], index=sample_df.index)
#     merge_small_clusters(sample_df, f'cluster__{name}', threshold=5)
    
    
# bow_lemma = np.load('../data/embedding/BagOfWord2_addlemmatization_output.npy', allow_pickle=True)
# bow_lemma = np.asmatrix(bow_lemma)[0,0]
# add_embedding(bow_lemma, 'bow_lemma')

<10140x10140 sparse matrix of type '<class 'numpy.float64'>'
	with 3076592 stored elements in Compressed Sparse Row format>


## K-Mean

In [None]:
# num_cluster = len(sample_df['cluster__bow'].unique())
# kmean = KMeans(n_clusters=num_cluster, n_jobs=7)
# kmean.fit(sample_bow)
# kmean.labels_.shape
# sample_df['cluster__kmean_bow'] = kmean.labels_

## Fasttext

In [17]:
fasttext = pd.read_csv('FASTTEXT_EMBEDDING_withindex.csv',index_col=0)
sample_fasttext = fasttext.loc[sample_df.index]
G_fasttext = generate_graph(sample_fasttext)
resolution = 1
partition_fasttext = community.best_partition(G_fasttext, resolution=resolution)
sample_df['cluster__fasttext'] = pd.Series([v for k,v in partition_fasttext.items()], index=sample_df.index)
merge_small_clusters(sample_df, 'cluster__fasttext', threshold=5)

## Evaluate

In [67]:
labeled_sampled_df = sample_df.dropna(subset=['label'])
cluster_cols = list(filter(lambda s: 'cluster__' in s, labeled_sampled_df.columns))
results = []
for col in cluster_cols:
    results += max_cluster_metric(labeled_sampled_df, col)

results_df = pd.DataFrame(results)
print_result_df(results_df)

precision
--------------------------------------------------
name         cluster__bert  cluster__bert_clean_spacy  cluster__bert_spacy  \
label                                                                        
cave_rescue       0.222222                   0.338710             0.309091   
duckboat          0.185185                   0.500000             0.236364   
helsinki          0.592593                   0.629032             0.454545   
mean              0.333333                   0.489247             0.333333   

name         cluster__bow  cluster__bow_lemma  cluster__doc2vec  \
label                                                             
cave_rescue      0.500000            0.500000              1.00   
duckboat         0.100000            0.133333              0.10   
helsinki         0.857143            0.800000              0.85   
mean             0.485714            0.477778              0.65   

name         cluster__entity_tfidf  cluster__entity_tfidf_12:15  \


In [68]:
results_df.groupby('name').mean()

Unnamed: 0_level_0,precision,recall,f_score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cluster__bert,0.333333,0.408975,0.333241
cluster__bert_clean_spacy,0.489247,0.468772,0.445083
cluster__bert_spacy,0.333333,0.483842,0.362368
cluster__bow,0.485714,0.07208,0.120139
cluster__bow_lemma,0.477778,0.087953,0.134909
cluster__doc2vec,0.65,0.106893,0.158292
cluster__entity_tfidf,0.353333,0.151847,0.200091
cluster__entity_tfidf_12:15,0.414286,0.219528,0.268035
cluster__entity_tfidf_15:18,0.401464,0.253119,0.297384
cluster__entity_tfidf_18:21,0.436733,0.232224,0.296967


In [40]:
results_df.to_csv('../data/clustering/progress_report2_metrics.csv')

In [69]:
sample_df.to_csv('../data/clustering/progress_report2_assignment_backup.csv')

In [58]:
i

27