To Find:
* The topics each cluster discusses
* Top Shared Terms between Topics
* Top Topic Specific Terms 

In [24]:
from attm_data_utils import get_train_test_attm
import random
import pandas as pd
from attm_utils import load_pickle
from collections import Counter

In [5]:
random.seed(42)

In [6]:
cps = load_pickle('att_pickle_objs_input/top100.pickle')
data = load_pickle("att_pickle_objs_input/clean_df.pickle")
doc_2_cluster_map = load_pickle('att_pickle_objs_input/d2c_map.pickle')

In [29]:
def check_cluster_pair(df,cp,doc_2_cluster_map,num_articles=5):
    """
    Print 5 Documents from each cluster
    """
    c1_df, c2_df = get_train_test_attm(df,cp,doc_2_cluster_map,neg_sample_size=3,single_task=True)
    c1_df["processed_all"] = c1_df["processed_title"] + "\n" + c1_df["processed_text"]
    c2_df["processed_all"] = c2_df["processed_title"] + "\n" + c2_df["processed_text"]
    print(c1_df.columns)
    print(c2_df.columns)
    
    print("\nRandom %s Articles from Cluster 1 : \n"%str(num_articles))
    c1_5_arts = random.sample([a for a in zip(c1_df["title"].tolist(),c1_df["text"].tolist())],num_articles)
    for art in c1_5_arts:
        print(art[0])
        print("\n")
        print(art[1])
        print("\n\n")
    
    print("\nRandom %s Articles from Cluster 2 : \n"%str(num_articles))
    c2_5_arts = random.sample([a for a in zip(c2_df["title"].tolist(),c2_df["text"].tolist())],num_articles)
    for art in c2_5_arts:
        print(art[0])
        print("\n")
        print(art[1])
        print("\n\n")

def get_terms(text_list):
    
    token_counts = Counter()
    for text in text_list:
        tokens = text.split(" ")
        token_counts.update(tokens)
    return token_counts

def get_top_terms(df,cp,doc_2_cluster_map):
    """
    Top Terms per cluster that are not shared and that are shared between clusters
    """
    c1_df, c2_df = get_train_test_attm(df,cp,doc_2_cluster_map,neg_sample_size=3,single_task=True)
    c1_df["processed_all"] = c1_df["processed_title"] + " " + c1_df["processed_text"]
    c2_df["processed_all"] = c2_df["processed_title"] + " " + c2_df["processed_text"]
    
    c1_terms = get_terms(text_list=c1_df["processed_all"].tolist())
    c2_terms = get_terms(text_list=c2_df["processed_all"].tolist())
    
    overlap_terms = set(c1_terms.keys()).intersection(c2_terms.keys())
    
    c1_specific_terms = {}
    c2_specific_terms = {}
    
    for term in c1_terms.keys():
        if term not in overlap_terms:
            c1_specific_terms[term] =c1_terms[term]
    
    for term in c2_terms.keys():
        if term not in overlap_terms:
            c2_specific_terms[term] =c2_terms[term]
    
    overlap_terms_freq_tups = [(a,c1_terms[a]+c2_terms[a]) for a in overlap_terms]
    c1_specific_terms_freq_tups = [a for a in c1_specific_terms.items()]
    c2_specific_terms_freq_tups = [a for a in c2_specific_terms.items()]
    
    print("\nTop 50 Overlapping Terms : \n %s"%str(sorted(overlap_terms_freq_tups,key=lambda x:x[1],reverse=True)[:50]))
    print("\nTop 50 Terms Specific in C1 : \n%s"%str(sorted(c1_specific_terms_freq_tups,key=lambda x:x[1],reverse=True)[:50]))
    print("\nTop 50 Terms Specific in C2 : \n%s"%str(sorted(c2_specific_terms_freq_tups,key=lambda x:x[1],reverse=True)[:50]))
    
    

In [23]:
check_cluster_pair(df=data,cp=(14,44),doc_2_cluster_map=doc_2_cluster_map,num_articles=5)

Original Train Shape : (1394, 9)
Original Test Shape : (1189, 9)
Index(['title', 'text', 'source_partisan_score', 'binary_ps', 'processed_text',
       'processed_title', 'context_word_pos', 'context_word_neg', 'drop_',
       'processed_all'],
      dtype='object')
Index(['title', 'text', 'source_partisan_score', 'binary_ps', 'processed_text',
       'processed_title', 'context_word_pos', 'context_word_neg', 'drop_',
       'processed_all'],
      dtype='object')

Random 5 Articles from Cluster 1 : 

Judge dismisses suit claiming Trump collusion with Russian hackers, WikiLeaks in DNC breach


A federal judge in D.C. has dismissed a lawsuit alleging President Trump’s election campaign conspired with Russian hackers and WikiLeaks to publish stolen Democratic National Committee documents during the 2016 race.          U.S. District Judge Ellen Segal Huvelle ruled Tuesday in favor of tossing out the invasion of privacy lawsuit over lack of jurisdiction, writing in a 45-page opinion that t

In [30]:
get_top_terms(df=data,cp=(14,44),doc_2_cluster_map=doc_2_cluster_map)

Original Train Shape : (1394, 9)
Original Test Shape : (1189, 9)

Top 50 Overlapping Terms : 
 [('the', 51898), ('and', 20035), ('that', 12846), ('trump', 8466), ('for', 8082), ('with', 6118), ('his', 5676), ('was', 5611), ('president', 5405), ('said', 4368), ('from', 4073), ('has', 4006), ('who', 3545), ('have', 3443), ('house', 3283), ('not', 3253), ('', 3173), ('this', 2884), ('about', 2880), ('are', 2654), ('campaign', 2617), ('biden', 2491), ('but', 2316), ('her', 2281), ('they', 2155), ('had', 2154), ('former', 2127), ('she', 2118), ('impeachment', 2023), ('their', 1952), ('after', 1939), ('which', 1899), ('democrats', 1845), ('been', 1805), ('new', 1804), ('would', 1776), ('were', 1739), ('white', 1637), ('more', 1612), ('democratic', 1582), ('one', 1510), ('into', 1487), ('also', 1480), ('will', 1463), ('committee', 1463), ('over', 1425), ('against', 1405), ('out', 1403), ('donald', 1400), ('election', 1318)]

Top 50 Terms Specific in C1 : 
[('slotkin', 19), ('ratcliffe', 19), 