Cluster_Analysis:
* From what i have seen here, a lot of unrelated documents are getting sampled from our overall dataset , increasing noise
* Might have to run the relevance classifier to filter out non-relevant documents and then sample from the resulting set of articles
* Below we can see the article clusters and the topics they discuss, we see a lot of clusters have non-related articles (like general crime, polls , etcc)
* This might make it very hard to try to identify terms that are indicative of political stance

In [1]:
from general_utils import timer

from config import RANDOM_SEED

from preprocess_utils import preprocess_texts, tfidf_vectorization, dimensionality_reduction

from clustering_utils import run_clustering, get_cluster_sizes, score_cluster, get_cluster_pairs, get_pairwise_dist, cluster2doc, filter_clusters, get_top_100_clusterpairs

from data_utils import load_data, sample_data, balanced_sampling, create_train_test

from collections import Counter, defaultdict

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegressionCV

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

....... Initializing Settings ..... 
Random_Seed Chosen : 15112


In [40]:
def get_top_words(docs):
    """
    """
    stopwords_list = stopwords.words('english')
    counts = Counter()
    
    for d in docs:
        counts.update([w for w in word_tokenize(d) if w not in stopwords_list])
    
    return counts

def get_top_words_by_stance(filtered_clusters,cluster_2_doc,sampled_df,top=50):
    """
    """
    all_docs_clustered=[]
    for cp in filtered_clusters:
        all_docs_clustered += [sampled_df["processed_text"].iloc[d] for d in cluster_2_doc[cp[0]]]
        all_docs_clustered += [sampled_df["processed_text"].iloc[d] for d in cluster_2_doc[cp[1]]]
    
    counts = get_top_words(all_docs_clustered)
    print(counts.most_common(top))
    

def get_docs_by_stance(cluster,cluster_2_doc_map,sampled_df):
    """
    """
    documents = cluster_2_doc_map[cluster]
    conservative_docs = [d for d in documents if sampled_df["binary_ps"].iloc[d] == 1]
    liberal_docs = [d for d in documents if sampled_df["binary_ps"].iloc[d] == 0]
    
    print("Conservative Documents - Random 5 :\n")
    for i in conservative_docs[:5]:
        print(sampled_df["title"].iloc[i] + "\n" +sampled_df["text"].iloc[i]+"\n")
    
    print("Liberal Documents - Random 5 :\n")
    for i in liberal_docs[:5]:
        print(sampled_df["title"].iloc[i] + "\n" +sampled_df["text"].iloc[i]+"\n")
    
    print("\nTop Words in Conservative Documents : \n")
    print(get_top_words([sampled_df["processed_text"].iloc[i] for i in conservative_docs]).most_common(30))
    
    print("\nTop Words in Liberal Documents : \n")
    print(get_top_words([sampled_df["processed_text"].iloc[i] for i in liberal_docs]).most_common(30))
    
    print("\nDifference between Top 500 Words in both Stance Categories :")
    conserv_top_100 = [w[0] for w in get_top_words([sampled_df["processed_text"].iloc[i] for i in conservative_docs]).most_common(500)]
    liberal_top_100 = [w[0] for w in get_top_words([sampled_df["processed_text"].iloc[i] for i in liberal_docs]).most_common(500)]
    print("\nTop Words in Conservative Stance Documents only :")
    print(set(conserv_top_100) - (set(liberal_top_100)))
    print("\nTop Words in Liberal Stance Documents only :")
    print(set(liberal_top_100) - (set(conserv_top_100)))

def get_overlapping_cluster_terms(cluster_pairs,cluster_2_doc_map,sampled_df,overlap_coef=3):
    """
    """
    all_cluster1, all_cluster2 = zip(*cluster_pairs)
    
    all_clusters = list(set(all_cluster1 + all_cluster2))
    
    all_related_docs = []
    
    cluster_top_terms_map = defaultdict(lambda : defaultdict(list))
    
    for c in all_clusters:
        cluster_docs = cluster_2_doc_map[c]
        conserv_docs = [d for d in cluster_docs if sampled_df["binary_ps"].iloc[d] == 1]
        liberal_docs = [d for d in cluster_docs if sampled_df["binary_ps"].iloc[d]==0]
        
        top_conserv_100 = [x[0] for x in get_top_words([sampled_df["processed_text"].iloc[i] for i in conserv_docs]).most_common(1000)]
        top_liberal_100 = [x[0] for x in get_top_words([sampled_df["processed_text"].iloc[i] for i in liberal_docs]).most_common(1000)]
        
        cluster_top_terms_map[c]["conservative"] = top_conserv_100
        cluster_top_terms_map[c]["liberal"] = top_liberal_100
    
    conserv_counter = Counter()
    liberal_counter = Counter()
    
    for c in all_clusters:
        conserv_counter.update(cluster_top_terms_map[c]["conservative"])
        liberal_counter.update(cluster_top_terms_map[c]["liberal"])
    
    top_1000_conserv = conserv_counter.most_common(1000)
    top_1000_liberal = liberal_counter.most_common(1000)
    
    conserv_only = sorted(list(set(top_1000_conserv) - set(top_1000_liberal)),key=lambda x: x[1],reverse=True)
    liberal_only = sorted(list(set(top_1000_liberal) - set(top_1000_conserv)),key=lambda x: x[1], reverse=True)
    
    print("Top Overlapping Conservative Only Terms : \n")
    print(conserv_only)
    print("Top Overlapping Liberal Only Terms : \n")
    print(liberal_only)
        
    liberal_df = pd.DataFrame(liberal_only,columns=["Word","Cluster_Overlap"])
    conserv_df = pd.DataFrame(conserv_only,columns=["Word","Cluster_Overlap"])
    
    liberal_df.to_csv("Liberal_Word_Overlap_btw_clusters.csv",index=False)
    conserv_df.to_csv("Conserv_Word_Overlap_btw_clusters.csv",index=False)
    

In [22]:
# RANDOM_SEED = 2345902324

In [23]:
path = "../articles.csv"

main_df = load_data(path)

sampled_df = sample_data(df=main_df,sample_size=100000,seed=RANDOM_SEED)
print("Sampled Size: %s" %str(sampled_df.shape[0]))

sampled_df["processed_text"] = preprocess_texts(text_lists=sampled_df["text"])

vectors,vocab,tfidf_vectorizer = tfidf_vectorization(df=sampled_df,min_df=50,max_df=0.75,seed=RANDOM_SEED)

# reduced_vectors = dimensionality_reduction(vectors=vectors,mode="SVD_LSA",dim=500,seed=RANDOM_SEED)
reduced_vectors = vectors

clusters,cluster_clf = run_clustering(vectors=reduced_vectors,seed=RANDOM_SEED,num_clusters=100,clus_type="kmeans")

cluster_sizes = get_cluster_sizes(cluster_clf)

# cluster_pair_dist_mat = get_pairwise_dist(cluster_clf,dist_type="cosine")

cluster_pairs = get_cluster_pairs(num_clusters=100)
print(len(cluster_pairs))

Index(['article_id', 'url', 'title', 'text', 'source', 'source_partisan_score',
       'tweet_id', 'tweet_screen_name', 'tweet_created_at', 'tweet_text'],
      dtype='object')
Df original shape : (921037, 10)
Df shape after dropping nan text : (919430, 10)
Df shape after dropping duplicate articles based on title : (912084, 10)
Df shape after dropping 0 stance articles : (630425, 10)

Finished running 'load_data' in 0.3871 mins


Finished running 'sample_data' in 0.0142 mins

Sampled Size: 100000
Running : select_first10
Running : to_lower
Running : remove_punc
Running : remove_small_words
Running : remove_spaces

Finished running 'preprocess_texts' in 0.2888 mins

vocab_size : 16792

Finished running 'tfidf_vectorization' in 0.1504 mins


Running KMEANS Clustering with k=100

Finished running 'run_clustering' in 0.1059 mins


Finished running 'get_cluster_sizes' in 0.0002 mins


Finished running 'get_pairwise_dist' in 0.0001 mins


Number of Cluster Pairs : 4950

Finished running 'ge

In [42]:
cluster_sizes

Counter({25: 12677,
         13: 573,
         95: 946,
         96: 854,
         78: 1295,
         14: 883,
         24: 4495,
         66: 263,
         71: 2317,
         2: 576,
         4: 1233,
         59: 1362,
         67: 645,
         84: 1171,
         30: 1578,
         7: 1493,
         52: 843,
         31: 385,
         94: 736,
         50: 13200,
         54: 1531,
         77: 3266,
         86: 870,
         27: 1178,
         64: 722,
         53: 1926,
         72: 570,
         46: 623,
         0: 589,
         91: 330,
         62: 2122,
         21: 929,
         90: 715,
         43: 328,
         68: 1887,
         87: 1006,
         99: 587,
         88: 1223,
         39: 360,
         70: 411,
         33: 694,
         81: 694,
         9: 516,
         3: 284,
         75: 345,
         20: 583,
         17: 2705,
         82: 235,
         22: 583,
         51: 1010,
         1: 762,
         79: 349,
         18: 1437,
         19: 1141,
         49

In [24]:
doc_2_cluster_map = cluster2doc(num_texts=sampled_df.shape[0],cluster_labels=cluster_clf.labels_)


Finished running 'cluster2doc' in 0.0004 mins



In [25]:
filtered_cluster_pairs = filter_clusters(cluster_pairs=cluster_pairs,
                                        doc_2_cluster_map=doc_2_cluster_map,
                                        cluster_sizes=cluster_sizes,
                                        partisan_scores=sampled_df["binary_ps"].tolist(),
                                        min_size=200,
                                        max_size=5000,
                                        min_partisan_size=0.4)

print("Filtered CLustered Pairs : %s" %str(len(filtered_cluster_pairs)))


Finished running 'filter_clusters' in 0.0151 mins

Filtered CLustered Pairs : 903


In [26]:
# Check 5 cluster pairs
print(filtered_cluster_pairs[:5])

[(0, 1), (0, 5), (0, 7), (0, 13), (0, 14)]


In [27]:
clusters = []
for cp in filtered_cluster_pairs:
    clusters.append(cp[0])
    clusters.append(cp[1])

print(set(clusters))
print(len(set(clusters)))

{0, 1, 5, 7, 13, 14, 16, 17, 18, 20, 22, 23, 30, 31, 33, 35, 38, 40, 41, 43, 44, 48, 54, 55, 59, 62, 63, 64, 66, 69, 71, 75, 76, 77, 82, 83, 88, 89, 92, 95, 96, 97, 99}
43


In [41]:
get_overlapping_cluster_terms(cluster_pairs=filtered_cluster_pairs,
                              cluster_2_doc_map=doc_2_cluster_map,
                              sampled_df=sampled_df,
                              overlap_coef=3)

Top Overlapping Conservative Only Terms : 

[('today', 43), ('help', 43), ('believe', 43), ('office', 43), ('around', 43), ('past', 43), ('show', 43), ('following', 43), ('four', 43), ('years', 43), ('think', 43), ('reported', 43), ('support', 43), ('taking', 43), ('latest', 43), ('use', 43), ('million', 43), ('using', 43), ('daily', 43), ('media', 43), ('however', 43), ('set', 42), ('washington', 42), ('person', 42), ('based', 42), ('fox', 42), ('open', 42), ('order', 42), ('decision', 42), ('whether', 42), ('began', 42), ('little', 42), ('trying', 42), ('seen', 42), ('continued', 42), ('ago', 42), ('number', 42), ('others', 42), ('american', 42), ('government', 42), ('chief', 42), ('old', 42), ('behind', 42), ('white', 42), ('000', 42), ('move', 42), ('much', 42), ('given', 42), ('clear', 42), ('high', 42), ('got', 42), ('top', 42), ('member', 41), ('general', 41), ('claimed', 41), ('stop', 41), ('give', 41), ('led', 41), ('related', 41), ('social', 41), ('twitter', 41), ('found', 41

In [28]:
get_docs_by_stance(cluster=0,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Oklahoma couple accused of murdering teen as ‘payback’ for STD
A twisted couple in Oklahoma has been charged with murdering a 17-year-old girl as “payback” over a sexually transmitted disease, court documents show.          Andrew Hall, 30, and Cheyenne Blalock, 17, were charged Friday with first-degree murder in the death of Kirstan Patterson, who was found shot in the head Wednesday just hours after she was reported missing, KOTV reported.          “Be very careful who you hang out with and who you run around [with], because it can make a difference,” Mayes County Sheriff Mike Reed told the station. “I promise you your parents would rather get up at 1 in the morning and come and give you a ride home than something detrimental happen to you.”          Blalock told investigators that Hall had discussed killing Patterson as “payback” for a sexually transmitted disease and told her to text the teen to meet up with them on New Year’s Day, court documen

In [29]:
get_docs_by_stance(cluster=1,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Safety commission has dubbed crib bumpers a suffocation hazard
A common adornment of the well-apportioned baby crib — protective cloth “bumpers” — have been linked to dozens of accidental infant suffocation deaths in recent decades, a new investigation revealed.          Padded crib bumpers have been implicated by medical examiners in at least 35 baby deaths across the country, the Washington Post reported.          The National Institutes of Health, American Academy of Pediatrics and the Centers for Disease Control and Prevention all have warned the public against using bumpers for years, the paper reported.          Still, the US Consumer Product Safety Commission has failed, until now, to rule that padded bumpers were to blame in the deaths.          The commission has a new acting chairman — longtime commissioner Robert Adler — who told the paper he plans to invite outside experts to come in and debate the safety of crib bumpers at a public hear

In [30]:
get_docs_by_stance(cluster=7,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

FBI Discovers Homegrown Islamic Terror Compound In Alabama
The FBI has uncovered a homegrown, jihadist compound in Macon County, Alabama.          The FBI‘s search warrant described the property as a “makeshift military-style obstacle course” in a story first reported by Sinclair Broadcast Group. The land where the group gathered reportedly looked like an “abandoned dump,” and was led by Siraj Wahhaj, who allegedly trained children to commit school shootings in a similar terrorist breeding ground in New Mexico last year. (RELATED: After Dropped Charges, FBI Re-Arrests Five ‘Extremist Muslim’ New Mexico Compound Suspects)          Wahhaj and four other alleged Islamic extremists were indicted on terrorism, kidnapping, and firearm violation charges earlier this year. (RELATED: Convicted Islamic Terrorist Vallmoe Shqaire Held US Citizenship, Lived In Country For Years)                     In an interview with Sinclair Broadcast Group, former FBI agent 

In [31]:
get_docs_by_stance(cluster=14,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Bernie Sanders praised segregationist George Wallace as 'sensitive' in 1972
Seven years after Martin Luther King, Jr. referred to George Wallace as "perhaps the most dangerous racist in America today," a young Bernie Sanders praised the segregationist Alabama governor.          In an interview with the Brattleboro Reformer in 1972, Sanders, then 31, said Wallace "advocates some outrageous approaches to our problems, but at least he is sensitive to what people feel they need."          Sanders, now a Vermont senator and 2020 Democrat, said, "What we need are more active politicians working for the people."          The 1972 remarks surprised the interviewer at the time, who wrote that "even though [Sanders] has been labeled a 'leftist radical' by some persons, Sanders had some praise for [Wallace]."          On other occasions, Sanders was more critical of Wallace and warned about the allure of white identity politics.          At the time, Sanders w

In [32]:
get_docs_by_stance(cluster=30,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Elizabeth Warren says Biden’s response to Tara Reade claims were ‘credible and convincing’
Sen. Elizabeth Warren on Monday said that Joe Biden’s response to the sexual assault allegation levied against him by a former aide was “credible and convincing,” according to reports.          “I saw the reports of what Ms. Reade said, I saw an interview with Vice President Biden. I appreciate that the vice president took a lot of questions, tough questions. And he answered them directly and respectfully,” Warren said, according to Fox News and a CNN reporter.          “The vice president’s answers were credible and convincing,” she added.          The Massachusetts lawmaker was referring to Biden’s interview Friday on MSNBC’s “Morning Joe,” where he made his first public statement on Tara Reade’s allegation that he sexually assaulted her in a Senate hallway in 1993 when she worked for him.          “It is not true. I’m saying it unequivocally – it never happ