## AIM:
------
We want to test how similar the vector representations for words that identify a stance in general (across clusters/topics) are compared to words that identify stance for a given topic (single cluster).  

We also want to see how this varies across the different BERT layers from which the vector representation was retrieved.

In [2]:
# We have to use TF-IDF to identify the keywords: classifiers trained on TF-IDF features tell us which words are indicative of stance, and those keywords are then passed through BERT to get their representations

* Get our Dataframe
* Sample
* Get representations
* Do clustering
* Training Classifiers for each cluster and getting top feats for each stance
* See how many overlap and how many are distinct
* Use large thresholds when selecting the top features, e.g. 100, 200, 500

# Table of Contents
1. [Vectorization and Clustering](#Vectorization-and-Clustering)
2. [Manual Content Analysis - Observing Words with High Frequency](#Manual-Content-Analysis---Observing-Words-with-High-Frequency)
    1. [Cluster 70 - Trump Impeachment](#Cluster-70---Trump-Impeachment)
    2. [Cluster 272 - US-MILITARY (IRAQ Situation)](#Cluster-272---US-MILITARY-(IRAQ-Situation))
    3. [Cluster 305 - US China Relations (Trade, covid 19 etc ..)](#Cluster-305---US-China-Relations-(Trade,-covid-19-etc-..))
    4. [Top Words By Stance Across All Clusters](#Top-Words-By-Stance-Across-All-Clusters)
3. [Identifying Top Words indicating stance using Classifiers](#Identifying-Top-Words-indicating-stance-using-Classifiers)

In [3]:
from general_utils import timer

from config import RANDOM_SEED

from preprocess_utils import preprocess_texts, tfidf_vectorization, dimensionality_reduction

from clustering_utils import run_clustering, get_cluster_sizes, score_cluster, get_cluster_pairs, get_pairwise_dist, cluster2doc, filter_clusters, get_top_100_clusterpairs

from data_utils import load_data, sample_data, balanced_sampling, create_train_test

from collections import Counter, defaultdict

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegressionCV

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

....... Initializing Settings ..... 
Random_Seed Chosen : 15112


In [4]:
def get_top_words(docs):
    """
    Count how often each non-stopword token occurs across a collection of documents.

    Parameters
    ----------
    docs : iterable of str
        Documents to tokenize with NLTK's ``word_tokenize``.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences summed over all documents, with NLTK's
        English stopwords left out. Use ``.most_common(k)`` to get the top-k words.
        An empty ``docs`` yields an empty Counter.
    """
    # A set gives constant-time membership tests; the raw NLTK list would be
    # scanned linearly for every single token.
    stopword_set = set(stopwords.words('english'))
    counts = Counter()

    for doc in docs:
        counts.update(tok for tok in word_tokenize(doc) if tok not in stopword_set)

    return counts

def get_top_words_by_stance(filtered_clusters,cluster_2_doc,sampled_df,top=50):
    """
    Print the most frequent non-stopword tokens over all documents of the given cluster pairs.

    Despite the name, documents are pooled here and not split by stance.

    Parameters
    ----------
    filtered_clusters : iterable of (cluster_a, cluster_b)
        Cluster pairs; only the two cluster ids of each pair are used.
    cluster_2_doc : mapping
        Cluster id -> list of positional row indices into ``sampled_df``.
    sampled_df : pandas.DataFrame
        Must provide a ``processed_text`` column.
    top : int, default 50
        How many of the most common words to print.

    Returns
    -------
    None
        The result is printed as a list of ``(word, count)`` tuples.
    """
    # The same cluster typically shows up in several pairs, e.g. (70, 121), (70, 187).
    # Visit every cluster once (in first-seen order) so that its documents are not
    # counted once per pair, which would inflate the word frequencies.
    unique_clusters = dict.fromkeys(
        cluster for pair in filtered_clusters for cluster in (pair[0], pair[1])
    )

    texts = sampled_df["processed_text"]
    all_docs_clustered = [
        texts.iloc[d] for cluster in unique_clusters for d in cluster_2_doc[cluster]
    ]

    counts = get_top_words(all_docs_clustered)
    print(counts.most_common(top))
    

def get_docs_by_stance(cluster,cluster_2_doc_map,sampled_df):
    """
    Print a manual-inspection report for one cluster, split by stance.

    The report contains, in order: up to five conservative documents, up to five
    liberal documents (title + text), the 30 most common words per stance, and the
    words that are in the top 500 of one stance but not of the other.

    Note that the "Random 5" in the printed headers is historical: the documents
    shown are the first five of each stance in ``cluster_2_doc_map[cluster]`` order.

    Parameters
    ----------
    cluster : hashable
        Key into ``cluster_2_doc_map``.
    cluster_2_doc_map : mapping
        Cluster id -> list of positional row indices into ``sampled_df``.
    sampled_df : pandas.DataFrame
        Must provide ``binary_ps`` (1 = conservative, 0 = liberal), ``title``,
        ``text`` and ``processed_text`` columns.

    Returns
    -------
    None
        Everything is printed.
    """
    documents = cluster_2_doc_map[cluster]
    stance = sampled_df["binary_ps"]
    conservative_docs = [d for d in documents if stance.iloc[d] == 1]
    liberal_docs = [d for d in documents if stance.iloc[d] == 0]

    # %-formatting (instead of string concatenation) keeps the report going when a
    # title or text is missing (NaN/None) rather than raising a TypeError.
    print("Conservative Documents - Random 5 :\n")
    for i in conservative_docs[:5]:
        print("%s\n%s\n" % (sampled_df["title"].iloc[i], sampled_df["text"].iloc[i]))

    print("Liberal Documents - Random 5 :\n")
    for i in liberal_docs[:5]:
        print("%s\n%s\n" % (sampled_df["title"].iloc[i], sampled_df["text"].iloc[i]))

    # Tokenize and count each stance once; both the top-30 listing and the
    # top-500 comparison below reuse these counters.
    texts = sampled_df["processed_text"]
    conservative_counts = get_top_words([texts.iloc[i] for i in conservative_docs])
    liberal_counts = get_top_words([texts.iloc[i] for i in liberal_docs])

    print("\nTop Words in Conservative Documents : \n")
    print(conservative_counts.most_common(30))

    print("\nTop Words in Liberal Documents : \n")
    print(liberal_counts.most_common(30))

    print("\nDifference between Top 500 Words in both Stance Categories :")
    conservative_top = set(w[0] for w in conservative_counts.most_common(500))
    liberal_top = set(w[0] for w in liberal_counts.most_common(500))
    print("\nTop Words in Conservative Stance Documents only :")
    print(conservative_top - liberal_top)
    print("\nTop Words in Liberal Stance Documents only :")
    print(liberal_top - conservative_top)


def get_general_words_per_stance_across_clusters(cluster_pairs,cluster_2_doc_map,sampled_df,top=2000):
    """
    Print the frequent words that are exclusive to one stance, pooled over all clusters.

    All documents belonging to any cluster mentioned in ``cluster_pairs`` are pooled,
    split by stance, and the ``top`` most common non-stopword tokens of each stance
    are compared. Words found in only one of the two top lists are printed.

    Parameters
    ----------
    cluster_pairs : sequence of (cluster_a, cluster_b)
        Cluster pairs; may be empty, in which case two empty sets are printed.
    cluster_2_doc_map : mapping
        Cluster id -> list of positional row indices into ``sampled_df``.
    sampled_df : pandas.DataFrame
        Must provide ``binary_ps`` (1 = conservative, 0 = liberal) and
        ``processed_text`` columns.
    top : int, default 2000
        Size of the per-stance top-word list that is compared.

    Returns
    -------
    None
        Everything is printed.
    """
    # Flatten the pairs by hand: zip(*cluster_pairs) cannot be unpacked into two
    # names when the list of pairs is empty.
    all_clusters = list(set([p[0] for p in cluster_pairs] + [p[1] for p in cluster_pairs]))

    all_related_docs = []
    for c in all_clusters:
        all_related_docs += cluster_2_doc_map[c]

    # Guard against a document index being listed under more than one cluster.
    all_related_docs = list(set(all_related_docs))

    stance = sampled_df["binary_ps"]
    texts = sampled_df["processed_text"]
    conservative_docs = [d for d in all_related_docs if stance.iloc[d] == 1]
    liberal_docs = [d for d in all_related_docs if stance.iloc[d] == 0]

    print("\nDifference between Top %s Words in both Stance Categories :" % top)
    conservative_top = set(
        w[0] for w in get_top_words([texts.iloc[i] for i in conservative_docs]).most_common(top)
    )
    liberal_top = set(
        w[0] for w in get_top_words([texts.iloc[i] for i in liberal_docs]).most_common(top)
    )

    print("\nTop Words in Conservative Stance Documents only :")
    print(conservative_top - liberal_top)
    print("\nTop Words in Liberal Stance Documents only :")
    print(liberal_top - conservative_top)
    

## Vectorization and Clustering

In [5]:
# End-to-end preparation: load -> sample -> preprocess -> TF-IDF -> reduce -> cluster.
# Every stochastic step receives RANDOM_SEED.
path = "../articles.csv"

main_df = load_data(path)

sampled_df = sample_data(df=main_df,sample_size=100000,seed=RANDOM_SEED)
print("Sampled Size: %s" %str(sampled_df.shape[0]))

# Adds the cleaned text as a new column; the word-count helpers defined above read it.
sampled_df["processed_text"] = preprocess_texts(text_lists=sampled_df["text"])

# `vectors` (TF-IDF) and `vocab` are reused later by the per-cluster stance classifiers.
vectors,vocab,tfidf_vectorizer = tfidf_vectorization(df=sampled_df,min_df=50,max_df=0.75,seed=RANDOM_SEED)

# NOTE(review): the captured log prints the same shape before and after this step
# ((100000, 16829)) although dim=500 is requested - confirm whether the reduction is
# actually applied or whether only the log message is wrong.
reduced_vectors = dimensionality_reduction(vectors=vectors,mode="SVD_LSA",dim=500,seed=RANDOM_SEED)

# Clustering runs on the reduced vectors, not on the raw TF-IDF matrix.
clusters,cluster_clf = run_clustering(vectors=reduced_vectors,seed=RANDOM_SEED,num_clusters=1000,clus_type="kmeans")

cluster_sizes = get_cluster_sizes(cluster_clf)

cluster_pair_dist_mat = get_pairwise_dist(cluster_clf,dist_type="cosine")

# num_clusters here presumably has to match the value given to run_clustering above - keep in sync.
cluster_pairs = get_cluster_pairs(num_clusters=1000)
print(len(cluster_pairs))

Index(['article_id', 'url', 'title', 'text', 'source', 'source_partisan_score',
       'tweet_id', 'tweet_screen_name', 'tweet_created_at', 'tweet_text'],
      dtype='object')
Df original shape : (921037, 10)
Df shape after dropping nan text : (919430, 10)
Df shape after dropping duplicate articles based on title : (912084, 10)
Df shape after dropping 0 stance articles : (630425, 10)

Finished running 'load_data' in 0.4244 mins


Finished running 'sample_data' in 0.0018 mins

Sampled Size: 100000
Running : select_first10
Running : to_lower
Running : remove_punc
Running : remove_small_words
Running : remove_spaces

Finished running 'preprocess_texts' in 0.3099 mins

vocab_size : 16829

Finished running 'tfidf_vectorization' in 0.1575 mins


Shape Before DIM REDUC : (100000, 16829)
Shape After DIM REDUC : (100000, 16829)

Finished running 'dimensionality_reduction' in 0.9020 mins


Running KMEANS Clustering with k=1000

Finished running 'run_clustering' in 0.1937 mins


Finished running

In [6]:
# NOTE: despite its name, this variable is used below as a cluster -> documents lookup:
# it is indexed by cluster id (doc_2_cluster_map[cluster]) and passed as `cluster_2_doc_map`.
doc_2_cluster_map = cluster2doc(num_texts=sampled_df.shape[0],cluster_labels=cluster_clf.labels_)


Finished running 'cluster2doc' in 0.0003 mins



In [7]:
# Keep only cluster pairs that pass the size and stance-balance constraints.
# Presumably min_size/max_size bound the number of documents per cluster and
# min_partisan_size is the minimum share of each stance - TODO confirm in clustering_utils.
filtered_cluster_pairs = filter_clusters(cluster_pairs=cluster_pairs,
                                        doc_2_cluster_map=doc_2_cluster_map,
                                        cluster_sizes=cluster_sizes,
                                        partisan_scores=sampled_df["binary_ps"].tolist(),
                                        min_size=400,
                                        max_size=5000,
                                        min_partisan_size=0.4)

print("Filtered CLustered Pairs : %s" %str(len(filtered_cluster_pairs)))


Finished running 'filter_clusters' in 0.0734 mins

Filtered CLustered Pairs : 91


In [8]:
# Now we need to pick a given cluster pair and go through a few documents having opposite stance

In [9]:
# Peek at the first five filtered cluster pairs to pick clusters for manual inspection
print(filtered_cluster_pairs[:5])

[(70, 121), (70, 187), (70, 272), (70, 305), (70, 308)]


# Manual Content Analysis - Observing Words with High Frequency

## Cluster 70 - Trump Impeachment

In [35]:
# Manual inspection of cluster 70: sample documents per stance, top words per stance,
# and the words that are frequent in only one stance.
get_docs_by_stance(cluster=70,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Jessica Tarlov: Trump should be impeached – Republicans who blindly support him are profiles in cowardice
When President Bill Clinton was impeached by the House of Representatives on Dec. 19, 1998 – almost exactly 21 years ago – I was 14 and paying more attention to playing basketball, boys and homework. I knew the impeachment was taking place, but didn’t follow every development.          Things are very different today. I can’t seem to stop paying attention and am oftentimes overwhelmed by the gravity of what we’re witnessing.          Though it’s my job to pay attention, the dynamics of this story – from the president’s abuse of power, to the geopolitics, to the personal stories of those who have testified – would draw anyone in.          Our reality TV president is now the subject of a reality TV impeachment filled with dramatic twists and turns – and the show isn’t over yet.          The evidence in favor of impeachment seems overwhelming to me

#### This cluster looks like it is talking about Trump Impeachment  

#### Interesting Conservative Words for this Cluster  
1) claims  
2) policies  
3) investigating  
4) accusations  

#### Interesting Liberal Words for this Cluster
1) scandal  
2) complaint  
3) treason  
4) violation  
5) conservative  

## Cluster 272 - US-MILITARY (IRAQ Situation)

In [36]:
# Manual inspection of cluster 272: sample documents per stance, top words per stance,
# and the words that are frequent in only one stance.
get_docs_by_stance(cluster=272,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Iran announces arrests over downing of Ukrainian plane
DUBAI, United Arab Emirates (AP) — Iran’s judiciary said Tuesday arrests have been made over the accidental shootdown of a Ukrainian passenger plane that killed all 176 people on board just after takeoff from Tehran.          The announcement came shortly after Iran’s president called for a special court to be set up to probe the downing last week of the plane by Iranian forces.          Judiciary spokesman Gholamhossein Esmaili was quoted by Iranian state media saying that “extensive investigations have taken place and some individuals are arrested. He did not say how many individuals have been detained or name them.          Iran, which initially dismissed allegations that a missile had brought down the jetliner, acknowledged - three days after Wednesday’s downing and in the face of mounting evidence - that its Revolutionary Guard had shot down the Ukrainian plane by mistake.          “The jud

#### This cluster looks like it is talking about US-Military (Specifically about the Iran Situation) 

#### Interesting Conservative Words for this Cluster  
1) airstrike  
2) freedom  
3) democrats  
4) embargo  
5) hezbollah
6) rebels

#### Interesting Liberal Words for this Cluster
1) threatened  
2) casualties  
3) evidence  
4) family  
5) victims  

## Cluster 305 - US China Relations (Trade, covid 19 etc ..)

In [37]:
# Manual inspection of cluster 305: sample documents per stance, top words per stance,
# and the words that are frequent in only one stance.
get_docs_by_stance(cluster=305,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

America must stop helping China’s regime grow richer and more oppressive
Suppose you had a neighbor who beat his wife, abused his children, engaged in violent crimes and routinely burgled your home. Would you invite him for Sunday brunch? Go into business with him? Share a bungalow at the beach? I don’t think so. So why are we still pretending that China is just one trade agreement away from becoming anything other than the nation-state version of the odious character I’ve described above?          Here’s an incomplete list of the nefarious activities undertaken by the ruling Communist Party of China:          Incarcerating Muslim Uighurs in “re-education” camps; colonizing Tibet; organ-harvesting from prisoners of conscience; suppressing the people of Hong Kong in violation of treaty obligations; stealing hundreds of billions of dollars of American intellectual property, including defense secrets year after year; forcing American corporations to ko

#### This cluster looks like it is talking about US-Chinese Trade war and relations

#### Interesting Conservative Words for this Cluster  
1) dollars   
2) democracy  
3) risk  
4) crackdown

#### Interesting Liberal Words for this Cluster
1) construction  
2) attack  
3) authoritarian  
4) capital  

## Top Words By Stance Across All Clusters

In [10]:
# Pool the documents of every filtered cluster and print the frequent words that
# occur in the top list of only one stance.
get_general_words_per_stance_across_clusters(cluster_pairs=filtered_cluster_pairs,
                                             cluster_2_doc_map=doc_2_cluster_map,
                                             sampled_df=sampled_df)


Difference between Top 2000 Words in both Stance Categories :

Top Words in Conservative Stance Documents only :
{'wish', 'controlled', 'discussed', 'surgery', 'gaffe', 'zoom', 'click', 'jane', 'panel', 'voter', 'claiming', 'obviously', 'hunter', 'lie', 'automatic', 'podcast', 'brain', 'breitbart', 'bars', 'fit', 'soviet', 'picked', 'receiving', 'clip', 'promote', 'meetings', 'village', 'overall', 'app', 'trillion', 'slammed', 'gang', 'incredibly', 'path', 'prior', 'behalf', 'nba', 'ice', 'christian', 'loan', 'delivery', 'fuel', 'hezbollah', 'healthcare', 'imagine', 'count', 'turns', 'rejected', 'traffic', 'train', 'direction', 'brothers', 'connecticut', 'donations', 'failing', 'driver', 'baby', 'boss', 'citizen', 'dog', 'regarding', 'losing', 'fake', 'courses', 'outlet', 'semi', 'mate', 'navy', 'handed', 'concluded', 'veteran', 'banned', 'hosted', 'commander', 'van', 'adam', 'please', 'missiles', 'mrs', 'examiner', 'basis', 'surrounding', 'stock', 'brooklyn', 'runner', 'customer', 'c

# Identifying Top Words indicating stance using Classifiers

In [11]:
# Scratch check: a negative-start slice returns the LAST n items of a list.
# Presumably a sanity check for the `feature_imp[-top_feats:]` slice used further down;
# it has no effect on the analysis and can be dropped from the final notebook.
x = [1,2,3,4]
print(x[-2:])

[3, 4]


In [12]:
# need a classifier that can identify topic and stance 
# * say we have a general classifier for stance so this gives us top coefficients for positive and negative across topics
# * say we have multiple classifiers per topic to get stance 
# * find overlap between these

In [13]:
# Train classifiers per cluster and find the top words per stance
# Check overlap and difference to identify which are topic specific and which are general

def run_keyword_identifier_clf_per_cluster(cluster_pairs,vectors,doc_2_cluster_map,sample_df,vocab,top_feats):
    """
    Train one stance classifier per cluster (topic) and collect its most indicative words.

    For every distinct cluster mentioned in ``cluster_pairs`` a cross-validated
    logistic regression is fitted on the cluster's rows of ``vectors`` with
    ``binary_ps`` as the label. Features are then ranked by coefficient.

    Parameters
    ----------
    cluster_pairs : sequence of (cluster_a, cluster_b)
        Cluster pairs; only the set of distinct cluster ids is used. May be empty.
    vectors : 2-D array-like supporting ``vectors[row_list, :]``
        Document-feature matrix, rows aligned positionally with ``sample_df``.
    doc_2_cluster_map : mapping
        Cluster id -> list of positional row indices (used as a cluster -> docs lookup).
    sample_df : pandas.DataFrame
        Must provide a ``binary_ps`` column.
    vocab : iterable of str
        Feature names, zipped with the coefficient vector.
        NOTE(review): this assumes ``vocab`` iterates in feature-column order
        (e.g. a list of feature names, not an unordered term -> index dict) - confirm.
    top_feats : int
        Maximum number of features kept per direction.

    Returns
    -------
    defaultdict
        ``result[cluster]["+ve"]`` holds up to ``top_feats`` ``(word, coeff)`` tuples with
        the largest strictly positive coefficients (largest first);
        ``result[cluster]["-ve"]`` holds up to ``top_feats`` tuples with strictly negative
        coefficients, ordered like the tail of the descending ranking (most negative last).
    """
    # Flatten the pairs by hand: zip(*cluster_pairs) cannot be unpacked when no pair
    # survived filtering.
    all_clusters = list(set([p[0] for p in cluster_pairs] + [p[1] for p in cluster_pairs]))

    top_feats_map = defaultdict(lambda : defaultdict(list))

    print("Number of Clusters after Filtering : %s\n" %str(len(all_clusters)))

    for cluster in all_clusters:

        print("\n*********************** Finding top feats for cluster : %s ***********************"%str(cluster))

        doc_ids = doc_2_cluster_map[cluster]

        X = vectors[doc_ids,:]

        labels = [sample_df["binary_ps"].iloc[d] for d in doc_ids]

        print("\nLabel Dist : \n%s"%str(Counter(labels).most_common()))

        clf = LogisticRegressionCV(Cs=[1,10,100,1000],cv=5,random_state=RANDOM_SEED,max_iter=1000,n_jobs=-1,class_weight="balanced", scoring="f1_macro")

        clf.fit(X, labels)

        # Despite the label, the value printed here is the single best fold score
        # over all folds and all C values for class 1.
        print("\nCV Folds Params : \n%s"%str(np.max(clf.scores_[1])))

        print("\nBest C : \n%s"%str(clf.C_))

        coeffs = clf.coef_[0] #(1,n_features)

        # Descending by coefficient: most class-1-leaning words first, most
        # class-0-leaning words last.
        ranked = sorted(zip(vocab, coeffs), key=lambda pair: pair[1], reverse=True)

        # Explicit tail start: ranked[-top_feats:] would return the WHOLE list for
        # top_feats == 0.
        tail_start = max(len(ranked) - top_feats, 0)

        # Keep only coefficients whose sign matches the bucket, so a zero-weight word
        # (or one of the opposite sign, when few features are non-zero) never ends up
        # labelled as indicative of a stance or listed in both buckets.
        top_feats_map[cluster]["+ve"] = [feat for feat in ranked[:top_feats] if feat[1] > 0]
        top_feats_map[cluster]["-ve"] = [feat for feat in ranked[tail_start:] if feat[1] < 0]


    return top_feats_map

def run_keyword_identifier_clf_all_cluster(cluster_pairs,vectors,doc_2_cluster_map,sample_df,vocab,top_feats):
    """
    Placeholder - not implemented yet; calling it does nothing and returns None.

    TODO: presumably meant to be the counterpart of the per-cluster routine that
    trains a single stance classifier over all clusters at once. The parameters
    mirror that routine's signature and are currently ignored.
    """
        
        
    
def get_overlap_keywords_by_stance(top_feats_map,threshold = 2):
    """
    Find words that several per-cluster classifiers agree on as stance indicators.

    Parameters
    ----------
    top_feats_map : mapping
        Cluster id -> ``{"+ve": [(word, coeff), ...], "-ve": [(word, coeff), ...]}``.
    threshold : int, default 2
        Minimum number of cluster classifiers that must list a word among their
        top features (in the same direction) for it to count as "general".

    Returns
    -------
    (pos_df, neg_df, cross_df) : tuple of pandas.DataFrame
        ``pos_df`` / ``neg_df`` have columns ``Word`` and ``clf_overlap`` (the number of
        clusters listing the word), most frequent first.
        ``cross_df`` has a single ``Word`` column, sorted alphabetically so the output
        is reproducible between runs.

    Notes
    -----
    A word that is conservative for some topics and liberal for others is handled as
    follows: if it reaches ``threshold`` in BOTH directions it is removed from
    ``pos_df`` and ``neg_df`` and reported in ``cross_df`` only. A word that reaches
    the threshold in just one direction stays in that direction's frame, even if a
    few clusters list it in the opposite direction.
    """
    pos_words = Counter()
    neg_words = Counter()

    for feats in top_feats_map.values():
        pos_words.update(feat[0] for feat in feats["+ve"])
        neg_words.update(feat[0] for feat in feats["-ve"])

    overlapping_pos_words = [p for p in pos_words.most_common() if p[1]>=threshold]
    overlapping_neg_words = [n for n in neg_words.most_common() if n[1]>=threshold]

    # Kept as a set for O(1) filtering below.
    cross_stance_words = {w[0] for w in overlapping_pos_words} & {w[0] for w in overlapping_neg_words}

    overlapping_pos_words = [w for w in overlapping_pos_words if w[0] not in cross_stance_words]
    overlapping_neg_words = [w for w in overlapping_neg_words if w[0] not in cross_stance_words]

    pos_df = pd.DataFrame(overlapping_pos_words,columns=["Word","clf_overlap"])
    neg_df = pd.DataFrame(overlapping_neg_words,columns=["Word","clf_overlap"])
    # Sorted: iterating a set of strings is not stable across interpreter runs.
    cross_df = pd.DataFrame(sorted(cross_stance_words),columns=["Word"])

    return pos_df, neg_df, cross_df

def get_topic_specific_keywords(top_feats_map,pos_df,neg_df,cross_df):
    """
    Write the topic-specific stance keywords of every cluster to two Excel workbooks.

    A cluster's top feature is considered topic specific when it is neither a
    general keyword of the same direction (``pos_df`` / ``neg_df``) nor a
    cross-stance word (``cross_df``).

    Output files (one sheet per cluster, named after the cluster id, with columns
    ``Word`` and ``Coeff_``):

    * ``topic_specific_conservative_keywords.xlsx`` - from the "+ve" features
    * ``topic_specific_liberal_keywords.xlsx``      - from the "-ve" features

    Parameters
    ----------
    top_feats_map : mapping
        Cluster id -> ``{"+ve": [(word, coeff), ...], "-ve": [(word, coeff), ...]}``.
    pos_df, neg_df, cross_df : pandas.DataFrame
        Frames with a ``Word`` column, as returned by ``get_overlap_keywords_by_stance``.

    Returns
    -------
    None
        Nothing is written when ``top_feats_map`` is empty.
    """
    # Sets make the per-word exclusion checks below O(1).
    cross_stance_words = set(cross_df["Word"].tolist())
    pos_excluded = set(pos_df["Word"].tolist()) | cross_stance_words
    neg_excluded = set(neg_df["Word"].tolist()) | cross_stance_words

    if not top_feats_map:
        # No clusters means no sheets; skip creating workbooks that would have nothing to save.
        return

    # Context managers save and close both workbooks, also when a sheet fails to write.
    # (ExcelWriter.save() is no longer available in pandas >= 2.0.)
    with pd.ExcelWriter('topic_specific_conservative_keywords.xlsx') as pos_writer, \
         pd.ExcelWriter('topic_specific_liberal_keywords.xlsx') as neg_writer:

        for cluster in top_feats_map:
            pdf = pd.DataFrame([w for w in top_feats_map[cluster]["+ve"] if w[0] not in pos_excluded], columns=["Word","Coeff_"])
            ndf = pd.DataFrame([w for w in top_feats_map[cluster]["-ve"] if w[0] not in neg_excluded], columns=["Word","Coeff_"])

            pdf.to_excel(pos_writer, index=False, sheet_name='%s'%str(cluster))
            ndf.to_excel(neg_writer, index=False, sheet_name='%s'%str(cluster))

In [14]:
# Vocabulary size; each per-cluster classifier pairs these terms with its coefficients.
print(len(vocab))

16829


In [15]:
# Classifiers are trained on the TF-IDF matrix (`vectors`), not on `reduced_vectors`,
# so that every coefficient maps back to a vocabulary term.
# top_feats=500 keeps a large candidate list per stance and cluster.
top_feats_map = run_keyword_identifier_clf_per_cluster(cluster_pairs=filtered_cluster_pairs,
                                           vectors=vectors,
                                           doc_2_cluster_map=doc_2_cluster_map,
                                           sample_df=sampled_df,
                                           vocab=vocab,
                                           top_feats=500)

Number of Clusters after Filtering : 14


*********************** Finding top feats for cluster : 931 ***********************

Label Dist : 
[(0, 236), (1, 197)]

CV Folds Params : 
0.7372222222222222

Best C : 
[100]

*********************** Finding top feats for cluster : 70 ***********************

Label Dist : 
[(0, 281), (1, 247)]

CV Folds Params : 
0.759725400457666

Best C : 
[10]

*********************** Finding top feats for cluster : 423 ***********************

Label Dist : 
[(1, 384), (0, 370)]

CV Folds Params : 
0.7345288326300985

Best C : 
[1]

*********************** Finding top feats for cluster : 939 ***********************

Label Dist : 
[(0, 250), (1, 168)]

CV Folds Params : 
0.7144963144963146

Best C : 
[10]

*********************** Finding top feats for cluster : 942 ***********************

Label Dist : 
[(1, 347), (0, 243)]

CV Folds Params : 
0.806845238095238

Best C : 
[10]

*********************** Finding top feats for cluster : 272 *********************

In [17]:
# A word counts as a general (cross-topic) stance keyword when at least 5 of the
# per-cluster classifiers list it among their top features for the same stance.
pos_df, neg_df, cross_df = get_overlap_keywords_by_stance(top_feats_map,threshold = 5)

In [18]:
# General keywords taken from the "-ve" (negative-coefficient) feature lists
neg_df

Unnamed: 0,Word,clf_overlap
0,reunite,12
1,rough,12
2,reluctant,12
3,apologizing,12
4,tough,11
...,...,...
106,religions,5
107,hike,5
108,triggered,5
109,codes,5


In [19]:
# "-ve" coefficients point towards binary_ps == 0, which this notebook treats as liberal.
neg_df.to_csv("liberal_keywords_general_across_topics.csv",index=False)

In [20]:
# General keywords taken from the "+ve" (positive-coefficient) feature lists
pos_df

Unnamed: 0,Word,clf_overlap
0,paso,12
1,harness,12
2,brawl,9
3,goers,9
4,haunted,9
...,...,...
85,october,5
86,closed,5
87,murray,5
88,suspected,5


In [21]:
# "+ve" coefficients point towards binary_ps == 1, which this notebook treats as conservative.
pos_df.to_csv("conservative_keywords_general_across_topics.csv",index=False)

In [22]:
# Words that reached the threshold in both the "+ve" and the "-ve" direction
cross_df

Unnamed: 0,Word
0,classification
1,assemblyman


In [23]:
# Persist the words whose stance direction differs between topics.
cross_df.to_csv("word_that_shift_stance_based_on_topic.csv",index=False)

In [24]:
# Writes two Excel workbooks (conservative / liberal) with one sheet per cluster,
# containing the top features that are not general or cross-stance keywords.
get_topic_specific_keywords(top_feats_map,pos_df,neg_df,cross_df)

In [None]:
def get_articles_by_word(word,sample_df,cluster_2_doc_map,filtered_clustered_pairs):
    """
    Placeholder - not implemented yet; calling it does nothing and returns None.

    Intended behaviour: given a word, return the documents of the clustered
    collection that contain it, matching against the preprocessed text.
    """

def get_bert_reps_(docs,word):
    """
    Placeholder - not implemented yet; calling it does nothing and returns None.

    TODO: presumably meant to return the BERT representation(s) of ``word`` as it
    occurs in ``docs`` - confirm the intended output format before implementing.
    """