In [2]:
from general_utils import timer

from config import RANDOM_SEED

from preprocess_utils import preprocess_texts, tfidf_vectorization, dimensionality_reduction

from clustering_utils import run_clustering, get_cluster_sizes, score_cluster, get_cluster_pairs, get_pairwise_dist, cluster2doc, filter_clusters, get_top_100_clusterpairs

from data_utils import load_data, sample_data, balanced_sampling, create_train_test

from collections import Counter, defaultdict

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegressionCV

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
import pickle

Data:  
1) Relevant with topic labels (some have no topic labels) -> labeled_political_articles (around 219824 articles)  
2) Balanced Topic labeled data - 40000 articles  

To:Do:  
1) Cluster the first data  
2) don't cluster the 2nd set of data  

Steps:  
1) analyze the clusters  

2) check overlapping words  

3) run classifiers for each cluster -> get top feats  

4) get overlap feats, independent feats  

5) for each bert layer and each term:  

    1) get the bert representation for these words for each cluster (avg of all documents containing that specific word)  
    2) get the topic pairs / cluster pairs similarity -> cosine(c1,c2)  
6) average over all terms for each stance maybe ?  
7) plot the avg similarity vs layer  

In [3]:
def get_top_words(docs):
    """
    """
    stopwords_list = stopwords.words('english')
    counts = Counter()
    
    for d in docs:
        counts.update([w for w in word_tokenize(d) if w not in stopwords_list])
    
    return counts

def get_top_words_by_stance(filtered_clusters,cluster_2_doc,sampled_df,top=50):
    """
    """
    all_docs_clustered=[]
    for cp in filtered_clusters:
        all_docs_clustered += [sampled_df["processed_text"].iloc[d] for d in cluster_2_doc[cp[0]]]
        all_docs_clustered += [sampled_df["processed_text"].iloc[d] for d in cluster_2_doc[cp[1]]]
    
    counts = get_top_words(all_docs_clustered)
    print(counts.most_common(top))
    

def get_docs_by_stance(cluster,cluster_2_doc_map,sampled_df):
    """
    """
    documents = cluster_2_doc_map[cluster]
    conservative_docs = [d for d in documents if sampled_df["binary_ps"].iloc[d] == 1]
    liberal_docs = [d for d in documents if sampled_df["binary_ps"].iloc[d] == 0]
    
    print("Conservative Documents - Random 5 :\n")
    for i in conservative_docs[:5]:
        print(sampled_df["title"].iloc[i] + "\n" +sampled_df["text"].iloc[i]+"\n")
    
    print("Liberal Documents - Random 5 :\n")
    for i in liberal_docs[:5]:
        print(sampled_df["title"].iloc[i] + "\n" +sampled_df["text"].iloc[i]+"\n")
    
    print("\nTop Words in Conservative Documents : \n")
    print(get_top_words([sampled_df["processed_text"].iloc[i] for i in conservative_docs]).most_common(30))
    
    print("\nTop Words in Liberal Documents : \n")
    print(get_top_words([sampled_df["processed_text"].iloc[i] for i in liberal_docs]).most_common(30))
    
    print("\nDifference between Top 500 Words in both Stance Categories :")
    conserv_top_100 = [w[0] for w in get_top_words([sampled_df["processed_text"].iloc[i] for i in conservative_docs]).most_common(500)]
    liberal_top_100 = [w[0] for w in get_top_words([sampled_df["processed_text"].iloc[i] for i in liberal_docs]).most_common(500)]
    print("\nTop Words in Conservative Stance Documents only :")
    print(set(conserv_top_100) - (set(liberal_top_100)))
    print("\nTop Words in Liberal Stance Documents only :")
    print(set(liberal_top_100) - (set(conserv_top_100)))

def get_overlapping_cluster_terms(cluster_pairs,cluster_2_doc_map,sampled_df,overlap_coef=3):
    """
    """
    all_cluster1, all_cluster2 = zip(*cluster_pairs)
    
    all_clusters = list(set(all_cluster1 + all_cluster2))
    
    all_related_docs = []
    
    cluster_top_terms_map = defaultdict(lambda : defaultdict(list))
    
    for c in all_clusters:
        cluster_docs = cluster_2_doc_map[c]
        conserv_docs = [d for d in cluster_docs if sampled_df["binary_ps"].iloc[d] == 1]
        liberal_docs = [d for d in cluster_docs if sampled_df["binary_ps"].iloc[d]==0]
        
        top_conserv_100 = [x[0] for x in get_top_words([sampled_df["processed_text"].iloc[i] for i in conserv_docs]).most_common(1000)]
        top_liberal_100 = [x[0] for x in get_top_words([sampled_df["processed_text"].iloc[i] for i in liberal_docs]).most_common(1000)]
        
        cluster_top_terms_map[c]["conservative"] = top_conserv_100
        cluster_top_terms_map[c]["liberal"] = top_liberal_100
    
    conserv_counter = Counter()
    liberal_counter = Counter()
    
    for c in all_clusters:
        conserv_counter.update(cluster_top_terms_map[c]["conservative"])
        liberal_counter.update(cluster_top_terms_map[c]["liberal"])
    
    top_1000_conserv = [x[0] for x in conserv_counter.most_common(500)]
    top_1000_liberal = [x[0] for x in liberal_counter.most_common(500)]
    
    conserv_only = sorted(list(set(top_1000_conserv) - set(top_1000_liberal)),key=lambda x: x[1],reverse=True)
    liberal_only = sorted(list(set(top_1000_liberal) - set(top_1000_conserv)),key=lambda x: x[1], reverse=True)
    
    print("Top Overlapping Conservative Only Terms : \n")
    print(conserv_only)
    print("Top Overlapping Liberal Only Terms : \n")
    print(liberal_only)
    
    conserv_only = [(x,conserv_counter[x]) for x in conserv_only]
    liberal_only = [(x,liberal_counter[x]) for x in liberal_only]
        
    liberal_df = pd.DataFrame(liberal_only,columns=["Word","Cluster_Overlap"])
    conserv_df = pd.DataFrame(conserv_only,columns=["Word","Cluster_Overlap"])
    
    liberal_df.to_csv("Liberal_Word_Overlap_btw_clusters_rel_data.csv",index=False)
    conserv_df.to_csv("Conserv_Word_Overlap_btw_clusters_rel_data.csv",index=False)
    

In [4]:
def load_pkl_file(file_path):
    """
    """
    article_dict = pickle.load(open(file_path,"rb"))
    articles = []
    for article in article_dict.keys():
        articles.append(article_dict[article])
    
    print("Number of Articles : %s"%str(len(articles)))
    articles_df = pd.DataFrame(articles)
    print("Shape Before Processing : %s" %str(articles_df.shape))
    #drop columns
    articles_df.drop(columns=["article_id",
                              "url",
                              "source",
                              "tweet_id",
                              "tweet_text",
                              "kws_label",
                              "cls_label",
                              "tweet_screen_name",
                              "tweet_created_at"],inplace=True)
    #reset index
    articles_df.reset_index(inplace=True,drop=True)
    #drop partisan of 0.0
    articles_df = articles_df.loc[articles_df["source_partisan_score"] != 0.0]
    articles_df["binary_ps"] = articles_df["source_partisan_score"].apply(lambda x: 1 if x>0 else 0)
    print("Shape after dropping Neutral Articles : %s" %str(articles_df.shape))
    print(articles_df.columns)
    return articles_df

In [5]:
path =  "../labeled_political_articles.pkl"

articles_df = load_pkl_file(path)

sampled_df = sample_data(df=articles_df,sample_size=100000,seed=RANDOM_SEED)
print("Sampled Size: %s" %str(sampled_df.shape[0]))


Number of Articles : 219824
Shape Before Processing : (219824, 12)
Shape after dropping Neutral Articles : (170405, 4)
Index(['title', 'text', 'source_partisan_score', 'binary_ps'], dtype='object')

Finished running 'sample_data' in 0.0007 mins

Sampled Size: 100000


In [6]:
sampled_df.head(20)

Unnamed: 0,title,text,source_partisan_score,binary_ps
0,Top general warns that 'divisiveness leads to ...,(medianame) America's most senior general warn...,-1.0,0
1,How the US government created a fake universit...,The Department of Homeland Security created a ...,-1.0,0
2,It's Time To Say It: Trump Is Handling COVID-1...,US President Donald Trump is handling the coro...,-2.0,0
3,Trump's draconian border lockdown has a new ta...,"For the past year, the bridges that cross from...",-2.0,0
4,Supreme Court clears way for execution of fede...,"TERRE HAUTE, Ind. (AP) — The Trump administrat...",1.0,1
5,"After testy call with Trump over border wall, ...",One Mexican official said Trump “lost his temp...,-1.0,0
6,"Opinion | If Congress had any pride, it would ...","This pertains to the almost 800,000 “dreamers”...",-1.0,0
7,Liz Cheney on Ukraine: 'Starting to Seem Like ...,Rep. Liz Cheney (R-WY) on Monday said an intel...,2.0,1
8,ThinkProgress Smears Dan Crenshaw on ‘Universa...,"At ThinkProgress, Josh Israel miscasts Dan Cre...",2.0,1
9,Pompeo recalls Afghan peace envoy after Trump ...,Secretary of State Mike Pompeo said Sunday tha...,1.0,1


In [7]:
sampled_df["processed_text"] = preprocess_texts(text_lists=sampled_df["text"])
vectors,vocab,tfidf_vectorizer = tfidf_vectorization(df=sampled_df,min_df=50,max_df=0.75,seed=RANDOM_SEED)
# reduced_vectors = dimensionality_reduction(vectors=vectors,mode="SVD_LSA",dim=500,seed=RANDOM_SEED)
reduced_vectors = vectors
clusters,cluster_clf = run_clustering(vectors=reduced_vectors,seed=RANDOM_SEED,num_clusters=100,clus_type="kmeans")
cluster_sizes = get_cluster_sizes(cluster_clf)
cluster_pairs = get_cluster_pairs(num_clusters=100)
print(len(cluster_pairs))

Running : select_first10
Running : to_lower
Running : remove_punc
Running : remove_small_words
Running : remove_spaces

Finished running 'preprocess_texts' in 0.3175 mins

vocab_size : 15008

Finished running 'tfidf_vectorization' in 0.1674 mins


Running KMEANS Clustering with k=100

Finished running 'run_clustering' in 0.0879 mins


Finished running 'get_cluster_sizes' in 0.0002 mins


Number of Cluster Pairs : 4950

Finished running 'get_cluster_pairs' in 0.0000 mins

4950


In [8]:
doc_2_cluster_map = cluster2doc(num_texts=sampled_df.shape[0],cluster_labels=cluster_clf.labels_)


Finished running 'cluster2doc' in 0.0003 mins



In [9]:
doc_2_cluster_map.keys()

dict_keys([70, 66, 99, 32, 12, 8, 41, 86, 87, 15, 23, 44, 68, 9, 19, 92, 93, 61, 13, 51, 0, 62, 48, 75, 91, 45, 7, 97, 5, 53, 3, 59, 16, 90, 55, 88, 38, 17, 49, 60, 35, 84, 27, 24, 31, 25, 1, 78, 46, 2, 81, 74, 26, 69, 83, 54, 33, 22, 21, 43, 34, 28, 57, 18, 67, 6, 10, 98, 56, 71, 42, 39, 94, 58, 96, 4, 76, 95, 80, 50, 40, 36, 89, 47, 29, 65, 85, 30, 52, 37, 20, 14, 11, 73, 82, 64, 77, 79, 72, 63])

In [10]:
filtered_cluster_pairs = filter_clusters(cluster_pairs=cluster_pairs,
                                        doc_2_cluster_map=doc_2_cluster_map,
                                        cluster_sizes=cluster_sizes,
                                        partisan_scores=sampled_df["binary_ps"].tolist(),
                                        min_size=200,
                                        max_size=10000,
                                        min_partisan_size=0.4)

print("Filtered CLustered Pairs : %s" %str(len(filtered_cluster_pairs)))


Finished running 'filter_clusters' in 0.0146 mins

Filtered CLustered Pairs : 666


In [11]:
filtered_cluster_pairs[:10]

[(0, 2),
 (0, 3),
 (0, 6),
 (0, 13),
 (0, 16),
 (0, 17),
 (0, 18),
 (0, 20),
 (0, 21),
 (0, 25)]

In [12]:
filtered_cluster_pairs[-10:]

[(84, 86),
 (84, 87),
 (84, 93),
 (84, 96),
 (86, 87),
 (86, 93),
 (86, 96),
 (87, 93),
 (87, 96),
 (93, 96)]

In [13]:
clusters = []
for cp in filtered_cluster_pairs:
    clusters.append(cp[0])
    clusters.append(cp[1])

print(set(clusters))
print(len(set(clusters)))

{0, 2, 3, 6, 13, 16, 17, 18, 20, 21, 25, 27, 28, 29, 33, 36, 43, 44, 45, 47, 53, 55, 57, 59, 60, 62, 67, 71, 75, 78, 81, 83, 84, 86, 87, 93, 96}
37


In [14]:
get_overlapping_cluster_terms(cluster_pairs=filtered_cluster_pairs,
                              cluster_2_doc_map=doc_2_cluster_map,
                              sampled_df=sampled_df,
                              overlap_coef=3)

Top Overlapping Conservative Only Terms : 

['explained', 'due', 'currently', 'push', 'turned', 'stand', 'stated', 'story', 'street', 'organization', 'criticized', 'protect', 'true', 'brought', 'pro', 'friends', 'criticism', 'primary', 'someone', 'host', 'fox', 'cnn', 'amid', 'claimed', 'claiming', 'liberal', 'video', 'mike', 'michael', 'district', 'hill', 'simply', 'live', 'shows', 'showed', 'thousands', 'throughout', 'declared', 'responded', 'reportedly', 'heard', 'pelosi', 'revealed', 'needs', 'release', 'certain', 'related', 'wanted', 'sanders', 'failed', 'care', 'wall', '2017', '2019']
Top Overlapping Liberal Only Terms : 

['events', 'quickly', 'publicly', 'outside', 'multiple', 'attention', 'step', 'staff', 'critics', 'critical', 'tried', 'growing', 'presidency', 'crisis', 'try', 'provide', 'opportunity', 'spent', 'north', 'gop', 'community', 'worked', 'increasingly', 'university', 'announcement', 'allies', 'always', 'although', 'final', 'significant', 'civil', 'sign', 'kind', '

In [15]:
print(filtered_cluster_pairs[:5])

[(0, 2), (0, 3), (0, 6), (0, 13), (0, 16)]


## Cluster 0 : Documents and Top occuring Terms

In [16]:
get_docs_by_stance(cluster=0,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

If You're Trying To Ban Guns, The Least You Can Do Is Learn The Basics
Can anyone imagine a major newspaper running an op-ed justifying public ignorance on public policy? Actually, not merely justifying the ignorance, but rather arguing that facts only help smother discourse rather than enhance it. It’s improbable. Then again, this is the gun debate. And one side benefits from policy illiteracy.          The Washington Post ran an op-ed by former Gawker writer Adam Weinstein arguing that Second Amendment advocates use “jargon” to bully gun-control supporters. “While debating the merits of various gun control proposals,” he contends, “Second Amendment enthusiasts often diminish, or outright dismiss their views if they use imprecise firearms terminology.”          How dare Second Amendment advocates expect that those passionately arguing to limit their constitutional rights have some rudimentary knowledge of the devices they want to ban? To point out 

## Cluster 2 : Documents and Top occuring Terms

In [17]:
get_docs_by_stance(cluster=2,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

George H.W. Bush, Barack Obama shared realist view of foreign policy

Expert: Visas for China 'Should Be Last Priority' During Coronavirus Crisis
Issuing EB-5 investor visas, which primarily benefit Chinese nationals, “should be the last priority” of the United States federal government in the midst of the Chinese coronavirus crisis, Center for Immigration Studies Director of Policy Jessica Vaughan says. Last week, President Donald Trump signed an executive order slowing green card processing for some foreign nationals trying to enter the U.S. The order authorized the continuation of the EB-5 visa program, which gives out about 10,000 green cards a year on the premise that foreign nationals invest in the U.S. In 2018, nearly half of all EB-5 visas were awarded to Chinese nationals, according to State Department data. In 2017, 75 percent of the visas went to Chinese nationals, and in 2016, 76 percent of the visas went to Chinese nationals. From 2011 

## Cluster 3 : Documents and Top occuring Terms

In [18]:
get_docs_by_stance(cluster=3,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Poll: Minnesota Now in Play as Biden Lead Slips to Three
A poll released by Emerson College on Tuesday shows presumptive Democratic presidential nominee Joe Biden’s lead over President Donald Trump has slipped to three points in the key battleground state of Minnesota.          The Emerson College poll of 733 likely registered voters in Minnesota was conducted between August 8 and 10, and has a margin of error of 3.6 percent, which means the contest between Biden and Trump in the state is currently a statistical tie.          The poll results continue the trend of momentum at both the national and battleground state level in the direction of President Trump, who has significantly narrowed the gap against Biden over the past three weeks.          President Trump narrowly lost Minnesota and its ten electoral college votes in the 2016 presidential election, garnering just 44,000 fewer votes than Democratic nominee Hillary Clinton.          Minnesota is

## Cluster 6 : Documents and Top occuring Terms

In [19]:
get_docs_by_stance(cluster=6,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Pete Buttigieg: I would not have wanted my son on Ukraine board
FORT MADISON, Iowa (AP) — Pete Buttigieg says he “would not have wanted to see” his son serving on the board of a Ukrainian natural gas company while he was leading anti-corruption efforts in the country, an implicit criticism of the controversy that has ensnared his 2020 Democratic presidential rival Joe Biden.          Hunter Biden’s position on the board of the company Burisma has been a rallying point for Republicans as they try to defend President Donald Trump against impeachment charges over Trump asking Ukraine’s new president to investigate the former vice president and his son while also withholding crucial U.S. military aid.          Buttigieg, the childless mayor of South Bend, Indiana, said in an Associated Press interview Monday that his administration would “do everything we can to prevent even the appearance of a conflict. That’s very important because as we see it can cr

## Cluster 13 : Documents and Top occuring Terms

In [20]:
get_docs_by_stance(cluster=13,
                   cluster_2_doc_map=doc_2_cluster_map,
                   sampled_df=sampled_df)

Conservative Documents - Random 5 :

Kamala Harris criminal justice plan calls for government loans to marijuana growers
Tucked away inside 2020 Democrat Kamala Harris' criminal justice plan is a proposal to give government subsidies to "socially and economically disadvantaged individuals" who wish to start their own marijuana business.          The California senator's proposal, released Monday, describes how reforming the nation's marijuana laws can help end "mass incarceration," and racial disparities in the criminal justice system. One way to deal with that, Harris proposes, is to give federal dollars to states and localities so they can give loans for qualifying Americans interested in hopping onto the country's partially-legalized agricultural sector.          "[Provide] states and localities with funds to make loans to assist small businesses in the marijuana industry that are owned and controlled by socially and economically disadvantaged individuals," Harris' plan reads.      

In [21]:
# Train cllassifiers per cluster and find the top words per stance
# Check overlap and difference to identify which are topic specific and which are general

def run_keyword_identifier_clf_per_cluster(cluster_pairs,vectors,doc_2_cluster_map,sample_df,vocab,top_feats):
    """
    At the moment we have classifier for each topic and learns to distinguish between stances 
    """
    all_cluster1, all_cluster2 = zip(*cluster_pairs)
    
    all_clusters = list(set(all_cluster1 + all_cluster2))
    
    top_feats_map = defaultdict(lambda : defaultdict(list))
    
    print("Number of Clusters after Filtering : %s\n" %str(len(all_clusters)))
    
    for cluster in all_clusters:
        
        print("\n*********************** Finding top feats for cluster : %s ***********************"%str(cluster))
        
        X = vectors[doc_2_cluster_map[cluster],:]
        
        scaler = StandardScaler()
        
        X = scaler.fit_transform(X.todense())
        
        y = [sample_df["binary_ps"].iloc[d] for d in doc_2_cluster_map[cluster]]
        
        print("\nLabel Dist : \n%s"%str(Counter(y).most_common()))
        
        clf = LogisticRegressionCV(Cs=[1,10,100,1000],cv=5,random_state=RANDOM_SEED,max_iter=1000,n_jobs=-1,class_weight="balanced", scoring="f1_macro")
        
        clf.fit(X, y)
        
        print("\nCV Folds Params : \n%s"%str(np.max(clf.scores_[1])))
        
        print("\nBest C : \n%s"%str(clf.C_))
        
        coeffs = clf.coef_[0] #(1,n_features)
        
        feature_imp = [(x,y) for x,y in zip(vocab,coeffs)]
        
        feature_imp = sorted(feature_imp,key=lambda x: x[1],reverse=True)
        
        top_feats_map[cluster]["+ve"] = feature_imp[:top_feats]
        top_feats_map[cluster]["-ve"] = feature_imp[-top_feats:]
        
    
    return top_feats_map
        
    
def get_overlap_keywords_by_stance(top_feats_map,threshold = 2):
    """
    Here threshold signifies how many classifiers trained on clusters pick the given word as a top feature
    
    What happens when a word is conservative on one topic and liberal in the other
    """
    neg_words = Counter()
    pos_words = Counter()
    
    for cluster in top_feats_map:
        pos_words.update([w_tup[0] for w_tup in top_feats_map[cluster]["+ve"]])
        neg_words.update([w_tup[0] for w_tup in top_feats_map[cluster]["-ve"]])
    
    overlapping_pos_words = [p for p in pos_words.most_common() if p[1]>=threshold]
    overlapping_neg_words = [n for n in neg_words.most_common() if n[1]>=threshold]
    
    cross_stance_words = list(set([w[0] for w in overlapping_pos_words]).intersection([w[0] for w in overlapping_neg_words]))
    
    overlapping_pos_words = [w for w in overlapping_pos_words if w[0] not in cross_stance_words]
    overlapping_neg_words = [w for w in overlapping_neg_words if w[0] not in cross_stance_words]
    
    
    pos_df = pd.DataFrame(overlapping_pos_words,columns=["Word","clf_overlap"])
    neg_df = pd.DataFrame(overlapping_neg_words,columns=["Word","clf_overlap"])
    cross_df = pd.DataFrame(cross_stance_words,columns=["Word"])
    
    return pos_df, neg_df, cross_df

def get_topic_specific_keywords(top_feats_map,pos_df,neg_df,cross_df):
    """
    """
    pos_overlap_words = pos_df["Word"].tolist()
    neg_overlap_words = neg_df["Word"].tolist()
    cross_stance_words = cross_df["Word"].tolist()
    
    pos_writer = pd.ExcelWriter('topic_specific_conservative_keywords_relevant_dataset.xlsx')
    neg_writer = pd.ExcelWriter('topic_specific_liberal_keywords_relevant_dataset.xlsx')
    
    for cluster in top_feats_map:
        pdf = pd.DataFrame([w for w in top_feats_map[cluster]["+ve"] if w[0] not in pos_overlap_words and w[0] not in cross_stance_words], columns=["Word","Coeff_"])
        ndf = pd.DataFrame([w for w in top_feats_map[cluster]["-ve"] if w[0] not in neg_overlap_words and w[0] not in cross_stance_words], columns=["Word","Coeff_"])
        
        pdf.to_excel(pos_writer, index=False, sheet_name='%s'%str(cluster))
        ndf.to_excel(neg_writer, index=False, sheet_name='%s'%str(cluster))
    
    pos_writer.save()
    neg_writer.save()

In [22]:
len(vocab)

15008

In [31]:
top_feats_map = run_keyword_identifier_clf_per_cluster(cluster_pairs=filtered_cluster_pairs,
                                           vectors=vectors,
                                           doc_2_cluster_map=doc_2_cluster_map,
                                           sample_df=sampled_df,
                                           vocab=vocab,
                                           top_feats=200)

Number of Clusters after Filtering : 37


*********************** Finding top feats for cluster : 0 ***********************

Label Dist : 
[(0, 758), (1, 665)]

CV Folds Params : 
0.7539691476185114

Best C : 
[1]

*********************** Finding top feats for cluster : 2 ***********************

Label Dist : 
[(1, 534), (0, 466)]

CV Folds Params : 
0.7314484051582175

Best C : 
[100]

*********************** Finding top feats for cluster : 3 ***********************

Label Dist : 
[(0, 583), (1, 504)]

CV Folds Params : 
0.8253205128205128

Best C : 
[100]

*********************** Finding top feats for cluster : 6 ***********************

Label Dist : 
[(0, 480), (1, 382)]

CV Folds Params : 
0.7746828461114175

Best C : 
[100]

*********************** Finding top feats for cluster : 13 ***********************

Label Dist : 
[(1, 481), (0, 441)]

CV Folds Params : 
0.7544320226541991

Best C : 
[1]

*********************** Finding top feats for cluster : 16 ***********************

La

In [32]:
pos_df, neg_df, cross_df = get_overlap_keywords_by_stance(top_feats_map,threshold = 5)

In [33]:
pos_df

Unnamed: 0,Word,clf_overlap
0,emily,30
1,investigative,28
2,specific,23
3,division,21
4,injection,20
...,...,...
127,holt,5
128,enroll,5
129,denmark,5
130,possess,5


In [34]:
neg_df

Unnamed: 0,Word,clf_overlap
0,desegregation,29
1,hop,29
2,labor,26
3,recalls,20
4,incest,14
...,...,...
124,postal,5
125,kevin,5
126,whiteness,5
127,organs,5


In [35]:
cross_df

Unnamed: 0,Word
0,effects


In [28]:
pos_df.to_csv("Conservative_Overlapping_keywords_using_relevant_dataset.csv",index=False)

In [29]:
neg_df.to_csv("Liberal_Overlapping_keywords_using_relevant_dataset.csv",index=False)

In [30]:
get_topic_specific_keywords(top_feats_map,pos_df,neg_df,cross_df)