# Comparing Topic Models with Parties Together and Apart

There are two potential approaches for modeling Democratic and Republican language in the Congressional Record and comparing the frames employed by the two parties on specific issues. The first approach is to model all speeches by the two parties together, and then separate the corpus by party after labelling each speech with its most likely topic. The top terms for each topic can then be compared between the two parties.

The alternative approach is to model the two parties as separate corpora, each with their own topics and term distributions.

In this notebook, the terms returned by these two methods are compared. If the two methods return similar results across three congresses (the 112th, 113th, and 114th), I will take the less laborious method (i.e., modeling the parties together) as the standard for this dissertation research.

The number of topics for each model was selected from the evaluations performed in `evaluations.ipynb`.

In [73]:
import pandas as pd
import numpy as np
from operator import itemgetter
import boto3
from gensim.models import Nmf,TfidfModel,CoherenceModel
from gensim.corpora import Dictionary
from nltk import Counter

# Execute the shared helper script inside this notebook's namespace (IPython %run).
# NOTE(review): `procedural_stop_words` is not defined in this notebook, so it is
# presumably created by that script — confirm.
%run ../model_developer.py
# S3 client used below to fetch the speech CSV.
client= boto3.client('s3')

# Treat these four tokens as stop words as well in every tokenisation below.
# NOTE: extend() mutates the list in place, so re-running this cell appends duplicates.
procedural_stop_words.extend(['be','have','do','go'])

loaded model_developer.py


## 112th congress

### Together

In [2]:
# Stream the 112th-Congress speech CSV from S3 and keep only the rows whose
# chamber_x is 'H' (presumably the House — confirm against the data).
_112th = pd.read_csv(
    client.get_object(
        Bucket='ascsagemaker',
        Key='JMP_congressional_nmf/House_bigrams/112.csv',
    )['Body']
).loc[lambda frame: frame['chamber_x'] == 'H']

Generate the corpus for fitting.

In [5]:
# Tokenise every speech on whitespace and drop procedural stop words.
# A set gives O(1) membership checks instead of scanning the list for each token.
stop_word_set = set(procedural_stop_words)
speeches = [
    [word for word in speech.split() if word not in stop_word_set]
    for speech in _112th.speech_processed
]

# `Dictionary` is already imported by name in the import cell; the previous
# `gensim.corpora.Dictionary` relied on a bare `gensim` name that cell never binds.
dictionary = Dictionary(speeches)
# Vocabulary pruning thresholds: 0.1% of the speeches (lower bound) and 35% (upper bound).
dictionary.filter_extremes(no_below=0.001*len(speeches), no_above=.35)
corpus = [dictionary.doc2bow(text) for text in speeches]

# Keep the TF-IDF transformer under its own name so `model` only ever refers
# to the NMF topic model that later cells use.
tfidf_model = TfidfModel(corpus)
tfidf = [tfidf_model[doc] for doc in corpus]

# NOTE: the seed is drawn at random, so topic order differs between runs, while the
# hand-written topic labels further down are tied to topic indices. The seed is kept
# in a variable so that a given run can be inspected and reproduced.
together_seed = np.random.randint(1, 1000)
model = Nmf(corpus=tfidf,
            id2word=dictionary,
            num_topics=30,
            random_state=together_seed,
            normalize=True,
            passes=20)

# c_v coherence of the fitted topics against the tokenised speeches.
coh_model = CoherenceModel(model=model,
                           texts=speeches,
                           dictionary=dictionary,
                           coherence='c_v',
                           processes=1)

print(coh_model.get_coherence())

0.6071394931611472


In [6]:
# Print the term weights of each of the 30 topics, with a blank line after each one.
print(
    '\n\n'.join(model.print_topic(topic_id) for topic_id in range(30)),
    end='\n\n',
)

0.073*"budget" + 0.023*"senior" + 0.019*"republican" + 0.017*"medicare" + 0.012*"plan" + 0.011*"end_medicare" + 0.010*"social_security" + 0.009*"pay" + 0.009*"tax_break" + 0.008*"deficit"

0.044*"epa" + 0.015*"clean_air" + 0.012*"standard" + 0.011*"health" + 0.010*"mercury" + 0.009*"pollution" + 0.009*"water" + 0.009*"emission" + 0.007*"state" + 0.007*"clean_water"

0.032*"energy" + 0.031*"oil" + 0.012*"pipeline" + 0.011*"price" + 0.010*"drill" + 0.010*"natural_gas" + 0.008*"gas_price" + 0.008*"keystone_pipeline" + 0.008*"production" + 0.006*"drilling"

0.025*"insurance" + 0.024*"pay" + 0.022*"tax" + 0.022*"health_insurance" + 0.016*"health_care" + 0.013*"coverage" + 0.013*"percent" + 0.012*"obamacare" + 0.011*"taxis" + 0.011*"premium"

0.044*"veteran" + 0.012*"serve" + 0.011*"military" + 0.010*"service" + 0.009*"honor" + 0.006*"va" + 0.006*"sacrifice" + 0.006*"man_woman" + 0.005*"return" + 0.005*"army"

0.046*"resolution" + 0.038*"war" + 0.028*"libya" + 0.015*"president" + 0.013*"powe

In [9]:
# Hand-assigned label for each of the 30 topics of the joint model; the position
# in this list is the topic id (comments give the id of the first label per row).
Together_Topics = [
    'Budget', 'Environment', 'Energy', 'Health_care', 'Veterans',                # 0
    'Defense_conflict', 'International_Affairs', 'Macroeconomics', 'Abortion',   # 5
    'Domestic_commerce', 'Labor', 'Law_and_Crime', 'Health_care', 'Budget',      # 9
    'Budget_deficit', 'Health_care', 'Budget', 'Health_care', 'Social_welfare',  # 14
    'Procedural', 'Higher_education', 'Procedural', 'Small_business',            # 19
    'Domestic_commerce_regulation', 'Research_technology', 'Tribute', 'NA',      # 23
    'Labor', 'Defense_spending', 'Defense_intelligence',                         # 27
]

In [65]:
# Topic weights of the fitted NMF model for the TF-IDF speech vectors.
doc_topics = model.get_document_topics(
    tfidf,
    minimum_probability=None,
)

In [66]:
# Label each speech with the hand-assigned name of its highest-weighted topic.
# Every element of doc_topics is indexed as a sequence of (topic_id, weight) pairs.
_112th['Topic'] = [
    Together_Topics[max(topic_weights, key=itemgetter(1))[0]]
    for topic_weights in doc_topics
]

In [67]:
# Display the topic label assigned to each speech.
_112th['Topic']

5               Macroeconomics
6        International_Affairs
7         Defense_intelligence
8                       Budget
9                           NA
                 ...          
45658               Procedural
45659                       NA
45660                   Budget
45661         Defense_conflict
45662     Defense_intelligence
Name: Topic, Length: 29783, dtype: object

In [93]:
def top_terms(x, n=20, stop_words=None):
    """Return the ``n`` most frequent non-stop-word terms across speeches.

    Parameters
    ----------
    x : iterable of str
        Speeches; each one is tokenised with ``str.split()``.
    n : int, default 20
        Maximum number of terms to return.
    stop_words : collection of str, optional
        Tokens to ignore. When omitted, the notebook-level
        ``procedural_stop_words`` is used.

    Returns
    -------
    list of str
        Terms ordered from most to least frequent; ties keep first-seen order.
    """
    if stop_words is None:
        stop_words = procedural_stop_words
    # Set membership is O(1); the stop list would otherwise be scanned per token.
    excluded = set(stop_words)
    term_counts = Counter(
        word
        for speech in x
        for word in speech.split()
        if word not in excluded
    )
    return [term for term, _ in term_counts.most_common(n)]

In [362]:
# Collect the most frequent terms for every (Topic, party) pair, then reshape so
# that each topic is one row with one column per party.
Together_Terms = (
    _112th
    .groupby(['Topic', 'party'])
    .speech_processed
    .apply(top_terms)
    .reset_index()
    .pivot(index='Topic', columns=['party'])
)

In [363]:
# Display the per-topic top terms, one column per party.
Together_Terms

Unnamed: 0_level_0,speech_processed,speech_processed
party,D,R
Topic,Unnamed: 1_level_2,Unnamed: 2_level_2
Abortion,"[woman, abortion, service, health_care, plan_p...","[abortion, plan_parenthood, woman, life, say, ..."
Budget,"[budget, republican, american, people, say, cu...","[budget, spending, people, american, say, cong..."
Budget_deficit,"[cut, people, job, make, spending, say, republ...","[cut, spending, budget, people, make, say, ame..."
Defense_conflict,"[war, resolution, congress, libya, president, ...","[war, resolution, president, libya, congress, ..."
Defense_intelligence,"[time, state, make, support, act, work, americ...","[time, state, act, support, make, work, say, c..."
Defense_spending,"[afghanistan, time, security, support, need, p...","[time, fund, funding, support, security, need,..."
Domestic_commerce,"[program, help, people, homeowner, home, time,...","[program, time, say, need, people, fund, work,..."
Domestic_commerce_regulation,"[regulation, american, job, time, congress, pe...","[regulation, job, cost, business, new, say, am..."
Energy,"[oil, energy, american, say, time, year, job, ...","[energy, oil, american, job, say, country, get..."
Environment,"[epa, act, health, state, clean_air, time, sta...","[epa, job, regulation, state, say, act, time, ..."


In [318]:
def keyness(corpusA, corpusB):
    """Rank terms by the log odds ratio of their use in corpus A versus corpus B.

    Parameters
    ----------
    corpusA, corpusB : mapping of str to int
        Term frequencies (e.g. ``Counter`` objects) for the two corpora.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``log_odds``, sorted by ``log_odds`` descending.
        Positive values mark terms relatively more frequent in corpus A,
        negative values terms relatively more frequent in corpus B.
    """
    rows_a = pd.DataFrame(list(corpusA.items()), columns=['term', 'freqA'])
    rows_b = pd.DataFrame(list(corpusB.items()), columns=['term', 'freqB'])

    # Outer join keeps terms seen in only one corpus; their missing count is 0.
    # Only the frequency columns are filled so the term column is never touched.
    rows = rows_a.merge(rows_b, on='term', how='outer').fillna({'freqA': 0, 'freqB': 0})

    # Corpus sizes must come from the arguments, not from names `D` / `R` that
    # happen to exist (or not) in the notebook's global namespace.
    rows['tot_1'] = sum(corpusA.values())
    rows['tot_2'] = sum(corpusB.values())

    rows['prob_A'] = rows['freqA'] / rows['tot_1']
    rows['odds_A'] = rows['prob_A'] / (1 - rows['prob_A'])

    rows['prob_B'] = rows['freqB'] / rows['tot_2']
    rows['odds_B'] = rows['prob_B'] / (1 - rows['prob_B'])

    # Terms absent from corpus A give log(0); that is expected and handled on the
    # next lines, so the divide-by-zero RuntimeWarning is silenced here.
    with np.errstate(divide='ignore'):
        rows['log_odds'] = np.log(rows['odds_A'] / rows['odds_B'])

    # Terms exclusive to one corpus have an infinite log odds ratio; they are
    # clamped to +/-1. NOTE(review): this places them below shared terms whose
    # |log_odds| exceeds 1 — confirm that this ranking is intended.
    rows.loc[rows.log_odds == -np.inf, 'log_odds'] = -1
    rows.loc[rows.log_odds == np.inf, 'log_odds'] = 1
    return rows.sort_values(by='log_odds', ascending=False)[['term', 'log_odds']]

In [359]:
def _party_term_counts(frame, party, topic, stop_words):
    """Count the non-stop-word tokens in one party's speeches on one topic."""
    selected = frame.loc[
        (frame.party == party) & (frame.Topic == topic), 'speech_processed'
    ].values
    return Counter(
        word
        for speech in selected
        for word in speech.split()
        if word not in stop_words
    )


def get_keyness(topic, data=None, n_terms=10):
    """Tabulate the most distinctive Democratic and Republican terms for a topic.

    Parameters
    ----------
    topic : str
        Value of the ``Topic`` column to select.
    data : pandas.DataFrame, optional
        Frame with ``party``, ``Topic`` and ``speech_processed`` columns.
        Defaults to the notebook-level ``_112th`` frame.
    n_terms : int, default 10
        Number of terms to report for each party.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dem_Term``/``Dem_val`` (highest log odds first) and
        ``Rep_Term``/``Rep_val`` (lowest log odds first).
    """
    if data is None:
        data = _112th
    # Set membership is O(1); the stop list would otherwise be scanned per token.
    stop_words = set(procedural_stop_words)

    dem_counts = _party_term_counts(data, "D", topic, stop_words)
    rep_counts = _party_term_counts(data, "R", topic, stop_words)

    key = keyness(dem_counts, rep_counts)
    key = key.sort_values(by='log_odds', ascending=False)

    # Work on a copy: renaming and adding columns on the frame returned by
    # head() is what raised pandas' SettingWithCopyWarning.
    Terms = key.head(n_terms).copy()
    Terms.columns = ['Dem_Term', 'Dem_val']

    # The tail holds the most Republican terms; order them most negative first.
    rep_terms = key.tail(n_terms).sort_values(by='log_odds')

    # .values drops the index so the Republican terms align row by row.
    Terms['Rep_Term'] = rep_terms['term'].values
    Terms['Rep_val'] = rep_terms['log_odds'].values
    return Terms

In [360]:
# Most distinctive Democratic vs. Republican terms within the 'Abortion' topic.
get_keyness('Abortion')

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Dem_Term,Dem_val,Rep_Term,Rep_val
345,reproductive_health,3.686374,innocent,-3.747331
1185,small_business,3.435056,heart,-3.229379
71,screening,3.098589,poll,-3.193011
321,proponent,2.741901,abortion_provider,-3.17433
447,limited,2.667793,obamacare,-3.095857
2607,viability,2.667793,month,-2.941694
261,propose,2.58775,god,-2.892903
1969,preference,2.58775,little,-2.841618
668,breast_cancer,2.454222,sexselection,-2.841618
469,antichoice,2.405432,unborn_child,-2.787572


## Splitting Dem Rep

### Dem

In [None]:
# Separate the speeches into one frame per party ('D' and 'R').
_112th_D, _112th_R = (
    _112th.loc[_112th.party == party_code] for party_code in ('D', 'R')
)

In [355]:
# Tokenise every Democratic speech on whitespace and drop procedural stop words.
# A set gives O(1) membership checks instead of scanning the list for each token.
stop_word_set = set(procedural_stop_words)
D_speeches = [
    [word for word in speech.split() if word not in stop_word_set]
    for speech in _112th_D.speech_processed
]

# `Dictionary` is already imported by name in the import cell; the previous
# `gensim.corpora.Dictionary` relied on a bare `gensim` name that cell never binds.
D_dictionary = Dictionary(D_speeches)
# Vocabulary pruning thresholds: 0.1% of the speeches (lower bound) and 35% (upper bound).
D_dictionary.filter_extremes(no_below=0.001*len(D_speeches), no_above=.35)
D_corpus = [D_dictionary.doc2bow(text) for text in D_speeches]

# Keep the TF-IDF transformer under its own name so `D_model` only ever refers
# to the NMF topic model.
D_tfidf_model = TfidfModel(D_corpus)
D_tfidf = [D_tfidf_model[doc] for doc in D_corpus]

# NOTE: the seed is drawn at random, so topic order differs between runs; it is
# kept in a variable so that a given run can be inspected and reproduced.
D_seed = np.random.randint(1, 1000)
D_model = Nmf(corpus=D_tfidf,
              id2word=D_dictionary,
              num_topics=30,
              random_state=D_seed,
              normalize=True,
              passes=20)

# c_v coherence of the fitted topics against the tokenised speeches.
coh_model = CoherenceModel(model=D_model,
                           texts=D_speeches,
                           dictionary=D_dictionary,
                           coherence='c_v',
                           processes=1)

print(coh_model.get_coherence())

0.5835307016416285


In [356]:
# Print the term weights of each of the 30 Democratic topics, with a blank line after each one.
print(
    '\n\n'.join(D_model.print_topic(topic_id) for topic_id in range(30)),
    end='\n\n',
)

0.092*"health_care" + 0.043*"reform" + 0.034*"repeal" + 0.011*"medicare" + 0.010*"affordable" + 0.010*"system" + 0.009*"insurance_company" + 0.007*"deficit" + 0.007*"obamacare" + 0.006*"patient"

0.029*"republican" + 0.021*"vote" + 0.013*"house" + 0.011*"let" + 0.009*"tax_cut" + 0.007*"middle_class" + 0.007*"friend" + 0.007*"dont" + 0.007*"bring" + 0.007*"get"

0.051*"worker" + 0.020*"union" + 0.017*"right" + 0.010*"labor" + 0.009*"trade" + 0.008*"employer" + 0.008*"wage" + 0.008*"colombia" + 0.008*"national_labor" + 0.008*"employee"

0.083*"veteran" + 0.013*"present" + 0.010*"va" + 0.010*"vote" + 0.009*"service" + 0.009*"return" + 0.008*"military" + 0.007*"serve" + 0.007*"veteran_affair" + 0.007*"miss"

0.024*"food" + 0.014*"dont" + 0.013*"get" + 0.010*"know" + 0.009*"that" + 0.008*"theyre" + 0.008*"youre" + 0.008*"cut" + 0.007*"lot" + 0.007*"come"

0.087*"woman" + 0.016*"domestic_violence" + 0.015*"victim" + 0.014*"violence_woman" + 0.011*"violence" + 0.011*"protection" + 0.010*"immi

In [364]:
# Tokenise every Republican speech on whitespace and drop procedural stop words.
# A set gives O(1) membership checks instead of scanning the list for each token.
stop_word_set = set(procedural_stop_words)
R_speeches = [
    [word for word in speech.split() if word not in stop_word_set]
    for speech in _112th_R.speech_processed
]

# `Dictionary` is already imported by name in the import cell; the previous
# `gensim.corpora.Dictionary` relied on a bare `gensim` name that cell never binds.
R_dictionary = Dictionary(R_speeches)
# Vocabulary pruning thresholds: 0.1% of the speeches (lower bound) and 35% (upper bound).
R_dictionary.filter_extremes(no_below=0.001*len(R_speeches), no_above=.35)
R_corpus = [R_dictionary.doc2bow(text) for text in R_speeches]

# Keep the TF-IDF transformer under its own name so `R_model` only ever refers
# to the NMF topic model.
R_tfidf_model = TfidfModel(R_corpus)
R_tfidf = [R_tfidf_model[doc] for doc in R_corpus]

# NOTE: the seed is drawn at random, so topic order differs between runs; it is
# kept in a variable so that a given run can be inspected and reproduced.
R_seed = np.random.randint(1, 1000)
R_model = Nmf(corpus=R_tfidf,
              id2word=R_dictionary,
              num_topics=30,
              random_state=R_seed,
              normalize=True,
              passes=20)

# c_v coherence of the fitted topics against the tokenised speeches.
coh_model = CoherenceModel(model=R_model,
                           texts=R_speeches,
                           dictionary=R_dictionary,
                           coherence='c_v',
                           processes=1)

print(coh_model.get_coherence())

0.5326478397390679


In [365]:
# Print the term weights of each of the 30 Republican topics, with a blank line after each one.
print(
    '\n\n'.join(R_model.print_topic(topic_id) for topic_id in range(30)),
    end='\n\n',
)

0.061*"health_care" + 0.021*"repeal" + 0.018*"law" + 0.018*"obamacare" + 0.015*"cost" + 0.012*"patient" + 0.011*"reform" + 0.010*"care" + 0.010*"doctor" + 0.010*"insurance"

0.050*"balanced_budget" + 0.028*"debt" + 0.018*"budget" + 0.014*"balance_budget" + 0.012*"balance" + 0.009*"constitution" + 0.009*"deficit" + 0.009*"live_mean" + 0.008*"national_debt" + 0.008*"congress"

0.056*"cut" + 0.021*"funding" + 0.015*"percent" + 0.014*"fiscal_year" + 0.013*"defense" + 0.013*"account" + 0.013*"level" + 0.012*"budget" + 0.011*"reduce" + 0.010*"reduction"

0.036*"border" + 0.012*"mexico" + 0.008*"arizona" + 0.008*"border_patrol" + 0.007*"security" + 0.007*"yucca_mountain" + 0.006*"law_enforcement" + 0.006*"agent" + 0.006*"new_york" + 0.006*"texas"

0.036*"job" + 0.015*"create_job" + 0.011*"business" + 0.011*"agreement" + 0.011*"create" + 0.010*"economy" + 0.010*"export" + 0.008*"america" + 0.008*"job_creator" + 0.007*"company"

0.083*"small_business" + 0.020*"owner" + 0.018*"business" + 0.009*