In [3]:
import pandas as pd
import numpy as np

import os
import boto3
import joblib

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import Greene_dnmf as dnmf

from multiprocess import Pool

In [6]:
client = boto3.client('s3')

# procedural stop words
with open('procedural_stop_words.txt','r') as File:
    procedural_stop_words = File.read().split(',')

In [15]:
def term_rankings(H,terms,ntop):
    term_rankings = []
    for topic_index in range(H.shape[0]):
        top_indices = np.argsort(H[topic_index,:])[::-1]
        term_ranking = [terms[i] for i in top_indices[:ntop]]
        term_rankings.append(term_ranking)
    return term_rankings

def run_window_NMF(info):
    party,congress = info

    k = 80 # 80 window topics
    
    DF = pd.read_csv(client.get_object(Bucket='ascsagemaker',
                                       Key=f'JMP_congressional_nmf/House_bigrams/{congress:0>3}.csv')['Body'])
    
    # subset dataframe House data and party
    DF = DF.loc[(DF.chamber_x == 'H') & (DF.party == party[0])]
    DF['date'] = pd.to_datetime(DF.date)
    
    years = pd.to_datetime(DF.date).dt.year.unique() # what years are included in this congress

    # congress 112 has one month of 2013 so don't include it
    if congress == 112:
        years = years[:2]
        
    # for every year process and run topic model
    models = []
    for year in years:
        # subset years data
        sub_df = DF.loc[DF.date.dt.year == year]
        
        # DTM
        vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.30,stop_words=procedural_stop_words,use_idf=True)
        dtm = vectorizer.fit_transform(sub_df.speech_processed)
        vocab = vectorizer.get_feature_names()

        # run model
        model = NMF(n_components=k,max_iter=5000,init='nndsvd')
        W = model.fit_transform(dtm)
        H = model.components_
        
        print(f'{party} - {year} - {len(sub_df)} speeches')
        models.append({"W":W,
                       "H":H,
                       "vocab":vocab,
                       "year":year,
                       "party":party,
                       "window_topic_ids":[f'{party}_{year}_{i}' for i in range(k)],
                       "terms":term_rankings(H,vocab,ntop=20)})
    return models

In [None]:
# make list of models to run
Info = [('Rep',i) for i in range(97,115)]
Info.extend([('Dem',i) for i in range(97,115)])

with Pool(10) as p:
    output = p.map(run_window_NMF,Info)
    
outputs = [model for sublist in output for model in sublist]

Rep - 1983 - 8116 speeches
Rep - 1981 - 6420 speeches
Rep - 1993 - 9969 speeches
Rep - 1989 - 6934 speeches
Rep - 1997 - 9744 speeches
Rep - 1995 - 16578 speeches
Rep - 1991 - 8871 speeches
Rep - 1987 - 9037 speeches
Rep - 1999 - 9606 speeches
Rep - 1985 - 10018 speeches
Rep - 1990 - 8782 speeches
Rep - 1994 - 8651 speeches
Rep - 1992 - 7688 speeches
Rep - 1982 - 7857 speeches
Rep - 1984 - 9366 speeches
Rep - 1996 - 10033 speeches
Rep - 1986 - 8056 speeches
Rep - 1988 - 7581 speeches
Rep - 2001 - 7075 speeches
Rep - 2005 - 8122 speeches
Rep - 2000 - 9299 speeches
Rep - 2003 - 7396 speeches
Rep - 2013 - 7100 speeches
Rep - 2009 - 7008 speeches
Rep - 2011 - 8722 speeches
Rep - 1998 - 9463 speeches
Rep - 2004 - 6491 speeches
Rep - 2007 - 10534 speeches
Rep - 2002 - 5843 speeches
Rep - 2006 - 7074 speeches
Rep - 2010 - 6608 speeches
Dem - 1981 - 9944 speeches
Rep - 2015 - 7781 speeches
Rep - 2012 - 6074 speeches
Dem - 1983 - 11703 speeches
Rep - 2008 - 6418 speeches
Rep - 2016 - 4952 speec

In [18]:
# compile window topic model collection
collection = dnmf.TopicCollection()
for model in outputs:
    collection.add_topic_model(model['H'],model['vocab'],model['window_topic_ids'])

# make DNTM Topic Term Matrix
Mat, full_vocab = collection.create_matrix()


## Testing varying numbers of dynamic topics

In [20]:
def run_DTM(k):
    second_level = NMF(n_components=k,max_iter=5000,init='nndsvda')
    W = second_level.fit_transform(Mat)
    H = second_level.components_
    terms = term_rankings(H,full_vocab,ntop=20)
    print(k)
    return {"K":k,"W":W,'H':H,'terms':terms}


with Pool(10) as p:
    dtm_models = p.map(run_DTM,range(50,210,10))

70
60
90
80
50
110
130
100
140
120
160
150
170
180
200
190


In [37]:
def jaccard_Binary(termsA,termsB):
    sx = set(termsA)
    sy = set(termsB)
    numerator = len(sx.intersection(sy))
    if numerator == 0:
        return 0
    denominator = len(sx.union(sy))
    if denominator == 0:
        return 0
    return float(numerator)/denominator


def test_topic_overlap(ts_smaller,ts_bigger):
    # get percent of smaller models topics still in larger (jaccard of 0.25)
    Topic_dict = {}
    for a,topicA in enumerate(ts_smaller):
        Topic_dict[a] = []
        for b,topicB in enumerate(ts_bigger):
            if jaccard_Binary(topicA,topicB) >= 0.25:
                Topic_dict[a].append(b)
                
    pct_remained = sum([1 for i in Topic_dict.values() if i])/len(ts_smaller)
    Larger_not_included = [i for i in range(len(ts_bigger)) if i not in set([item for sublist in Topic_dict.values() for item in sublist])]
    print(f'{np.round(pct_remained*100,2)}% of topics in smaller model are in larger model',end='\n')
    print(f'there were {len(Larger_not_included)} topics in the larger model not in the smaller one\n')
    
    filled_out_dict = {}
    for key in Topic_dict.keys():
        filled_out_dict[','.join(ts_smaller[key])] = []
        for val in Topic_dict[key]:
            filled_out_dict[','.join(ts_smaller[key])].append(','.join(ts_bigger[val]))
            
    return pct_remained, Larger_not_included, Topic_dict,filled_out_dict

### Dynamic Model with 130 topic selected

In [75]:
smaller = [model for model in dtm_models if model['K'] == 130][0]['terms']
larger = [model for model in dtm_models if model['K'] == 160][0]['terms']

_,not_in_smaller, td, filled = test_topic_overlap(smaller,larger)
for lg_t in not_in_smaller:
    print(larger[lg_t],end='\n\n')

100.0% of topics in smaller model are in larger model
there were 16 topics in the larger model not in the smaller one

['support', 'effort', 'rise_strong', 'rise_support', 'many', 'full', 'compromise', 'passage', 'strong', 'national', 'deserve', 'side_aisle', 'initiative', 'chairman_subcommittee', 'dairy', 'level', 'package', 'position', 'hope', 'strong_support']

['pipeline', 'keystone_pipeline', 'safety', 'natural_gas', 'canada', 'permit', 'keystone', 'coal', 'gas', 'tar_sand', 'approve', 'spill', 'construction', 'build', 'approval', 'safe', 'transport', 'refinery', 'decision', 'railroad']

['site', 'waste', 'superfund', 'nuclear_waste', 'cleanup', 'facility', 'yucca_mountain', 'environmental', 'nuclear', 'clean', 'hazardous_waste', 'disposal', 'repository', 'plant', 'doe', 'store', 'national_park', 'park', 'list', 'nuclear_power']

['regulation', 'regulatory', 'safety', 'osha', 'regulate', 'require', 'regulator', 'review', 'impose', 'consumer', 'impact', 'requirement', 'unnecessary'

## label topics

In [104]:
DTM_labels = pd.read_csv('DTM_codes_130.csv')
DTM_labels.head()

Unnamed: 0,d_ix,sub_topic,major_topic
0,0,history,international affairs
1,1,national budget,macroeconomics
2,2,,procedural
3,3,,procedural
4,4,family issues,law crime


In [166]:
DTM_labels['topic_label'] = DTM_labels.major_topic + '_' + DTM_labels.sub_topic.astype(str)

## Mapping Dynamic Topics to Window Topics

In [168]:
dtm = [model for model in dtm_models if model['K'] == 130][0]
dtm_assigned = dtm['W'].argmax(1)

mapped = []
for window_model in outputs:
    for topic_id in window_model['window_topic_ids']:
        id_index = int(topic_id.split('_')[-1])
        row = {"year":window_model['year'],
               'party':window_model['party'],
               'window_id':id_index,
               'terms':window_model['terms'][id_index]}
        mapped.append(row)
        
mapped_df = pd.DataFrame(mapped)
mapped_df['dynamic_id'] = dtm_assigned
mapped_df = mapped_df.merge(DTM_labels[['d_ix','topic_label']],left_on='dynamic_id',right_on='d_ix',how='left')
mapped_df = mapped_df.drop('d_ix',1)

In [169]:
mapped_df

Unnamed: 0,year,party,window_id,terms,dynamic_id,topic_label
0,1981,Rep,0,"[president, president_reagan, veto, economic_r...",8,government operations_executive
1,1981,Rep,1,"[spending, american_people, government, reduce...",44,macroeconomics_spending
2,1981,Rep,2,"[tax, taxis, conablehance, way_mean, income, t...",13,macroeconomics_taxation
3,1981,Rep,3,"[social_security, system, trust_fund, reform, ...",33,labor_employee benefits
4,1981,Rep,4,"[intelligence, information, protect, agent, di...",115,defense_intelligence
...,...,...,...,...,...,...
5755,2016,Dem,75,"[sec, investor, shareholder, disclosure, title...",70,?_?
5756,2016,Dem,76,"[community, county, help, many, life, region, ...",66,housing_community development
5757,2016,Dem,77,"[great_lake, lake, restoration, lake_erie, reg...",52,public land_water resources
5758,2016,Dem,78,"[restaurant, calorie, menu, information, label...",113,civil rights_privacy


## 528 window topics fell under ambiguous dynamic topics
These window topics were manually recoded

In [171]:
# mapped_df.loc[mapped_df.topic_label.isin(['nan','?_?','?_costs','housing_?'])].to_csv('checking_missed_topics.csv')
fix_windows = pd.read_csv('checking_missed_topics_fixed_130.csv')

In [172]:
fix_windows['long_label'] = fix_windows.major_topic + '_' + fix_windows.sub_topic
fixed_window_labels = fix_windows.loc[-fix_windows.long_label.isna(),['year','party','window_id','long_label']]

In [174]:
mapped_df = mapped_df.merge(fixed_window_labels,on=['year','party','window_id'],how='left')
mapped_df['topic_label'] = mapped_df.apply(lambda x: x.long_label if not type(x.long_label) == float else x.topic_label,1)

In [175]:
mapped_df.loc[mapped_df.topic_label == 'housing_?','topic_label'] = '?_?'

In [176]:
mapped_df.topic_label.unique()

array(['government operations_executive', 'macroeconomics_spending',
       'macroeconomics_taxation', 'labor_employee benefits',
       'defense_intelligence', 'procedural_nan',
       'domestic commerce_general', 'law crime_courts',
       'international affairs_human rights',
       'government operations_states', 'defense_procurement',
       'macroeconomics_national budget',
       'international affairs_middle east', '?_?',
       'transportation_air travel', 'social welfare_general',
       'agriculture_general', 'international affairs_foreign aid',
       'social welfare_veterans', nan, 'transportation_public works',
       '?_costs', 'defense_civil', 'domestic commerce_manufacturing',
       'technology_research', 'government operations_appropriations',
       'energy_alternatives', 'education_elementary secondary',
       'domestic commerce_banking', 'social welfare_elderly care',
       'public land_general', 'environment_general',
       'transportation_infrastructure', 'ci

In [185]:
ambiguous_topics = mapped_df.loc[mapped_df.topic_label.isin(['nan','?_?','?_costs','housing_?'])]
print(f"{len(ambiguous_topics)} topics are ambiguous, or {100*(len(ambiguous_topics)/len(mapped_df)):0.2f}% of window topics")

380 topics are ambiguous, or 6.60% of window topics


### Save Results

In [189]:
Final = {"window_models":outputs,
         "dynamic_model":{"k":130,'H':dtm['H'],'W':dtm['W'],'collection_mat':Mat,'collection_vocab':full_vocab},
         'mapper':mapped_df}

with open('Official_dissertation_model.pkl', 'wb') as File:
    joblib.dump(Final,File)