# NLP
Examine important phrases in addresses using Pointwise Mutual Information (PMI)

In [1]:
import pandas as pd
import nltk

# Data

In [2]:
# load the cleaned transcripts, using the first CSV column as the index
transcripts = pd.read_csv('data/cln_transcripts.csv', index_col=0)
# parse the Date column into datetimes so it can be compared against date bounds
transcripts['Date'] = pd.to_datetime(transcripts['Date'])
# preview the first rows
transcripts.head()

Unnamed: 0,Title,Date,President,Type,Transcript
0,"April 30, 1789: First Inaugural Address",1789-04-30,George Washington,inauguration,Fellow Citizens of the Senate and the House of...
1,"October 3, 1789: Thanksgiving Proclamation",1789-10-03,George Washington,other,Whereas it is the duty of all Nations to ackno...
2,"January 8, 1790: First Annual Message to Congress",1790-01-08,George Washington,state_union,Fellow Citizens of the Senate and House of Rep...
3,"December 8, 1790: Second Annual Message to Con...",1790-12-08,George Washington,state_union,Fellow citizens of the Senate and House of Rep...
4,"December 29, 1790: Talk to the Chiefs and Coun...",1790-12-29,George Washington,other,"I the President of the United States, by my ow..."


# Extract Phrases
Extract insightful phrases from addresses

## Filter Address Function
Filter addresses by speaker and date range (both bounds inclusive), then combine the matching transcripts into a single lowercased document

In [3]:
def filt_docs(address_df, pres, start_date, end_date):
    '''
    Combine one president's addresses within a date range into one document.

    address_df : DataFrame with 'President', 'Date' and 'Transcript' columns
    pres : value matched exactly against the 'President' column
    start_date : lower bound compared against 'Date' (inclusive)
    end_date : upper bound compared against 'Date' (inclusive)

    Returns the matching transcripts joined by single spaces and lowercased;
    an empty string when no address matches.
    '''
    # boolean mask of the addresses to keep
    mask = (
        (address_df['President'] == pres) &
        (address_df['Date'] >= start_date) &
        (address_df['Date'] <= end_date)
    )

    # select by mask rather than by position: the frame's index labels are
    # not guaranteed to equal row positions (e.g. after rows were dropped),
    # so feeding labels to .iloc could pick the wrong rows or raise
    doc = address_df.loc[mask, 'Transcript'].str.cat(sep=' ').lower()

    return doc

## Top Phrases Function
Return the top phrases (bigrams or trigrams) from a given document, ranked by PMI

In [16]:
def bigram_finder(document, m=1, n=3):
    '''
    Return the top two-word phrases (bigrams) in a document, ranked by PMI.

    document : text data
    m : number of top phrases to return
    n : minimum number of times a phrase must appear in the document

    Returns a list of at most m strings, each a bigram joined by a space.
    '''
    # tokenizer that keeps runs of word characters and drops punctuation
    # (apostrophes split tokens too, so "lee's" becomes 'lee', 's')
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # tokenize document
    words = tokenizer.tokenize(document)
    # association measures used to score bigram co-occurrences
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    # nltk co-occurrence finder built from the token stream
    finder = nltk.collocations.BigramCollocationFinder.from_words(words)
    # drop bigrams that appear fewer than n times
    finder.apply_freq_filter(n)
    # bigrams with the most importance (highest PMI)
    try:
        top_phrases = finder.nbest(bigram_measures.pmi, m)
    except Exception:
        # best-effort: a scoring failure yields no phrases instead of an
        # error, without swallowing KeyboardInterrupt/SystemExit
        top_phrases = []
    # return top phrases as strings
    return [' '.join(phrase) for phrase in top_phrases]

In [10]:
def trigram_finder(document, m=1, n=3):
    '''
    Return the top three-word phrases (trigrams) in a document, ranked by PMI.

    document : text data
    m : number of top phrases to return
    n : minimum number of times a phrase must appear in the document

    Returns a list of at most m strings, each a trigram joined by spaces.
    '''
    # tokenizer that keeps runs of word characters and drops punctuation
    # (apostrophes split tokens too, so "lee's" becomes 'lee', 's')
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # tokenize document
    words = tokenizer.tokenize(document)
    # association measures used to score trigram co-occurrences
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    # nltk co-occurrence finder built from the token stream
    finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
    # drop trigrams that appear fewer than n times
    finder.apply_freq_filter(n)
    # trigrams with the most importance (highest PMI)
    try:
        top_phrases = finder.nbest(trigram_measures.pmi, m)
    except Exception:
        # best-effort: a scoring failure yields no phrases instead of an
        # error, without swallowing KeyboardInterrupt/SystemExit
        top_phrases = []
    # return top phrases as strings
    return [' '.join(phrase) for phrase in top_phrases]

# Civil War

## Lincoln

In [11]:
# Civil War window and speaker; filt_docs treats both date bounds as inclusive
start_date = '1861-04-12'
end_date = '1865-04-09'
pres = 'Abraham Lincoln'

In [12]:
doc = filt_docs(transcripts, pres, start_date, end_date)

In [18]:
# bigram phrases
bigram_finder(doc, m=20, n=5)

['inaugural address',
 'per cent',
 'post office',
 'annual message',
 'supreme court',
 'fort sumter',
 'lee s',
 'four years',
 'years ago',
 'interior region',
 'slave trade',
 'south carolina',
 'compensated emancipation',
 'fellow citizens',
 'new york',
 'postmaster general',
 'invite your',
 '1st day',
 'fiscal year',
 'great britain']

In [19]:
# trigram phrases
trigram_finder(doc, m=20, n=5)

['post office department',
 'last annual message',
 'lee s army',
 'house of representatives',
 'the inaugural address',
 'leaving a balance',
 'senate and house',
 'the post office',
 'from all sources',
 'day of january',
 'receipts and disbursements',
 'fellow citizens of',
 'our relations with',
 '1st day of',
 'so far as',
 'the postmaster general',
 'upon this subject',
 'the 1st day',
 'it is hoped',
 'if you are']

# World War 2

## FDR

In [22]:
# World War 2 window and speaker; filt_docs treats both date bounds as inclusive
start_date = '1939-09-01'
end_date = '1945-09-02'
pres = 'Franklin D. Roosevelt'

In [23]:
doc = filt_docs(transcripts, pres, start_date, end_date)

In [24]:
bigram_finder(doc, m=20, n=5)

['dumbarton oaks',
 'chiang kai',
 'kai shek',
 'prime minister',
 'san francisco',
 'generalissimo chiang',
 'dr wassell',
 'white house',
 'marshal stalin',
 'unconditional surrender',
 'pearl harbor',
 'billion dollars',
 'raw materials',
 'overwhelming majority',
 'indian ocean',
 'polish provisional',
 'selective service',
 'mr churchill',
 'general macarthur',
 'farm products']

In [25]:
trigram_finder(doc, m=20, n=5)

['chiang kai shek',
 'generalissimo chiang kai',
 'polish provisional government',
 'plain common sense',
 'at pearl harbor',
 'united mine workers',
 'my fellow americans',
 'times greater than',
 'on pearl harbor',
 'war labor board',
 'here back home',
 'woman and child',
 'commander in chief',
 'does not mean',
 'i am confident',
 'two years ago',
 'let us remember',
 'than ever before',
 'cairo and teheran',
 'the white house']

# Vietnam War

## Johnson

In [26]:
# Vietnam War window and first speaker; filt_docs treats both date bounds as
# inclusive. The same start_date/end_date are reused by the Nixon cells below.
start_date = '1964-08-02'
end_date = '1975-04-30'
pres = 'Lyndon B. Johnson'

In [27]:
doc = filt_docs(transcripts, pres, start_date, end_date)

In [28]:
bigram_finder(doc, m=20, n=5)

['los angeles',
 'chi minh',
 'ho chi',
 'abraham lincoln',
 'andrew jackson',
 'de gaulle',
 'infant mortality',
 'johns hopkins',
 'random selection',
 'o clock',
 'rent supplements',
 'core unemployed',
 'cuban missile',
 'guerrilla warfare',
 'sam rayburn',
 'san antonio',
 'san francisco',
 'broadly based',
 'constituent assembly',
 'ranks 15th']

In [29]:
trigram_finder(doc, m=20, n=5)

['ho chi minh',
 'cuban missile crisis',
 'infant mortality rate',
 'hard core unemployed',
 'ranks 15th among',
 'gross national product',
 'john f kennedy',
 'test ban treaty',
 'asian development bank',
 'international monetary system',
 'earliest possible date',
 'at johns hopkins',
 'harry s truman',
 'safe streets act',
 'fiscal 1969 budget',
 'federal reserve board',
 'earliest possible moment',
 'extending back through',
 'good afternoon ladies',
 'foreign relations committee']

## Nixon

In [30]:
# only the speaker changes; start_date/end_date keep the values set in the
# Johnson cell, so that cell must have been run first
pres = 'Richard M. Nixon'

In [31]:
doc = filt_docs(transcripts, pres, start_date, end_date)

In [32]:
bigram_finder(doc, m=20, n=5)

['internationally supervised',
 'st clair',
 'abraham lincoln',
 'impeachable offense',
 'd c',
 'revenue sharing',
 'supreme court',
 '200th anniversary',
 'final analysis',
 'grand jury',
 'dr kissinger',
 'treaty commitments',
 'cease fire',
 'march 21',
 'conference table',
 'middle east',
 'super powers',
 'southeast asia',
 'attorney general',
 'special prosecutor']

In [33]:
trigram_finder(doc, m=20, n=5)

['washington d c',
 'mr st clair',
 '5 1 2',
 'an impeachable offense',
 'white house staff',
 'white house counsel',
 'house judiciary committee',
 'prosperity without inflation',
 'our treaty commitments',
 'on march 21',
 'turning away from',
 'mr speaker mr',
 'commander in chief',
 '1 2 years',
 'go forward together',
 'new attorney general',
 'q mr president',
 'a cease fire',
 'the supreme court',
 'my fellow americans']

# War on Terror

## Bush

In [34]:
# War on Terror window and first speaker; filt_docs treats both date bounds as
# inclusive. The same start_date/end_date are reused by the Obama cells below.
start_date = '2001-09-11'
end_date = '2011-09-11'
pres = 'George W. Bush'

In [35]:
doc = filt_docs(transcripts, pres, start_date, end_date)

In [36]:
bigram_finder(doc, m=20, n=5)

['bin laden',
 'pell grants',
 'torture chambers',
 'ronald reagan',
 'red cross',
 'sustainable cease',
 'saudi arabia',
 'prime minister',
 'wall street',
 'palestinian territories',
 'distinguished guests',
 'elect obama',
 'auto industry',
 'd c',
 'holy land',
 'loved ones',
 'embryonic stem',
 'oval office',
 'less dependent',
 'speaker vice']

In [37]:
trigram_finder(doc, m=20, n=5)

['sustainable cease fire',
 'washington d c',
 'embryonic stem cell',
 'mr speaker vice',
 'embryonic stem cells',
 'vice president cheney',
 'stem cell research',
 'temporary worker program',
 'president elect obama',
 'u n inspectors',
 'speaker vice president',
 'medical liability reform',
 'usa freedom corps',
 'both political parties',
 'border patrol agents',
 'may god bless',
 'health savings accounts',
 'prescription drug coverage',
 'child left behind',
 'security council resolutions']

## Obama

In [38]:
# only the speaker changes; start_date/end_date keep the values set in the
# Bush cell, so that cell must have been run first
pres = 'Barack Obama'

In [39]:
doc = filt_docs(transcripts, pres, start_date, end_date)

In [40]:
bigram_finder(doc, m=20, n=5)

['greenhouse gases',
 'mitch mcconnell',
 'holy koran',
 'natural gas',
 'status quo',
 'eastern europe',
 'bottom line',
 'supreme court',
 'judge sotomayor',
 'u n',
 'violent extremism',
 '9 11',
 'bin laden',
 'loved ones',
 'osama bin',
 'gulf coast',
 'safe havens',
 'preexisting conditions',
 'audience member',
 'wall street']

In [41]:
trigram_finder(doc, m=20, n=5)

['osama bin laden',
 'vice president biden',
 'congressional budget office',
 'small business owner',
 'muslim majority countries',
 'small business owners',
 'may god bless',
 'after 9 11',
 'middle class families',
 'innocent men women',
 'commander in chief',
 'round of applause',
 'afghan security forces',
 'hire new workers',
 'taliban s momentum',
 'be held accountable',
 'and mitch mcconnell',
 'against al qaeda',
 'boehner and mitch',
 'make no mistake']