This notebook counts the most frequent tokens for each part-of-speech tag (verbs, adjectives, nouns) in three text corpora, using spaCy.

In [1]:
from collections import Counter

import pandas as pd
import spacy

In [2]:
# Mount Google Drive at /content/drive. The CSV files read later in this
# notebook use relative paths beginning with drive/MyDrive, so they
# presumably rely on the working directory being /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the "en_core_web_sm" spaCy pipeline once; tag_sents_pos reads this
# module-level `nlp` object every time it is called.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Read the pipe-delimited, header-less quotes file and flatten its first
# column into a single space-separated string.
quotes = ' '.join(
    pd.read_csv(
        'drive/MyDrive/data2/mlk131quotes.csv',
        delimiter='|',
        header=None,
    )[0].tolist()
)

In [5]:
# Both files are pipe-delimited with no header row; the two columns are
# named 'sentence' and 'label' on load.
dfmlkwiki = pd.read_csv(
    "drive/MyDrive/data/coding_prez/mlkwiki.csv",
    delimiter='|',
    header=None,
    names=['sentence', 'label'],
)
dfbirmjail = pd.read_csv(
    "drive/MyDrive/data/coding_prez/birminghamjail.csv",
    delimiter='|',
    header=None,
    names=['sentence', 'label'],
)

# Cast every sentence cell to str before joining so non-string cells do
# not break ' '.join, then collapse each corpus into one string.
mlkwikistr = ' '.join(dfmlkwiki['sentence'].values.astype('str'))
birmjailstr = ' '.join(dfbirmjail['sentence'].values.astype('str'))

In [33]:
def tag_sents_pos(final_string):
    """
    Split a text into sentences and tag every token with its part of speech.

    The text is run through the module-level spaCy pipeline `nlp`. The
    result is a list with one entry per sentence in `doc.sents`; each entry
    is a list of (token.text, token.pos_) tuples in token order. The
    original author describes this nested layout as replicating NLTK's
    sentence-level POS tagging output.
    """
    parsed = nlp(final_string)
    return [
        [(tok.text, tok.pos_) for tok in sentence]
        for sentence in parsed.sents
    ]

In [31]:
# Tag each corpus string once. Every result is a list of sentences, each a
# list of (text, pos) tuples, which is the input shape top_pos iterates over.
quotespos = tag_sents_pos(quotes)
wikipos = tag_sents_pos(mlkwikistr)
birmjailpos = tag_sents_pos(birmjailstr)
# Uncomment one of the names below to display the full tagged output.
# quotespos
# birmjailpos

In [48]:
def top_pos(tokenpos, postype, lowercase=False):
    '''Rank the tokens that carry a given part-of-speech tag by frequency.

    Args:
        tokenpos: Iterable of sentences, each an iterable of (text, pos)
            pairs, e.g. the value returned by tag_sents_pos.
        postype: Tag to keep; compared for equality against the second
            element of every pair (for example 'VERB').
        lowercase: When True, the token text is lower-cased before counting
            so that case variants such as 'See' and 'see' are merged into
            one entry. Defaults to False, which counts every pair exactly
            as it was given.

    Returns:
        A list of ((text, pos), count) tuples ordered from the highest
        count to the lowest. Entries with equal counts keep the order in
        which they were first encountered. An empty input yields [].
    '''
    counts = Counter()
    for sentence in tokenpos:
        for token in sentence:
            if token[1] != postype:
                continue
            # Only rebuild the key when folding case; otherwise the pair
            # is counted untouched, exactly as before.
            key = (token[0].lower(), token[1]) if lowercase else token
            counts[key] += 1
    # most_common() sorts by count, descending, and keeps first-seen order
    # among ties -- the same ordering as a stable reverse sort on the count.
    return counts.most_common()

In [62]:
# Twenty most frequent VERB-tagged tokens in the quotes corpus, highest count first.
print('Quotes VERBS')
top_pos(quotespos, 'VERB')[:20]

Quotes VERBS


[(('can', 'VERB'), 21),
 (('must', 'VERB'), 18),
 (('will', 'VERB'), 16),
 (('love', 'VERB'), 6),
 (('make', 'VERB'), 5),
 (('walk', 'VERB'), 5),
 (('live', 'VERB'), 5),
 (('should', 'VERB'), 4),
 (('serve', 'VERB'), 4),
 (('change', 'VERB'), 4),
 (('become', 'VERB'), 4),
 (('take', 'VERB'), 4),
 (('think', 'VERB'), 4),
 (('rise', 'VERB'), 3),
 (('find', 'VERB'), 3),
 (('hate', 'VERB'), 3),
 (('forgive', 'VERB'), 3),
 (('say', 'VERB'), 3),
 (('see', 'VERB'), 3),
 (('ca', 'VERB'), 3)]

In [60]:
# Twenty most frequent VERB-tagged tokens in the wiki corpus, highest count first.
print('Wiki VERBS')
top_pos(wikipos, 'VERB')[:20]

Wiki VERBS


[(('would', 'VERB'), 49),
 (('said', 'VERB'), 29),
 (('became', 'VERB'), 22),
 (('stated', 'VERB'), 21),
 (('will', 'VERB'), 20),
 (('could', 'VERB'), 20),
 (('made', 'VERB'), 19),
 (('called', 'VERB'), 19),
 (('including', 'VERB'), 15),
 (('led', 'VERB'), 12),
 (('began', 'VERB'), 12),
 (('see', 'VERB'), 12),
 (('known', 'VERB'), 12),
 (('must', 'VERB'), 12),
 (('say', 'VERB'), 12),
 (('took', 'VERB'), 11),
 (('saying', 'VERB'), 11),
 (('want', 'VERB'), 11),
 (('See', 'VERB'), 10),
 (('wrote', 'VERB'), 10)]

In [58]:
# Twenty most frequent VERB-tagged tokens in the Birmingham Jail corpus, highest count first.
print('Birmingham Jail VERBS')
top_pos(birmjailpos, 'VERB')[:20]

Birmingham Jail VERBS


[(('will', 'VERB'), 34),
 (('would', 'VERB'), 26),
 (('can', 'VERB'), 24),
 (('must', 'VERB'), 19),
 (('see', 'VERB'), 19),
 (('say', 'VERB'), 11),
 (('made', 'VERB'), 10),
 (('give', 'VERB'), 8),
 (('come', 'VERB'), 8),
 (('understand', 'VERB'), 7),
 (('said', 'VERB'), 6),
 (('set', 'VERB'), 5),
 (('decided', 'VERB'), 5),
 (('felt', 'VERB'), 5),
 (('could', 'VERB'), 5),
 (('used', 'VERB'), 5),
 (('may', 'VERB'), 5),
 (('oppressed', 'VERB'), 5),
 (('find', 'VERB'), 5),
 (('becomes', 'VERB'), 5)]

In [63]:
# Twenty most frequent ADJ-tagged tokens in the quotes corpus, highest count first.
print('Quotes ADJ')
top_pos(quotespos, 'ADJ')[:20]

Quotes ADJ


[(('great', 'ADJ'), 6),
 (('devoid', 'ADJ'), 5),
 (('right', 'ADJ'), 5),
 (('creative', 'ADJ'), 3),
 (('internal', 'ADJ'), 3),
 (('external', 'ADJ'), 3),
 (('physical', 'ADJ'), 3),
 (('spiritual', 'ADJ'), 2),
 (('important', 'ADJ'), 2),
 (('absolute', 'ADJ'), 2),
 (('emotional', 'ADJ'), 2),
 (('whole', 'ADJ'), 2),
 (('deep', 'ADJ'), 2),
 (('human', 'ADJ'), 2),
 (('small', 'ADJ'), 2),
 (('bright', 'ADJ'), 2),
 (('poor', 'ADJ'), 2),
 (('dedicated', 'ADJ'), 2),
 (('free', 'ADJ'), 2),
 (('American', 'ADJ'), 2)]

In [64]:
# Twenty most frequent ADJ-tagged tokens in the wiki corpus, highest count first.
print('Wiki ADJ')
top_pos(wikipos, 'ADJ')[:20]

Wiki ADJ


[(('civil', 'ADJ'), 85),
 (('American', 'ADJ'), 37),
 (('other', 'ADJ'), 33),
 (('black', 'ADJ'), 31),
 (('white', 'ADJ'), 26),
 (('nonviolent', 'ADJ'), 24),
 (('first', 'ADJ'), 22),
 (('racial', 'ADJ'), 20),
 (('social', 'ADJ'), 19),
 (('many', 'ADJ'), 17),
 (('public', 'ADJ'), 17),
 (('African', 'ADJ'), 16),
 (('national', 'ADJ'), 15),
 (('political', 'ADJ'), 14),
 (('-', 'ADJ'), 14),
 (('Main', 'ADJ'), 12),
 (('economic', 'ADJ'), 12),
 (('several', 'ADJ'), 11),
 (('high', 'ADJ'), 11),
 (('old', 'ADJ'), 10)]

In [65]:
# Twenty most frequent ADJ-tagged tokens in the Birmingham Jail corpus, highest count first.
print('Birmingham Jail ADJ')
top_pos(birmjailpos, 'ADJ')[:20]

Birmingham Jail ADJ


[(('nonviolent', 'ADJ'), 16),
 (('white', 'ADJ'), 16),
 (('unjust', 'ADJ'), 16),
 (('direct', 'ADJ'), 15),
 (('other', 'ADJ'), 9),
 (('many', 'ADJ'), 7),
 (('moral', 'ADJ'), 7),
 (('old', 'ADJ'), 7),
 (('black', 'ADJ'), 6),
 (('little', 'ADJ'), 6),
 (('racial', 'ADJ'), 6),
 (('long', 'ADJ'), 6),
 (('wrong', 'ADJ'), 6),
 (('few', 'ADJ'), 6),
 (('public', 'ADJ'), 5),
 (('religious', 'ADJ'), 5),
 (('good', 'ADJ'), 5),
 (('necessary', 'ADJ'), 5),
 (('social', 'ADJ'), 5),
 (('young', 'ADJ'), 5)]

In [66]:
# Twenty most frequent NOUN-tagged tokens in the quotes corpus, highest count first.
print('Quotes NOUNS')
top_pos(quotespos, 'NOUN')[:20]

Quotes NOUNS


[(('man', 'NOUN'), 13),
 (('time', 'NOUN'), 10),
 (('power', 'NOUN'), 9),
 (('justice', 'NOUN'), 8),
 (('love', 'NOUN'), 7),
 (('people', 'NOUN'), 6),
 (('dream', 'NOUN'), 5),
 (('men', 'NOUN'), 5),
 (('life', 'NOUN'), 4),
 (('way', 'NOUN'), 4),
 (('religion', 'NOUN'), 4),
 (('method', 'NOUN'), 4),
 (('day', 'NOUN'), 4),
 (('nation', 'NOUN'), 4),
 (('goal', 'NOUN'), 4),
 (('peace', 'NOUN'), 4),
 (('war', 'NOUN'), 4),
 (('soul', 'NOUN'), 3),
 (('Love', 'NOUN'), 3),
 (('being', 'NOUN'), 3)]

In [67]:
# Twenty most frequent NOUN-tagged tokens in the wiki corpus, highest count first.
print('Wiki NOUNS')
top_pos(wikipos, 'NOUN')[:20]

Wiki NOUNS


[(('rights', 'NOUN'), 99),
 (('movement', 'NOUN'), 51),
 (('speech', 'NOUN'), 27),
 (('time', 'NOUN'), 27),
 (('day', 'NOUN'), 25),
 (('church', 'NOUN'), 23),
 (('father', 'NOUN'), 23),
 (('years', 'NOUN'), 22),
 (('assassination', 'NOUN'), 21),
 (('life', 'NOUN'), 21),
 (('death', 'NOUN'), 21),
 (('school', 'NOUN'), 21),
 (('march', 'NOUN'), 21),
 (('people', 'NOUN'), 20),
 (('leaders', 'NOUN'), 20),
 (('campaign', 'NOUN'), 20),
 (('year', 'NOUN'), 18),
 (('children', 'NOUN'), 17),
 (('role', 'NOUN'), 14),
 (('police', 'NOUN'), 14)]

In [68]:
# Twenty most frequent NOUN-tagged tokens in the Birmingham Jail corpus, highest count first.
print('Birmingham Jail NOUNS')
top_pos(birmjailpos, 'NOUN')[:20]

Birmingham Jail NOUNS


[(('law', 'NOUN'), 20),
 (('action', 'NOUN'), 19),
 (('time', 'NOUN'), 17),
 (('people', 'NOUN'), 17),
 (('segregation', 'NOUN'), 16),
 (('church', 'NOUN'), 15),
 (('freedom', 'NOUN'), 14),
 (('laws', 'NOUN'), 12),
 (('justice', 'NOUN'), 11),
 (('community', 'NOUN'), 10),
 (('brothers', 'NOUN'), 10),
 (('day', 'NOUN'), 9),
 (('extremist', 'NOUN'), 9),
 (('leaders', 'NOUN'), 8),
 (('city', 'NOUN'), 8),
 (('fact', 'NOUN'), 8),
 (('tension', 'NOUN'), 8),
 (('years', 'NOUN'), 8),
 (('sense', 'NOUN'), 8),
 (('jail', 'NOUN'), 7)]