# Part of Speech Tagging

## Install NLTK POS Tagger 

In [146]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to /home/hhhuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hhhuang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Test

In [147]:
text = "The dog eats the big hotdog."
tokens = word_tokenize(text)
print(nltk.pos_tag(tokens))


[('The', 'DT'), ('dog', 'NN'), ('eats', 'VBZ'), ('the', 'DT'), ('big', 'JJ'), ('hotdog', 'NN'), ('.', '.')]


In [148]:
print(nltk.pos_tag(word_tokenize("The book is written by my father.")))

[('The', 'DT'), ('book', 'NN'), ('is', 'VBZ'), ('written', 'VBN'), ('by', 'IN'), ('my', 'PRP$'), ('father', 'NN'), ('.', '.')]


In [149]:
print(nltk.pos_tag(word_tokenize("My father has written more than ten books.")))

[('My', 'PRP$'), ('father', 'NN'), ('has', 'VBZ'), ('written', 'VBN'), ('more', 'JJR'), ('than', 'IN'), ('ten', 'JJ'), ('books', 'NNS'), ('.', '.')]


Full list of the Penn POS tags
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

Perform POS tagging for all tokens in the corpus. 

In [150]:
with open("../text_mining_2/corpus.txt", encoding="utf8") as fin:
    text = fin.read()
print("Number of characters: %d" % len(text))

Number of characters: 75346


In [151]:
tokens = word_tokenize(text)
tokens_with_tag = nltk.pos_tag(tokens)

Find the most frequent nouns

In [152]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'NN':
        noun_counts[word] += 1
        
print(noun_counts.most_common(20))

[('class', 104), ('bourgeoisie', 89), ('society', 72), ('bourgeois', 69), ('proletariat', 62), ('property', 55), ('production', 52), ('existence', 30), ('labor', 30), ('development', 28), ('industry', 27), ('capital', 22), ('form', 21), ('movement', 19), ('character', 17), ('struggle', 17), ('country', 15), ('abolition', 15), ('time', 14), ('revolution', 14)]


Now we can always convert words into lower case, excepting proper nouns.

In [153]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'NN':
        noun_counts[word.lower()] += 1
        
print(noun_counts.most_common(20))

[('class', 104), ('bourgeoisie', 89), ('society', 73), ('bourgeois', 69), ('proletariat', 62), ('property', 56), ('production', 52), ('existence', 30), ('labor', 30), ('development', 28), ('industry', 27), ('capital', 24), ('form', 21), ('abolition', 20), ('movement', 19), ('character', 17), ('struggle', 17), ('country', 15), ('time', 14), ('revolution', 14)]


Explore other kinds of part of speech tags

In [154]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'VB':
        noun_counts[word.lower()] += 1
        
print(noun_counts.most_common(20))

[('be', 41), ('have', 8), ('do', 8), ('lose', 5), ('increase', 5), ('introduce', 5), ('attain', 4), ('thus', 4), ('let', 4), ('use', 3), ('vanish', 3), ('become', 3), ('take', 3), ('form', 3), ('acquire', 3), ('bring', 3), ('abolish', 3), ('live', 2), ('comprehend', 2), ('express', 2)]


In [155]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'NNP' or tag == 'NNPS':
        noun_counts[word] += 1
        
print(noun_counts.most_common(20))

[('Communists', 23), ('Socialism', 21), ('Germany', 13), ('Communism', 12), ('France', 12), ('State', 11), ('England', 9), ('Communist', 7), ('Socialist', 6), ('_i.e._', 5), ('America', 5), ('AND', 4), ('Communistic', 3), ('I.', 3), ('Bourgeois', 3), ('II', 3), ('Hence', 3), ('Socialism_', 3), ('THE', 3), ('Modern', 3)]


In [156]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'V':
        noun_counts[word.lower()] += 1
        
print(noun_counts.most_common(20))

[('is', 138), ('has', 68), ('are', 59), ('have', 45), ('be', 41), ('was', 29), ('do', 18), ('been', 18), ('existing', 15), ('were', 14), ('had', 12), ('being', 11), ('working', 10), ('does', 9), ('become', 9), ('made', 8), ('created', 7), ('see', 7), ('becomes', 7), ('developed', 7)]


With lemmatization for better handle different forms of verbs.

Load WordNet Lemmatizer provided by NTLK

In [157]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [158]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'V':
        noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'v')] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)

print(noun_counts.most_common(20))

[('be', 310), ('have', 128), ('do', 31), ('become', 21), ('exist', 20), ('take', 17), ('work', 14), ('create', 13), ('develop', 13), ('make', 13), ('see', 12), ('find', 11), ('lose', 10), ('destroy', 9), ('increase', 9), ('abolish', 9), ('give', 9), ('go', 9), ('carry', 8), ('require', 8)]


In [159]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'N':
        noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'n')] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)

print(noun_counts.most_common(20))

[('class', 124), ('bourgeoisie', 91), ('society', 76), ('bourgeois', 75), ('proletariat', 64), ('condition', 59), ('property', 56), ('production', 53), ('industry', 35), ('communist', 34), ('relation', 32), ('mean', 30), ('existence', 30), ('labor', 30), ('country', 28), ('form', 28), ('socialism', 28), ('development', 28), ('state', 24), ('capital', 24)]


In [160]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'R':
        noun_counts[word.lower()] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)

print(noun_counts.most_common(20))

[('not', 55), ('more', 32), ('only', 29), ('so', 27), ('up', 25), ('therefore', 21), ('most', 17), ('also', 13), ('away', 13), ('longer', 12), ('then', 11), ('no', 10), ('even', 10), ('now', 9), ('generally', 9), ('out', 9), ('just', 9), ('ever', 9), ('everywhere', 9), ('thus', 8)]


In [161]:
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'R':
        noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'r')] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)

print(noun_counts.most_common(20))

[('not', 55), ('more', 32), ('only', 29), ('so', 27), ('up', 25), ('therefore', 21), ('most', 17), ('also', 13), ('away', 13), ('longer', 12), ('far', 11), ('then', 11), ('no', 10), ('even', 10), ('now', 9), ('generally', 9), ('out', 9), ('just', 9), ('ever', 9), ('everywhere', 9)]


## Mining Specific Distant Collocations

Back to last week

In [162]:
window_size = 9

word_pair_counts = Counter()
word_pair_distance_counts = Counter()
for i in range(len(tokens) - 1):
    for distance in range(1, window_size):
        if i + distance < len(tokens):
            w1 = tokens[i]
            w2 = tokens[i + distance]
            word_pair_distance_counts[(w1, w2, distance)] += 1
            word_pair_counts[(w1, w2)] += 1

for (w1, w2, distance), c in word_pair_distance_counts.most_common(20):
    print("%s\t%s\t%d\t%d" % (w1, w2, distance, c))

the	of	2	279
of	the	1	242
the	the	3	154
,	the	2	118
,	and	1	111
,	,	2	109
the	of	3	105
the	,	5	101
.	The	1	100
,	,	7	98
,	,	4	97
of	,	3	97
the	the	7	95
the	,	8	94
the	the	6	92
,	the	4	92
,	,	6	92
the	the	4	92
the	,	2	91
of	,	2	91


In [163]:
window_size = 9

word_pair_counts = Counter()
word_pair_distance_counts = Counter()
for i in range(len(tokens_with_tag) - 1):
    w1, t1 = tokens_with_tag[i]
    if t1[0] != 'V':
        continue
    w1 = wordnet_lemmatizer.lemmatize(w1.lower(), 'v')
        
    for distance in range(1, window_size):
        if i + distance < len(tokens_with_tag):
            w2, t2 = tokens_with_tag[i + distance]
            if t2[0] == 'N':
                w2 = wordnet_lemmatizer.lemmatize(w2.lower(), 'n')
                word_pair_distance_counts[(w1, w2, distance)] += 1
                word_pair_counts[(w1, w2)] += 1

for (w1, w2, distance), c in word_pair_distance_counts.most_common(20):
    print("%s\t%s\t%d\t%d" % (w1, w2, distance, c))

work	class	1	10
be	class	4	6
exist	society	1	5
be	class	6	5
be	society	5	4
rule	class	1	4
work	party	2	4
be	class	7	4
be	class	3	4
pave	way	2	3
be	bourgeoisie	8	3
be	hand	4	3
put	end	2	3
be	mean	4	3
be	bourgeois	5	3
be	condition	6	3
lose	character	3	3
exist	state	1	3
be	bourgeois	6	3
appropriate	product	2	3


Compute the mean distance of each verb-noun pair.

In [164]:
pair_mean_distances = Counter()

for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if word_pair_counts[(w1, w2)] > 1:
        pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])


Show the longest, middle, and shortest pairs. 

In [165]:
for (w1, w2), distance in pair_mean_distances.most_common(20):
    print("%s\t%s\t%f\t%d" % (w1, w2, distance, word_pair_counts[(w1, w2)]))

introduce	bourgeoisie	8.000000	2
be	case	8.000000	2
have	ruling	7.500000	2
be	communism	7.500000	2
be	interest	7.500000	2
be	moment	7.500000	2
be	order	7.500000	2
have	force	7.500000	2
do	population	7.500000	2
have	communism	7.500000	2
have	master	7.500000	2
be	action	7.000000	2
be	manufacturer	7.000000	2
be	development	7.000000	3
have	place	7.000000	2
have	dissolution	7.000000	2
be	relation	7.000000	3
see	production	7.000000	2
leave	man	7.000000	2
be	wife	7.000000	2


In [166]:
num_pairs = len(pair_mean_distances)
mid = num_pairs // 2
for (w1, w2), distance in pair_mean_distances.most_common()[mid-20:mid+20]:
    print("%s\t%s\t%f\t%d" % (w1, w2, distance, word_pair_counts[(w1, w2)]))

be	property	5.333333	9
be	capitalist	5.333333	3
be	mean	5.333333	6
be	laborer	5.333333	3
have	feudal	5.333333	3
have	character	5.333333	3
be	class	5.238095	21
be	struggle	5.166667	6
be	capital	5.166667	6
be	condition	5.100000	10
be	time	5.000000	5
be	bare	5.000000	2
deaden	class	5.000000	2
convert	property	5.000000	2
be	family	5.000000	2
be	hand	5.000000	4
be	idea	5.000000	4
have	population	5.000000	3
see	antagonism	5.000000	2
abolish	property	5.000000	3
replace	education	5.000000	2
do	property	5.000000	5
keep	laborer	5.000000	2
be	advance	5.000000	2
compel	proletariat	5.000000	2
exist	bourgeoisie	5.000000	2
have	part	5.000000	2
create	property	5.000000	2
be	way	5.000000	3
increase	labor	5.000000	2
do	history	5.000000	2
have	dependent	5.000000	2
be	slave	5.000000	2
be	character	4.750000	4
have	mean	4.750000	4
be	man	4.750000	4
have	hand	4.600000	5
create	force	4.500000	2
be	attempt	4.500000	2
be	party	4.500000	2


In [167]:
for (w1, w2), distance in pair_mean_distances.most_common()[-20:]:
    print("%s\t%s\t%f\t%d" % (w1, w2, distance, word_pair_counts[(w1, w2)]))

supply	proletariat	2.000000	2
stand	face	2.000000	2
have	meaning	2.000000	2
keep	pace	2.000000	2
go	hand	2.000000	2
exist	property	2.000000	2
pave	way	2.000000	3
work	class	2.000000	12
continue	existence	2.000000	2
appropriate	product	1.600000	5
have	individuality	1.500000	2
create	condition	1.500000	2
take	place	1.500000	2
increase	capital	1.500000	2
exist	society	1.333333	6
introduce	community	1.000000	2
bourgeois	society	1.000000	2
rule	class	1.000000	4
lose	sight	1.000000	2
rise	bourgeoisie	1.000000	2


Find out the meaningful verb/noun pairs with deviation.

In [168]:
pair_deviations = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if word_pair_counts[(w1, w2)] > 1:
        pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    
for (w1, w2), dev_tmp in pair_deviations.most_common():
    s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
    pair_deviations[(w1, w2)] = s_2 ** 0.5
    
for (w1, w2), dev in pair_deviations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))

be	antagonism	4.000000	0.000000	2
admit	case	3.000000	0.000000	2
base	antagonism	3.000000	0.000000	2
introduce	community	1.000000	0.000000	2
dominate	society	6.000000	0.000000	2
compel	proletariat	5.000000	0.000000	2
be	air	4.000000	0.000000	2
have	meaning	2.000000	0.000000	2
get	hand	3.000000	0.000000	2
be	case	8.000000	0.000000	2
have	part	5.000000	0.000000	2
bourgeois	society	1.000000	0.000000	2
see	proletariat	7.000000	0.000000	2
keep	pace	2.000000	0.000000	2
have	man	7.000000	0.000000	2
rule	class	1.000000	0.000000	4
lose	sight	1.000000	0.000000	2
pave	way	2.000000	0.000000	3
rise	bourgeoisie	1.000000	0.000000	2
have	industry	7.000000	0.000000	2


Filter out the stopwords.

In [169]:
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')

pair_deviations = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if w1 in stopword_list:
        continue
    if word_pair_counts[(w1, w2)] > 1:
        pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    
for (w1, w2), dev_tmp in pair_deviations.most_common():
    s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
    pair_deviations[(w1, w2)] = s_2 ** 0.5
    
for (w1, w2), dev in pair_deviations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))

introduce	community	1.000000	0.000000	2
lose	character	3.000000	0.000000	3
dominate	society	6.000000	0.000000	2
compel	proletariat	5.000000	0.000000	2
transform	property	3.000000	0.000000	2
get	hand	3.000000	0.000000	2
put	end	2.000000	0.000000	3
introduce	bourgeoisie	8.000000	0.000000	2
bourgeois	society	1.000000	0.000000	2
see	proletariat	7.000000	0.000000	2
keep	pace	2.000000	0.000000	2
supply	proletariat	2.000000	0.000000	2
rule	class	1.000000	0.000000	4
determine	condition	4.000000	0.000000	2
lose	sight	1.000000	0.000000	2
pave	way	2.000000	0.000000	3
introduce	woman	3.000000	0.000000	2
rise	bourgeoisie	1.000000	0.000000	2
organize	class	4.000000	0.000000	2
admit	case	3.000000	0.000000	2


Further filter out the low frequent pairs. 

In [170]:
pair_deviations = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if w1 in stopword_list:
        continue
    if word_pair_counts[(w1, w2)] > 2:
        pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    
for (w1, w2), dev_tmp in pair_deviations.most_common():
    s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
    pair_deviations[(w1, w2)] = s_2 ** 0.5
    
for (w1, w2), dev in pair_deviations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))

find	work	2.666667	2.886751	3
abolish	property	5.000000	2.645751	3
join	class	5.666667	2.516611	3
fight	bourgeoisie	3.800000	2.387467	5
work	class	2.000000	2.374103	12
represent	interest	3.666667	2.081666	3
mean	bourgeois	4.333333	2.081666	3
exist	thing	4.000000	1.732051	3
intend	property	6.666667	1.154701	3
revolutionize	production	3.333333	1.154701	3
concentrate	hand	3.666667	1.154701	3
attain	end	3.000000	1.000000	3
exist	society	1.333333	0.816497	6
produce	product	3.500000	0.577350	4
appropriate	product	1.600000	0.547723	5
work	party	2.000000	0.000000	4
rule	class	1.000000	0.000000	4
put	end	2.000000	0.000000	3
lose	character	3.000000	0.000000	3
pave	way	2.000000	0.000000	3


General method for distant collocation mining. 

In [171]:
# A handy lemmatizer 
# WordNet Style: ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
# Penn Style: ADJ (J*), ADJ_SAT (J*), ADV (R*), NOUN (N*), or VERB (V*) 
def lemmatize_verbose(word, pos):
    if pos[0] == 'J':
        return wordnet_lemmatizer.lemmatize(word, 'a')
    elif pos[0] == 'R':
        return wordnet_lemmatizer.lemmatize(word, 'r')
    elif pos[0] == 'N':
        return wordnet_lemmatizer.lemmatize(word, 'n')
    elif pos[0] == 'V':
        return wordnet_lemmatizer.lemmatize(word, 'v')
    else:
        return word
    

def lemmatize_shorter(word, pos):
    if pos[0] == 'J':
        pos = 'a'
    elif pos[0] == 'R':
        pos = 'r'
    elif pos[0] == 'N':
        pos = 'n'
    elif pos[0] == 'V':
        pos = 'v'
    else:
        return word
    return wordnet_lemmatizer.lemmatize(word, pos)


def lemmatize_smarter(word, pos):
    if pos[0] in ['R', 'N', 'V']:
        pos = pos[0].lower()
    elif pos[0] == 'J':
        pos = 'a'
    else:
        return word
    return wordnet_lemmatizer.lemmatize(word, pos)


# Recommended implementation.
def lemmatize(word, pos):
    mapping = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    if pos[0] in mapping:
        return wordnet_lemmatizer.lemmatize(word, mapping[pos[0]])
    return word


Count all pairs.
   

In [172]:
def distant_collocations(tokens_with_tag, pos1, pos2, min_cut=2, window_size=9):
    word_pair_counts = Counter()
    word_pair_distance_counts = Counter()
    for i in range(len(tokens_with_tag) - 1):
        w1, t1 = tokens_with_tag[i]
        if not t1.startswith(pos1):
            continue
        w1 = lemmatize(w1.lower(), t1)
        for distance in range(1, window_size):
            if i + distance < len(tokens_with_tag):
                w2, t2 = tokens_with_tag[i + distance]
                if t2.startswith(pos2):
                    w2 = lemmatize(w2.lower(), t2)
                    word_pair_distance_counts[(w1, w2, distance)] += 1
                    word_pair_counts[(w1, w2)] += 1
    
    pair_mean_distances = Counter()

    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if word_pair_counts[(w1, w2)] > 1:
            pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])

    pair_deviations = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if w1 in stopword_list:
            continue
        if word_pair_counts[(w1, w2)] > min_cut:
            pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    
    for (w1, w2), dev_tmp in pair_deviations.most_common():
        s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
        pair_deviations[(w1, w2)] = s_2 ** 0.5
    
    return pair_deviations

In [173]:
collocations = distant_collocations(tokens_with_tag, 'V', 'N')

for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))        


find	work	2.666667	2.886751	3
abolish	property	5.000000	2.645751	3
join	class	5.666667	2.516611	3
fight	bourgeoisie	3.800000	2.387467	5
work	class	2.000000	2.374103	12
represent	interest	3.666667	2.081666	3
mean	bourgeois	4.333333	2.081666	3
exist	thing	4.000000	1.732051	3
intend	property	6.666667	1.154701	3
revolutionize	production	3.333333	1.154701	3
concentrate	hand	3.666667	1.154701	3
attain	end	3.000000	1.000000	3
exist	society	1.333333	0.816497	6
produce	product	3.500000	0.577350	4
appropriate	product	1.600000	0.547723	5
work	party	2.000000	0.000000	4
rule	class	1.000000	0.000000	4
put	end	2.000000	0.000000	3
lose	character	3.000000	0.000000	3
pave	way	2.000000	0.000000	3


In [174]:
collocations = distant_collocations(tokens_with_tag, 'N', 'N')

for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))        


mean	subsistence	0.000000	0.000000	0
mode	production	0.000000	0.000000	0
member	society	0.000000	0.000000	0
state	society	0.000000	0.000000	0
condition	life	0.000000	0.000000	0
bourgeois	society	1.000000	0.000000	2
relation	production	0.000000	0.000000	0
benefit	class	0.000000	0.000000	0
mean	communication	0.000000	0.000000	0
form	society	0.000000	0.000000	0
portion	bourgeoisie	0.000000	0.000000	0
division	labor	0.000000	0.000000	0
form	property	0.000000	0.000000	0
ruling	class	0.000000	0.000000	0
instrument	production	0.000000	0.000000	0
production	exchange	0.000000	0.000000	0
community	woman	0.000000	0.000000	0
disappearance	class	0.000000	0.000000	0
bourgeois	socialism	0.000000	0.000000	1
section	class	0.000000	0.000000	0


In [175]:
collocations = distant_collocations(tokens_with_tag, 'J', 'N')

for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))        


political	bourgeoisie	0.000000	0.000000	0
private	property	0.000000	0.000000	0
productive	force	0.000000	0.000000	0
working	class	0.000000	0.000000	0
petty	bourgeois	0.000000	0.000000	0
undeveloped	state	0.000000	0.000000	0
modern	bourgeois	0.000000	0.000000	0
eternal	truth	0.000000	0.000000	0
bourgeois	production	0.000000	0.000000	1
french	revolution	0.000000	0.000000	0
absolute	monarchy	0.000000	0.000000	0
free	trade	0.000000	0.000000	0
feudal	society	0.000000	0.000000	0
historical	development	0.000000	0.000000	0
modern	bourgeoisie	0.000000	0.000000	0
middle	age	0.000000	0.000000	0
mere	production	0.000000	0.000000	0
political	supremacy	0.000000	0.000000	0
immense	majority	0.000000	0.000000	0
eighteenth	century	0.000000	0.000000	0


In [176]:
collocations = distant_collocations(tokens_with_tag, 'NNP', 'N')

for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))        


true	socialism	0.000000	3.785939	1
communist	party	0.000000	2.863564	0
communist	communist	0.000000	2.645751	0
england	france	0.000000	2.516611	0
socialist	literature	0.000000	2.500000	0
communist	literature	0.000000	2.500000	0
communism	power	0.000000	1.527525	0
germany	bourgeoisie	0.000000	0.577350	0
socialist	communist	0.000000	0.000000	0


Implememnt a better lemmatizer for handling proper nouns (NNP / NNPS).

In [177]:
def lemmatize(word, pos):
    if not pos.startswith('NNP'):
        word = word.lower()
    mapping = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    if pos[0] in mapping:
        return wordnet_lemmatizer.lemmatize(word, mapping[pos[0]])
    return word

And do not lower() the word in the main function anymore. 

In [178]:
def distant_collocations(tokens_with_tag, pos1, pos2, min_cut=2, window_size=9):
    word_pair_counts = Counter()
    word_pair_distance_counts = Counter()
    for i in range(len(tokens_with_tag) - 1):
        w1, t1 = tokens_with_tag[i]
        if not t1.startswith(pos1):
            continue
        w1 = lemmatize(w1, t1)
        for distance in range(1, window_size):
            if i + distance < len(tokens_with_tag):
                w2, t2 = tokens_with_tag[i + distance]
                if t2.startswith(pos2):
                    w2 = lemmatize(w2, t2)
                    word_pair_distance_counts[(w1, w2, distance)] += 1
                    word_pair_counts[(w1, w2)] += 1
    
    pair_mean_distances = Counter()

    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if word_pair_counts[(w1, w2)] > 1:
            pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])

    pair_deviations = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if w1 in stopword_list:
            continue
        if word_pair_counts[(w1, w2)] > min_cut:
            pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    
    for (w1, w2), dev_tmp in pair_deviations.most_common():
        s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
        pair_deviations[(w1, w2)] = s_2 ** 0.5
    
    return pair_deviations

In [179]:
collocations = distant_collocations(tokens_with_tag, 'NNP', 'N')

for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))        

Communist	literature	0.000000	2.886751	0
Socialist	literature	0.000000	2.886751	0
England	France	0.000000	2.516611	0
Communism	power	0.000000	1.527525	0
Communists	party	0.000000	1.154701	0
Germany	bourgeoisie	0.000000	0.577350	0
Socialist	Communist	0.000000	0.000000	0


In [180]:
collocations = distant_collocations(tokens_with_tag, 'V', 'N')

for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))        

find	work	2.666667	2.886751	3
abolish	property	5.000000	2.645751	3
join	class	5.666667	2.516611	3
fight	bourgeoisie	3.800000	2.387467	5
work	class	2.000000	2.374103	12
represent	interest	3.666667	2.081666	3
mean	bourgeois	4.333333	2.081666	3
exist	thing	4.000000	1.732051	3
intend	property	6.666667	1.154701	3
revolutionize	production	3.333333	1.154701	3
concentrate	hand	3.666667	1.154701	3
attain	end	3.000000	1.000000	3
exist	society	1.333333	0.816497	6
produce	product	3.500000	0.577350	4
appropriate	product	1.600000	0.547723	5
work	party	2.000000	0.000000	4
rule	class	1.000000	0.000000	4
put	end	2.000000	0.000000	3
lose	character	3.000000	0.000000	3
pave	way	2.000000	0.000000	3
