<a href="https://colab.research.google.com/github/juanknebel/text-mining-2020/blob/master/5_collocations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from nltk import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from gensim.models.phrases import Phrases, Phraser

In [0]:
import nltk
nltk.download('punkt')
nltk.download('reuters')
from nltk.corpus import reuters


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


In [0]:
documents = []
 
for fileid in reuters.fileids():
    documents.append(reuters.raw(fileid).lower())


In [0]:
documents[0]



In [0]:
tokens =  [w for doc in documents for w in word_tokenize(doc)]
tokens[:15]

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u.s.-japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u.s.',
 'and',
 'japan']

In [0]:
# Bi-gramas
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
# Filtra bigramas con frecuencia menor a 10
finder.apply_freq_filter(10)
# devuelve los "n" bigramas con mayor PMI
finder.nbest(bigram_measures.pmi, n = 50)

[('het', 'comite'),
 ('lago', 'agrio'),
 ('dar', 'es'),
 ('es', 'salaam'),
 ('hoare', 'govett'),
 ('corpus', 'christi'),
 ('paz', 'estenssoro'),
 ('corazon', 'aquino'),
 ('ay', 'expd-e'),
 ('lear', 'siegler'),
 ('l.f.', 'rothschild'),
 ('ranks', 'hovis'),
 ('hajime', 'tamura'),
 ('abu', 'dhabi'),
 ('poison', 'pill'),
 ('kleinwort', 'benson'),
 ('ind', 'ttl-f'),
 ('rjr', 'nabisco'),
 ('gates', 'learjet'),
 ('pro', 'forma'),
 ('margaret', 'thatcher'),
 ('carter', 'hawley'),
 ('canary', 'islands'),
 ('mcdonnell', 'douglas'),
 ('bra', 'kanon'),
 ('lord', 'abbett'),
 ('puerto', 'rico'),
 ('phelps', 'dodge'),
 ("'n", 'pak'),
 ('sao', 'paulo'),
 ('brace', 'jovanovich'),
 ('karl', 'otto'),
 ('marlin', 'fitzwater'),
 ('pizza', 'inn'),
 ('dean', 'witter'),
 ('buenos', 'aires'),
 ('costa', 'rica'),
 ('king', 'fahd'),
 ('del', 'este'),
 ('pl', '480'),
 ('hernandez', 'grisanti'),
 ('arturo', 'hernandez'),
 ('punta', 'del'),
 ('el', 'nino'),
 ('optional', 'origin'),
 ('du', 'pont'),
 ('drexel', 'bur

# Gensim

In [0]:
sentences =  [word_tokenize(sent) for sent in sent_tokenize("\n".join(documents).lower())]

In [0]:
sentences[:3]

[['asian',
  'exporters',
  'fear',
  'damage',
  'from',
  'u.s.-japan',
  'rift',
  'mounting',
  'trade',
  'friction',
  'between',
  'the',
  'u.s.',
  'and',
  'japan',
  'has',
  'raised',
  'fears',
  'among',
  'many',
  'of',
  'asia',
  "'s",
  'exporting',
  'nations',
  'that',
  'the',
  'row',
  'could',
  'inflict',
  'far-reaching',
  'economic',
  'damage',
  ',',
  'businessmen',
  'and',
  'officials',
  'said',
  '.'],
 ['they',
  'told',
  'reuter',
  'correspondents',
  'in',
  'asian',
  'capitals',
  'a',
  'u.s.',
  'move',
  'against',
  'japan',
  'might',
  'boost',
  'protectionist',
  'sentiment',
  'in',
  'the',
  'u.s.',
  'and',
  'lead',
  'to',
  'curbs',
  'on',
  'american',
  'imports',
  'of',
  'their',
  'products',
  '.'],
 ['but',
  'some',
  'exporters',
  'said',
  'that',
  'while',
  'the',
  'conflict',
  'would',
  'hurt',
  'them',
  'in',
  'the',
  'long-run',
  ',',
  'in',
  'the',
  'short-term',
  'tokyo',
  "'s",
  'loss',
  'm

In [0]:
sentences = [sent for sent in sentences if len(sent)>1]

In [0]:
collocations = Phrases(sentences=sentences, min_count=10,threshold=0.5,scoring='npmi') # threshold: minimo score aceptado

In [0]:
to_collocations = Phraser(collocations)

In [0]:
sent = 'new york is in united states of america. south africa and south america are in different continents'

In [0]:
to_collocations[word_tokenize(sent)]

['new_york',
 'is',
 'in',
 'united_states',
 'of',
 'america',
 '.',
 'south_africa',
 'and',
 'south',
 'america',
 'are',
 'in',
 'different',
 'continents']

In [0]:
df_collocations =pd.DataFrame([x for x in collocations.export_phrases(sentences)],columns=["bigram","score"])
df_collocations.shape

(113970, 2)

In [0]:
df_collocations.drop_duplicates().sort_values(by="score",ascending=False).head(50)

Unnamed: 0,bigram,score
39186,b'corpus christi',1.0
7608,b'lago agrio',1.0
8018,b'het comite',1.0
33044,b'bra kanon',1.0
1258,b'buenos aires',1.0
8,b'& lt',0.999734
170,b'lt ;',0.998407
5811,b'04/09/87 03/09/87',0.997678
22542,b'hoare govett',0.992659
8503,b'crazy eddie',0.992514
