In [4]:
import nltk
from nltk.corpus import reuters, stopwords
from nltk.stem import PorterStemmer
from nltk import bigrams, FreqDist
from nltk.tokenize import word_tokenize 
from nltk import bigrams

In [5]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [6]:
# get words
words = reuters.words()

print("Original Words:", words[:20])

Original Words: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.']


In [7]:
# delete non alphabetic(numbers, sign)
words = [word for word in words if word.isalpha()]

print("After Removing Special Characters:", words[:20])

After Removing Special Characters: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', 'S', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', 'S', 'And', 'Japan', 'has', 'raised']


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# delete stop words
# Stop words are like "is","the" ...
stop_words = set(stopwords.words('english'))

# remove one character word !!
words = [word for word in words if word.lower() not in stop_words and len(word) > 1]

print("After Removing Stop Words:", words[:20])

After Removing Stop Words: ['asian', 'export', 'fear', 'damag', 'japan', 'rift', 'mount', 'trade', 'friction', 'japan', 'rai', 'fear', 'among', 'mani', 'asia', 'export', 'nation', 'row', 'could', 'inflict']


In [35]:
# stemming --> warning: warn 
stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]

print("After Stemming:", words[:20])

After Stemming: ['asian', 'export', 'fear', 'damag', 'japan', 'rift', 'mount', 'trade', 'friction', 'japan', 'rai', 'fear', 'among', 'mani', 'asia', 'export', 'nation', 'row', 'could', 'inflict']


In [36]:
# nltk.download('punkt')

In [37]:
# nltk.download('punkt_tab')

In [38]:
# tokenization
tokens = word_tokenize(" ".join(words))

print("After Tokenization:", tokens[:20])

After Tokenization: ['asian', 'export', 'fear', 'damag', 'japan', 'rift', 'mount', 'trade', 'friction', 'japan', 'rai', 'fear', 'among', 'mani', 'asia', 'export', 'nation', 'row', 'could', 'inflict']


In [39]:
# generate bigrams
def calculate_bigrams(tokens):
    bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
    return bigrams

_bigrams = calculate_bigrams(tokens)
print("bigrams:", _bigrams[:20])

bigrams: [('asian', 'export'), ('export', 'fear'), ('fear', 'damag'), ('damag', 'japan'), ('japan', 'rift'), ('rift', 'mount'), ('mount', 'trade'), ('trade', 'friction'), ('friction', 'japan'), ('japan', 'rai'), ('rai', 'fear'), ('fear', 'among'), ('among', 'mani'), ('mani', 'asia'), ('asia', 'export'), ('export', 'nation'), ('nation', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far')]


In [44]:
# Calculate the frequency of bigrams

# count(w1,w2)
ferq_dict = {} 

for i in range(len(words) - 1):
    key = (words[i], words[i+1])
    if ferq_dict.get(key):
        ferq_dict[key] += 1
    else:
        ferq_dict[key] = 1
        
# ---------------------------------------------
#         if not temp in ferq_dict:
#             ferq_dict[key] = 1
#         else: 
#             ferq_dict[key] += 1
# ---------------------------------------------
        
        
# bigrams frequencies
bigrams_with_freq = [(bigram, freq) for bigram, freq in ferq_dict.items()]

print("Bigrams_frequencies:\n", bigrams_with_freq[:20])
print("-" * 100)  
print("-" * 100) 
print("Most Common Bigrams_frequencies:\n", bigram_freq.most_common(50))


Bigrams_frequencies:
 [(('asian', 'export'), 1), (('export', 'fear'), 3), (('fear', 'damag'), 1), (('damag', 'japan'), 2), (('japan', 'rift'), 1), (('rift', 'mount'), 1), (('mount', 'trade'), 4), (('trade', 'friction'), 20), (('friction', 'japan'), 2), (('japan', 'rai'), 2), (('rai', 'fear'), 6), (('fear', 'among'), 2), (('among', 'mani'), 2), (('mani', 'asia'), 1), (('asia', 'export'), 1), (('export', 'nation'), 5), (('nation', 'row'), 1), (('row', 'could'), 1), (('could', 'inflict'), 1), (('inflict', 'far'), 1)]
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Most Common Bigrams_frequencies:
 [(('mln', 'dlr'), 4838), (('vs', 'mln'), 3947), (('mln', 'vs'), 3921), (('ct', 'vs'), 3386), (('ct', 'net'), 2247), (('vs', 'ct'), 1934), (('billion', 'dlr'), 1914), (('vs', 'loss'), 1785), (('rev', 'mln'), 1619), (('net', 'vs'), 1579), (('shr'

In [56]:
from collections import defaultdict

# p(w2 | w1)= count(w1,w2) / count (w1)

# Calc  unigram frequencies =  count (w1)
unigram_freq = defaultdict(int)
for word in words:
    unigram_freq[word] += 1

# Calc bigram probabilities 
bigram_prob_matrix = {}

for bigram, freq_count in ferq_dict.items():
    w1 = bigram[0]
    bigram_prob_matrix[bigram] = freq_count / unigram_freq[w1]  # P(w2 | w1)


for bigram, prob in list(bigram_prob_matrix.items())[:20]:
    print(f"Bigram: {bigram}, Probability: {prob:.4f}")


Bigram: ('asian', 'export'), Probability: 0.0154
Bigram: ('export', 'fear'), Probability: 0.0011
Bigram: ('fear', 'damag'), Probability: 0.0062
Bigram: ('damag', 'japan'), Probability: 0.0076
Bigram: ('japan', 'rift'), Probability: 0.0004
Bigram: ('rift', 'mount'), Probability: 0.3333
Bigram: ('mount', 'trade'), Probability: 0.0930
Bigram: ('trade', 'friction'), Probability: 0.0049
Bigram: ('friction', 'japan'), Probability: 0.0625
Bigram: ('japan', 'rai'), Probability: 0.0007
Bigram: ('rai', 'fear'), Probability: 0.0055
Bigram: ('fear', 'among'), Probability: 0.0124
Bigram: ('among', 'mani'), Probability: 0.0069
Bigram: ('mani', 'asia'), Probability: 0.0029
Bigram: ('asia', 'export'), Probability: 0.0135
Bigram: ('export', 'nation'), Probability: 0.0018
Bigram: ('nation', 'row'), Probability: 0.0006
Bigram: ('row', 'could'), Probability: 0.0081
Bigram: ('could', 'inflict'), Probability: 0.0007
Bigram: ('inflict', 'far'), Probability: 1.0000


In [57]:
bigram_prob_matrix

{('asian', 'export'): 0.015384615384615385,
 ('export', 'fear'): 0.0010893246187363835,
 ('fear', 'damag'): 0.006211180124223602,
 ('damag', 'japan'): 0.0076045627376425855,
 ('japan', 'rift'): 0.000350385423966363,
 ('rift', 'mount'): 0.3333333333333333,
 ('mount', 'trade'): 0.09302325581395349,
 ('trade', 'friction'): 0.004943153732081068,
 ('friction', 'japan'): 0.0625,
 ('japan', 'rai'): 0.000700770847932726,
 ('rai', 'fear'): 0.00554016620498615,
 ('fear', 'among'): 0.012422360248447204,
 ('among', 'mani'): 0.006896551724137931,
 ('mani', 'asia'): 0.0028653295128939827,
 ('asia', 'export'): 0.013513513513513514,
 ('export', 'nation'): 0.0018155410312273058,
 ('nation', 'row'): 0.0006480881399870382,
 ('row', 'could'): 0.008130081300813009,
 ('could', 'inflict'): 0.0006811989100817438,
 ('inflict', 'far'): 1.0,
 ('far', 'reach'): 0.008928571428571428,
 ('reach', 'econom'): 0.00196078431372549,
 ('econom', 'damag'): 0.0009115770282588879,
 ('damag', 'businessmen'): 0.003802281368821

In [51]:
# calculate bigrams (w1,w2)- ready nltk class 

bigrams_list2 = list(bigrams(words))
print("Bigrams:", bigrams_list2[:10])  

Bigrams: [('asian', 'export'), ('export', 'fear'), ('fear', 'damag'), ('damag', 'japan'), ('japan', 'rift'), ('rift', 'mount'), ('mount', 'trade'), ('trade', 'friction'), ('friction', 'japan'), ('japan', 'rai')]


In [52]:
# calculate bigrams (w1,w2)- ready nltk class 

bigram_freq2 = FreqDist(bigrams_list2)
print("Most Common Bigrams:", bigram_freq2.most_common(50)) 

Most Common Bigrams: [(('mln', 'dlr'), 4925), (('vs', 'mln'), 3947), (('mln', 'vs'), 3921), (('ct', 'vs'), 3386), (('ct', 'net'), 2247), (('billion', 'dlr'), 2005), (('vs', 'ct'), 1934), (('vs', 'loss'), 1785), (('rev', 'mln'), 1619), (('net', 'vs'), 1579), (('shr', 'ct'), 1480), (('inc', 'lt'), 1401), (('compani', 'said'), 1379), (('last', 'year'), 1368), (('dlr', 'vs'), 1261), (('corp', 'lt'), 1129), (('net', 'shr'), 1079), (('avg', 'shr'), 1055), (('per', 'share'), 1031), (('vs', 'rev'), 1002), (('vs', 'profit'), 997), (('net', 'loss'), 955), (('loss', 'ct'), 919), (('inc', 'said'), 913), (('mln', 'stg'), 899), (('shr', 'loss'), 874), (('qtr', 'net'), 829), (('mln', 'tonn'), 827), (('mln', 'note'), 780), (('vs', 'dlr'), 770), (('net', 'profit'), 760), (('offici', 'said'), 754), (('dlr', 'per'), 731), (('corp', 'said'), 724), (('sale', 'mln'), 687), (('dlr', 'net'), 680), (('sourc', 'said'), 677), (('shr', 'vs'), 673), (('loss', 'vs'), 669), (('nine', 'mth'), 658), (('dlr', 'share'),