In [2]:
!pip install nltk



In [3]:
!pip install utils



In [4]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
vocabulary_size = 5000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"


In [6]:
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/kenny/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_t

True

In [9]:
with open('reddit-comments-2015-08.csv', 'r', newline='', encoding='utf-8') as f:
    # Initalize a reader object
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row
    next(reader)  
    # Split full comments into sentences  - [nltk.sent_tokenize(x[0].lower()) for x in reader] - for the paragraph x[0] from the csv file, make it lowercase and tokenize all sentence
    # For all pararaphs in the csv file. * operator unpacks the list into individual sentences, and creates a single iterable
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    # Replace all sentence x in sentences with the start token, sentence body, and text token"
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    print(sentences[1:10])
print (f"Parsed {len(sentences)} sentences.")

["SENTENCE_START it's a slight ppr league- .2 ppr. SENTENCE_END", 'SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END', 'SENTENCE_START my question is, is it wildly clear that qb has the highest potential for points? SENTENCE_END', 'SENTENCE_START i put in the rules at a ranking site and noticed that top qbs had 300 points more than the top rb/wr. SENTENCE_END', 'SENTENCE_START would it be dumb not to grab a qb in the first round? SENTENCE_END', 'SENTENCE_START in your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon. SENTENCE_END', "SENTENCE_START there's no way to enforce it. SENTENCE_END", "SENTENCE_START an honest seller is going to not sell the gun to them when they see they're a felon on the background check. SENTENCE_END", "SENTENCE_START a dishonest seller isn't going to run the check in the

In [10]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [11]:
print(tokenized_sentences[1])

['SENTENCE_START', 'it', "'s", 'a', 'slight', 'ppr', 'league-', '.2', 'ppr', '.', 'SENTENCE_END']


In [12]:
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print (f"Found { len(word_freq.items()) } unique words tokens." )

Found 63023 unique words tokens.


In [13]:
# Get 7999 most common words
vocab = word_freq.most_common(vocabulary_size-1)
print("Vocab:")
print(vocab[1:20])
index_to_word = [ x[0] for x in vocab ]
# unknown_token = "UNKNOWN_TOKEN"
index_to_word.append(unknown_token)
print(index_to_word[-1])
print("Index to word:")
print(index_to_word[1:10])
# index_to_word is a list of 8000 words ['word1', 'word2']
# enumerate is an object that generates index value pairs in that order
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
# Input a word, it goes into a dictionary, that gets translated to the index of the dictionary of index_to_word
# This allows us to represent words as numbers
print(word_to_index["SENTENCE_END"])
print(word_to_index["UNKNOWN_TOKEN"])
print(word_to_index["apple"])

Vocab:
[('SENTENCE_END', 79184), ('.', 67334), ('the', 52419), (',', 52137), ('to', 35576), ('i', 32614), ('a', 31777), ('and', 30055), ('of', 23232), ('you', 22457), ('it', 22353), ('that', 19334), ('is', 18196), ('in', 16944), ('*', 14955), ('for', 12541), ("n't", 11784), ("'s", 11771), (')', 11409)]
UNKNOWN_TOKEN
Index to word:
['SENTENCE_END', '.', 'the', ',', 'to', 'i', 'a', 'and', 'of']
1
4999
1371


In [14]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    # Change all words not in word_to_index to unknown_token
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print(tokenized_sentences[1:20])
word_to_index["it"]

[['SENTENCE_START', 'it', "'s", 'a', 'slight', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', 'UNKNOWN_TOKEN', 'points', 'per', 'UNKNOWN_TOKEN', ',', '6', 'points', 'per', 'UNKNOWN_TOKEN', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'UNKNOWN_TOKEN', 'clear', 'that', 'qb', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'ranking', 'site', 'and', 'noticed', 'that', 'top', 'qbs', 'had', '300', 'points', 'more', 'than', 'the', 'top', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'would', 'it', 'be', 'dumb', 'not', 'to', 'grab', 'a', 'qb', 'in', 'the', 'first', 'round', '?', 'SENTENCE_END'], ['SENTENCE_START', 'in', 'your', 'scenari

11

In [15]:
# Create the training data
X_train = np.array( [ [word_to_index[w] for w in sent[:-1] ]  for sent in tokenized_sentences ] )
y_train = np.array([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (79184,) + inhomogeneous part.

In [16]:
X_train_list = []
y_train_list = []

# Iterate over tokenized_sentences
for sent in tokenized_sentences:
    X_row = []
    y_row = []
    
    # Iterate over words in the sentence
    for w in sent[:-1]:
        X_row.append(word_to_index[w])
    
    for w in sent[1:]:
        y_row.append(word_to_index[w])
    
    X_train_list.append(X_row)
    y_train_list.append(y_row)

# Convert lists to NumPy arrays
X_train = np.array(X_train_list)
y_train = np.array(y_train_list)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (79184,) + inhomogeneous part.

In [19]:
# Initialize empty lists to store X_train and y_train
X_train = []
y_train = []

# Iterate over tokenized_sentences
for sent in tokenized_sentences:
    X_row = []
    y_row = []
    
    # Iterate over words in the sentence
    for w in sent[:-1]:
        X_row.append(word_to_index.get(w, 0))
    
    for w in sent[1:]:
        y_row.append(word_to_index.get(w, 0))
    
    X_train.append(X_row)
    y_train.append(y_row)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (79184,) + inhomogeneous part.

In [25]:
# Print an training data example
x_example, y_example = X_train[17], y_train[17]
print(f"x:\n{' '.join([index_to_word[x] for x in x_example])}\n{x_example}")
print(f"y:\n{' '.join([index_to_word[x] for x in y_example])}\n{y_example}")

print(X_train[1:3])
print(y_train[1:3])

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 52, 28, 17, 10, 858, 55, 26, 35, 70]
y:
what are n't you understanding about this ? ! SENTENCE_END
[52, 28, 17, 10, 858, 55, 26, 35, 70, 1]
[[0, 11, 18, 7, 3030, 4999, 4999, 4999, 4999, 2], [0, 981, 1496, 221, 600, 16, 773, 3414, 2967, 4, 4999, 600, 471, 4999, 4, 435, 600, 471, 4999, 2722, 4, 8, 72, 4959, 16, 4999, 4999, 2]]
[[11, 18, 7, 3030, 4999, 4999, 4999, 4999, 2, 1], [981, 1496, 221, 600, 16, 773, 3414, 2967, 4, 4999, 600, 471, 4999, 4, 435, 600, 471, 4999, 2722, 4, 8, 72, 4959, 16, 4999, 4999, 2, 1]]
