In [100]:
import zipfile, os, nltk, json, gzip
from nltk.corpus import PlaintextCorpusReader
from collections import Counter

In [83]:
def convert_tuple_bigrams(tuples_to_convert):
    """Turn bigram tuples into space-separated strings.

    Args:
        tuples_to_convert: Iterable of indexable items; the first two
            elements of each item are used.

    Returns:
        A list with one ``'<first> <second>'`` string per input item,
        in input order.
    """
    # Index rather than unpack so that items longer than two elements are
    # still accepted (extra elements are ignored).
    return [f'{gram[0]} {gram[1]}' for gram in tuples_to_convert]

def convert_tuple_trigrams(tuples_to_convert):
    """Turn trigram tuples into space-separated strings.

    Args:
        tuples_to_convert: Iterable of indexable items; the first three
            elements of each item are used.

    Returns:
        A list with one ``'<first> <second> <third>'`` string per input
        item, in input order.
    """
    # Index rather than unpack so that items longer than three elements are
    # still accepted (extra elements are ignored).
    return [f'{gram[0]} {gram[1]} {gram[2]}' for gram in tuples_to_convert]

def convert_strings_to_counts(string_grams):
    """Count how many times each item occurs.

    Args:
        string_grams: Iterable of hashable items (n-gram strings here).

    Returns:
        A plain ``dict`` mapping each distinct item to its occurrence
        count, so the result can be serialised with ``json``.
    """
    return dict(Counter(string_grams))

In [None]:
# Extract Zip File of Texts
import zipfile, os

filename = './data/texts.zip'

# Catch only the failures this cell can sensibly explain. A bare `except:`
# would also swallow KeyboardInterrupt and report every problem (corrupt
# archive, permissions, typos in this cell) as a missing file.
try:
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(filename) as corpus_zip:
        corpus_zip.extractall('./data/')
    print('Zip file extracted successfully.')
except FileNotFoundError:
    print('No zip file detected. Upload your zip file to the data folder.')
except zipfile.BadZipFile:
    print(f'{filename} is not a valid zip file. Upload a valid zip file to the data folder.')

In [21]:
# Establish root folder holding all text files
# Create corpus using all text files in corpus_root
from nltk.corpus import PlaintextCorpusReader
corpus_root = './data/texts'
# The dot is escaped so that only names with a literal ".txt" extension are
# picked up; the unescaped '.*txt' would also accept any name that merely
# ends in the letters "txt".
corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')

In [80]:
# List the file IDs (text file names) found by the corpus reader and print
# them so the corpus contents can be checked by eye.
# `text_list` is reused by the n-gram cell below to iterate over the texts.
text_list = corpus.fileids()
print(f'Corpus created from: {text_list}')

Corpus created from: ['120-0.txt', '158-0.txt', '1952-0.txt', '3600-0.txt', '98-0.txt', 'pg514.txt']


In [98]:
# Compute unigrams, bigrams, trigrams, and wordCount for every text in the
# corpus, and write one JSON document per line to ./data/data.jsonl.

# The output file is opened once, in write mode, so that re-running this cell
# replaces the file instead of appending duplicate records to an earlier run.
with open('./data/data.jsonl', 'w') as outfile:
    for text in text_list:

        # Compute unigrams
        unigrams = corpus.words(text)
        unigramCount = convert_strings_to_counts(unigrams)

        # Compute bigrams
        tuple_bigrams = list(nltk.bigrams(unigrams))
        string_bigrams = convert_tuple_bigrams(tuple_bigrams)
        bigramCount = convert_strings_to_counts(string_bigrams)

        # Compute trigrams
        tuple_trigrams = list(nltk.trigrams(unigrams))
        string_trigrams = convert_tuple_trigrams(tuple_trigrams)
        trigramCount = convert_strings_to_counts(string_trigrams)

        # Calculate wordCount (number of tokens returned by corpus.words)
        wordCount = len(unigrams)

        # Create a dictionary `data` to hold each document's data.
        # A dict literal is used instead of update([...tuples...]): in the
        # tuple list a missing comma after the 'fullText' entry made Python
        # try to call that tuple, raising
        # "TypeError: 'tuple' object is not callable".
        data = {
            'id': text,
            'outputFormat': ['unigram', 'bigram', 'trigram', 'fullText'],
            'wordCount': wordCount,
            'fullText': 'placeholder',
            'unigramCount': unigramCount,
            'bigramCount': bigramCount,
            'trigramCount': trigramCount,
        }

        # Write the document to the json file, one document per line
        json.dump(data, outfile)
        outfile.write('\n')
        print(f'Text {text} written to json-l file.')

print('Process complete. All documents written to json-l file.')

# GZip dataset

# Context managers guarantee both handles are closed (and the gzip stream is
# flushed) even if the copy fails part-way through.
with open('./data/data.jsonl', 'rb') as f_in, \
        gzip.open('./data/data.jsonl.gz', 'wb') as f_out:
    f_out.writelines(f_in)

print('Dataset successfully compressed.')


Text 120-0.txt written to json-l file.
Text 158-0.txt written to json-l file.
Text 1952-0.txt written to json-l file.
Text 3600-0.txt written to json-l file.
Text 98-0.txt written to json-l file.
Text pg514.txt written to json-l file.
Process complete. All documents written to json-l file.
