# Develop Deep Learning Models for Natural Language in Python

In [1]:
import re
import os
import string
import nltk
from collections import Counter

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Chapter 9 - How to Prepare IMDB Reviews for Sentimental Analysis

In [2]:
def load_doc(filename):
  ''' Load a single .txt file given it's full path '''

  text = None
  with open(filename, 'r') as file:
    text = file.read()
  return text

def clean_doc(text):
  ''' Cleans a given text '''

  # Split into tokens
  tokens = text.split(' ')
  # Prepaer regex for string manipulation
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  # Remove punctuations
  tokens = [re_punc.sub('', token) for token in tokens]
  # Remove none-alphabetic tokens
  tokens = [token for token in tokens if token.isalpha()]
  # Filter out stopwords
  stop_words = set(nltk.corpus.stopwords.words('english'))
  tokens = [token for token in tokens if token not in stop_words]
  # Filter out short tokens
  tokens = [token for token in tokens if len(token) > 1]
  
  return tokens

def add_doc_to_vocab(filename, voacb):
  ''' Loads, cleans and then adds the text file to the vocabulary '''

  # Load file
  doc = load_doc(filename)
  # Clean file
  tokens = clean_doc(doc)
  # Update vocab
  vocab.update(tokens)

def process_docs(dir, vocab):
  ''' Load all .txt files within a directory given full path '''
  for filename in os.listdir(dir):
    if not filename.endswith('.txt'):
      break

    print(f'Processing document \'{filename}\'...')
    path = dir + '/' + filename
    add_doc_to_vocab(path, vocab)

In [3]:
def save_list(lines, filename):

  data = '\n'.join(lines)
  with open(filename, 'w') as file:
    file.write(data)

In [4]:
# Define vocabulary
vocab = Counter()

root = '/content/drive/MyDrive/NLP-In-Depth/Develop Deep Learning Models for Natural Language in Python/txt_sentoken'

# Process negative & positive files
process_docs(root + '/neg', vocab)
process_docs(root + '/pos', vocab)

# Print vocab summary
print('Vocabulary Size:', len(vocab))
print('50 Most Common Words:',  vocab.most_common(50))

Processing document 'cv000_29416.txt'...
Processing document 'cv001_19502.txt'...
Processing document 'cv002_17424.txt'...
Processing document 'cv003_12683.txt'...
Processing document 'cv004_12641.txt'...
Processing document 'cv005_29357.txt'...
Processing document 'cv006_17022.txt'...
Processing document 'cv007_4992.txt'...
Processing document 'cv008_29326.txt'...
Processing document 'cv009_29417.txt'...
Processing document 'cv010_29063.txt'...
Processing document 'cv011_13044.txt'...
Processing document 'cv012_29411.txt'...
Processing document 'cv013_10494.txt'...
Processing document 'cv014_15600.txt'...
Processing document 'cv015_29356.txt'...
Processing document 'cv016_4348.txt'...
Processing document 'cv017_23487.txt'...
Processing document 'cv018_21672.txt'...
Processing document 'cv019_16117.txt'...
Processing document 'cv020_9234.txt'...
Processing document 'cv021_17313.txt'...
Processing document 'cv022_14227.txt'...
Processing document 'cv023_13847.txt'...
Processing document

In [5]:
# Filter least commonly used words
min_occurance = 5

tokens = [k for k, c in vocab.items() if c >= min_occurance]
print('Vocab Size:', len(tokens))

Vocab Size: 14521


In [7]:
# Save vocabulary
save_list(tokens, root + '/vocab.txt')

In [9]:
def doc_to_line(filename, vocab):
  ''' Loads, cleans and returns a list of tokens that already exist in vocabulary '''

  doc = load_doc(filename)
  tokens = clean_doc(doc)
  tokens = [token for token in tokens if token in vocab]
  return ' '.join(tokens)

def process_line_docs(dir, vocab):
  ''' Load all .txt files within a directory given full path '''

  lines = []
  for filename in os.listdir(dir):
    if not filename.endswith('.txt'):
      break

    print(f'Processing document \'{filename}\'...')    

    path = dir + '/' + filename
    line = doc_to_line(path, vocab)
    lines.append(line)

  return lines

def save_lines(lines, filename):
  ''' Saves the lines into a file '''

  data = '\n'.join(lines)
  with open(filename, 'w') as file:
    file.write(data)

In [11]:
# Process negative review & store them
neg_lines = process_line_docs(root + '/neg', vocab)
save_lines(neg_lines, root + '/negative.txt')

# Process positive review & store them
pos_lines = process_line_docs(root + '/pos', vocab)
save_lines(pos_lines, root + '/positive.txt')

Processing document 'cv000_29416.txt'...
Processing document 'cv001_19502.txt'...
Processing document 'cv002_17424.txt'...
Processing document 'cv003_12683.txt'...
Processing document 'cv004_12641.txt'...
Processing document 'cv005_29357.txt'...
Processing document 'cv006_17022.txt'...
Processing document 'cv007_4992.txt'...
Processing document 'cv008_29326.txt'...
Processing document 'cv009_29417.txt'...
Processing document 'cv010_29063.txt'...
Processing document 'cv011_13044.txt'...
Processing document 'cv012_29411.txt'...
Processing document 'cv013_10494.txt'...
Processing document 'cv014_15600.txt'...
Processing document 'cv015_29356.txt'...
Processing document 'cv016_4348.txt'...
Processing document 'cv017_23487.txt'...
Processing document 'cv018_21672.txt'...
Processing document 'cv019_16117.txt'...
Processing document 'cv020_9234.txt'...
Processing document 'cv021_17313.txt'...
Processing document 'cv022_14227.txt'...
Processing document 'cv023_13847.txt'...
Processing document