### Preprocessing 20 Newsgroup Data

This notebook preprocesses the 20 Newsgroup posts and pickles the train/validation/test samples and labels for use in other notebooks:

* Embedding Layer
* GloVe



In [1]:
import numpy as np
import os
import pathlib

# Root folder of the extracted 20 Newsgroup dataset.
data_dir = pathlib.Path('data/20_newsgroup')

# Count only real sub-directories so that stray files in data_dir
# (e.g. OS or editor metadata) do not inflate the reported figure.
# Sorting gives a stable order regardless of the filesystem.
dirnames = sorted(entry.name for entry in data_dir.iterdir() if entry.is_dir())
print('Number of directories:', len(dirnames))

# Spot-check a single newsgroup folder to see how many posts it holds.
fnames = os.listdir(data_dir / 'comp.graphics')
print('Number of files in comp.graphics:', len(fnames))

Number of directories: 20
Number of files in comp.graphics: 1000


In [2]:
# clean up files
#
# Read every post, drop its leading lines, and label it with the index of the
# newsgroup directory it came from.
#
# Produces:
#   samples     -- list of post bodies (str), one per file
#   labels      -- list of int class indices, aligned with `samples`
#   class_names -- list of directory names; class_names[i] is the name of class i

# Number of leading lines removed from each post
# (presumably the news header block -- TODO confirm against the raw files).
HEADER_LINES = 10

samples = []
labels = []
class_names = []
class_index = 0

# Only descend into real directories: a stray file in data_dir would otherwise
# make os.listdir() raise. Sorting fixes the class-index assignment.
for dirname in sorted(entry.name for entry in data_dir.iterdir() if entry.is_dir()):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the sample order -- and therefore any later seeded
    # shuffle/split -- reproducible across machines.
    fnames = sorted(os.listdir(dirpath))
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        # read_text() opens and closes the file itself, so no handle is leaked.
        # latin-1 maps every byte to a character, so decoding cannot fail.
        content = fpath.read_text(encoding="latin-1")
        lines = content.split("\n")[HEADER_LINES:]
        samples.append("\n".join(lines))
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

#### Shuffle the data and split into train/validation/test sets

In [4]:
# Shuffle the data
#
# A single permutation is drawn and applied to both lists, so samples and
# labels stay aligned by construction. New lists are built instead of
# shuffling `samples`/`labels` in place, which keeps this cell idempotent:
# re-running it yields the same split rather than re-shuffling shuffled data.
seed = 1337
assert len(samples) == len(labels), "samples and labels must have the same length"
rng = np.random.RandomState(seed)
shuffle_order = rng.permutation(len(samples))
shuffled_samples = [samples[i] for i in shuffle_order]
shuffled_labels = [labels[i] for i in shuffle_order]

# Divide into train/validation/test
split_1_portion = 0.2  # first 20% for validation
split_2_portion = 0.8  # last 20% for test
# int() truncates, so the boundaries round down to whole sample counts.
split_1 = int(split_1_portion * len(shuffled_samples))
split_2 = int(split_2_portion * len(shuffled_samples))

val_samples = shuffled_samples[:split_1]
val_labels = shuffled_labels[:split_1]

# The middle 60% is the training set.
train_samples = shuffled_samples[split_1:split_2]
train_labels = shuffled_labels[split_1:split_2]

test_samples = shuffled_samples[split_2:]
test_labels = shuffled_labels[split_2:]

# Every sample must land in exactly one split.
assert len(train_labels) + len(val_labels) + len(test_labels) == len(labels)

print('sizes of train, validation, test:', len(train_labels), len(val_labels), len(test_labels))

sizes of train, validation, test: 11998 3999 4000


#### Pickle everything for later notebooks

In [7]:
import pickle

# Output file -> object to store. Dicts preserve insertion order, so the
# files are written in the order listed here.
pickle_targets = {
    'data/train_samples.pkl': train_samples,
    'data/train_labels.pkl': train_labels,
    'data/val_samples.pkl': val_samples,
    'data/val_labels.pkl': val_labels,
    'data/test_samples.pkl': test_samples,
    'data/test_labels.pkl': test_labels,
}

for pkl_path, payload in pickle_targets.items():
    # The context manager flushes and closes each file deterministically,
    # rather than leaving the handle open until garbage collection.
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(payload, pkl_file)