# Downloading the data

In [3]:
%%bash
# Fetch the 20 Newsgroups "bydate" archive into ./data unless the training
# directory is already present there, then list the newsgroup folders.
#
# Abort on the first failing command. Without this, a failed download or
# extraction goes unnoticed because the script's exit status is that of the
# final `popd`, and the archive is removed even if extraction failed.
set -e
mkdir -p data
pushd data
if [ -d "20news-bydate-train" ]
then
  echo "The data has already been downloaded..."
else
  wget http://qwone.com/%7Ejason/20Newsgroups/20news-bydate.tar.gz
  tar xfv 20news-bydate.tar.gz
  # Only reached when extraction succeeded (see `set -e` above).
  rm 20news-bydate.tar.gz
fi
echo "Lets take a look at the groups..."
ls 20news-bydate-train/
popd

~/Documents/20NewsGroupVisualization/preprocessing/data ~/Documents/20NewsGroupVisualization/preprocessing
The data has already been downloaded...
Lets take a look at the groups...
alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc
~/Documents/20NewsGroupVisualization/preprocessing


In [6]:
# Show the last five entries of the sci.space training folder.
# Uses an explicit `!` shell escape (like the `!head` cell below) instead of
# relying on IPython's automagic `ls` alias being enabled and unshadowed.
!ls -lah data/20news-bydate-train/sci.space | tail -n 5

-rw-r--r--  1 peter peter 1.5K Mar 18  2003 61250
-rw-r--r--  1 peter peter  889 Mar 18  2003 61252
-rw-r--r--  1 peter peter 1.2K Mar 18  2003 61264
-rw-r--r--  1 peter peter 1.7K Mar 18  2003 61308
-rw-r--r--  1 peter peter 1.4K Mar 18  2003 61422


In [7]:
# Print the first 20 lines of one sample post.
# Options are placed before the file operand: putting `-n 20` after the file
# name is accepted by GNU head but not by every head implementation.
!head -n 20 data/20news-bydate-train/sci.space/61422

From: ralph.buttigieg@f635.n713.z3.fido.zeta.org.au (Ralph Buttigieg)
Subject: Why not give $1 billion to first year-lo
Organization: Fidonet. Gate admin is fido@socs.uts.edu.au
Lines: 34

Original to: keithley@apple.com
G'day keithley@apple.com

21 Apr 93 22:25, keithley@apple.com wrote to All:

 kc> keithley@apple.com (Craig Keithley), via Kralizec 3:713/602


 kc> But back to the contest goals, there was a recent article in AW&ST
about a
 kc> low cost (it's all relative...) manned return to the moon.  A General
 kc> Dynamics scheme involving a Titan IV & Shuttle to lift a Centaur upper
 kc> stage, LEV, and crew capsule.  The mission consists of delivering two
 kc> unmanned payloads to the lunar surface, followed by a manned mission.
 kc> Total cost:  US was $10-$13 billion.  Joint ESA(?)/NASA project was


# Loading and tokenizing the corpus

In [8]:
import io
import os
import re
import string
from glob import glob

import funcy as fp
import nltk
import pandas as pd
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

In [12]:
# Quick-and-dirty normalisation rules, applied to text that has already been
# lower-cased.

# Recognises e-mail addresses; the part after the final dot may be empty.
EMAIL_REGEX = re.compile(r"[a-z0-9\.\+_-]+@[a-z0-9\._-]+\.[a-z]*")

# Matches every character that is not a-z, a space, an apostrophe or '#'.
FILTER_REGEX = re.compile(r"[^a-z '#]")

# Ordered (pattern, replacement) pairs: addresses collapse to "#email" first,
# then any remaining unwanted character is replaced by a space.
TOKEN_MAPPINGS = [
    (EMAIL_REGEX, "#email"),
    (FILTER_REGEX, " "),
]

def tokenize_line(line):
    """Turn one line of text into a list of whitespace-separated tokens.

    The line is lower-cased, every (pattern, replacement) pair in
    TOKEN_MAPPINGS is applied in order, and the result is split on
    whitespace.
    """
    text = line.lower()
    for pattern, substitute in TOKEN_MAPPINGS:
        text = pattern.sub(substitute, text)
    pieces = text.split()
    return pieces
    
def tokenize(lines, token_size_filter=2):
    """Tokenize every line and keep only the sufficiently long tokens.

    Args:
        lines: iterable of text lines, each passed to tokenize_line().
        token_size_filter: tokens are kept only when their length is
            strictly greater than this value.

    Returns:
        A flat list of the kept tokens, in order of appearance.
    """
    kept = []
    for line in lines:
        for token in tokenize_line(line):
            if len(token) > token_size_filter:
                kept.append(token)
    return kept
    

def load_doc(filename):
    """Read one newsgroup post and return it as a record dict.

    The last two components of ``filename`` are taken as the group name
    (parent directory) and the document id (file name).

    Returns:
        dict with keys
            'group'  -- name of the file's parent directory
            'doc'    -- list of the file's lines
            'tokens' -- result of tokenize() on those lines
            'id'     -- the file's base name
    """
    # os.path understands the platform's separators; splitting on '/' alone
    # mislabels paths that contain backslashes (as glob yields on Windows).
    directory, doc_id = os.path.split(filename)
    group = os.path.basename(directory)

    # Decode with an explicit encoding instead of the locale default.
    # latin-1 maps every byte to a character, so reading cannot fail with
    # UnicodeDecodeError on posts that are not valid in the default encoding.
    # io.open is used so the `encoding` argument is accepted regardless of
    # the interpreter version.
    with io.open(filename, encoding='latin-1') as f:
        doc = f.readlines()

    return {'group': group,
            'doc': doc,
            'tokens': tokenize(doc),
            'id': doc_id}


# Load every training post into one DataFrame indexed by (group, id).
docs = (
    pd.DataFrame([load_doc(path) for path in glob('data/20news-bydate-train/*/*')])
    .set_index(['group', 'id'])
)
docs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc,tokens
group,id,Unnamed: 2_level_1,Unnamed: 3_level_1
rec.sport.baseball,104363,[From: re4@prism.gatech.EDU (RUSSELL EARNEST)\...,"[from, #email, russell, earnest, subject, play..."
rec.sport.baseball,102652,[From: niepornt@phoenix.Princeton.EDU (David M...,"[from, #email, david, marc, nieporent, subject..."
rec.sport.baseball,104503,[From: stlouis@unixg.ubc.ca (Phill St. Louis)\...,"[from, #email, phill, louis, subject, billy, t..."
rec.sport.baseball,104411,[From: krattige@hpcc01.corp.hp.com (Kim Kratti...,"[from, #email, kim, krattiger, subject, kevin,..."
rec.sport.baseball,102659,"[From: carrd@iccgcc.decnet.ab.com\n, Subject: ...","[from, #email, subject, david, wells, lines, h..."


# Creating the dictionary and bag-of-words corpus

In [13]:
def nltk_stopwords():
    """Return NLTK's English stopword list as a set."""
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)

def prep_corpus(docs, additional_stopwords=frozenset(), no_below=5, no_above=0.5):
    """Build a gensim Dictionary and a bag-of-words corpus from tokenized docs.

    Args:
        docs: collection of token lists. It is passed to Dictionary() and
            then iterated a second time to build the corpus, so it must be
            re-iterable (a one-shot generator would give an empty corpus).
        additional_stopwords: extra tokens to remove on top of the NLTK
            English stopwords.
        no_below: forwarded to Dictionary.filter_extremes.
        no_above: forwarded to Dictionary.filter_extremes.

    Returns:
        (dictionary, corpus), where corpus is a list holding
        dictionary.doc2bow(doc) for every document.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    # Look up only the stopwords that actually occur in the dictionary;
    # token2id.get would hand None to filter_tokens for all the others.
    token2id = dictionary.token2id
    stopword_ids = [token2id[word] for word in stopwords if word in token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

In [14]:
# Build the dictionary and bag-of-words corpus from the 'tokens' column,
# using prep_corpus' default stopword and frequency settings.
dictionary, corpus = prep_corpus(docs['tokens'])

Building dictionary...
Building corpus...


# Save the dictionary and the corpus

In [15]:
# Write the bag-of-words corpus and the dictionary to the working directory.
# NOTE(review): 'newgroups.dict' is missing the 's' used in 'newsgroups.mm'
# and 'newsgroups_50.model' -- looks like a typo; confirm which name any
# downstream loader expects before renaming.
MmCorpus.serialize('newsgroups.mm', corpus)
dictionary.save('newgroups.dict')

# Fitting the LDA model

In [16]:
%%time
# Fit a 50-topic LDA model with 10 passes over the corpus and save it.
# random_state pins the model's random initialisation so that re-running the
# notebook reproduces the same topics; without it every run differs.
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10,
                               random_state=0)
lda.save('newsgroups_50.model')

CPU times: user 1h 12min 8s, sys: 2min 10s, total: 1h 14min 18s
Wall time: 15min 48s
