# Downloading the data

In [None]:
%%bash
# Fetch and unpack the 20 Newsgroups "bydate" archive into ./data.
# The download is skipped when the extracted training directory already exists.
mkdir -p data
pushd data
if [ ! -d "20news-bydate-train" ]; then
  wget http://qwone.com/%7Ejason/20Newsgroups/20news-bydate.tar.gz
  tar -xvf 20news-bydate.tar.gz
  # The tarball is no longer needed once it has been extracted.
  rm 20news-bydate.tar.gz
else
  echo "The data has already been downloaded..."
fi
echo "Lets take a look at the groups..."
# One sub-directory per newsgroup.
ls 20news-bydate-train/
popd

In [None]:
ls -lah data/20news-bydate-train/sci.space | tail -n 5

In [None]:
!head data/20news-bydate-train/sci.space/61422 -n 20

# Loading and tokenizing the corpus

In [None]:
from glob import glob
import logging
import os
import re
import string

import funcy as fp
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
import nltk
import pandas as pd
from pattern.en import parse

In [None]:
# Write INFO-and-above records to logging.txt; filemode "w" truncates the
# file each time this configuration is applied.
logging.basicConfig(
    filename="logging.txt",
    filemode="w",
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# quick and dirty...
# Rough matcher for e-mail addresses in already lower-cased text.
EMAIL_REGEX = re.compile(r"[a-z0-9\.\+_-]+@[a-z0-9\._-]+\.[a-z]*")
# Matches every character that is NOT a-z, space, apostrophe or '#'.
FILTER_REGEX = re.compile(r"[^a-z '#]")
# (pattern, replacement) pairs, applied in this order: addresses are collapsed
# to the "#email" placeholder first, then all remaining disallowed characters
# become spaces ('#' is allowed so the placeholder survives).
TOKEN_MAPPINGS = [
    (EMAIL_REGEX, "#email"),
    (FILTER_REGEX, ' '),
]

# Tag prefixes kept by tokenize_line by default (presumably Penn Treebank
# noun / verb / adjective / adverb tags -- confirm against the tagger in use).
# Compiled once instead of on every call.
_ALLOWED_TAGS = re.compile('(NN|VB|JJ|RB)')


def tokenize_line(line, allowed_tags=_ALLOWED_TAGS, stopwords=frozenset(),
                  min_length=2, max_length=15):
    """Convert one line of raw text into a list of tagged lemmas.

    The line is lower-cased, every (regex, replacement) pair in
    TOKEN_MAPPINGS is applied in order, and the cleaned text is handed to
    ``parse`` with lemmatization enabled.  Each lemma that passes the filters
    is emitted as ``lemma + "/" + first two characters of its tag``, encoded
    as UTF-8.

    Parameters
    ----------
    line : str
        One line of text.
    allowed_tags : compiled regex, optional
        A token is kept only if this pattern matches at the start of its tag.
    stopwords : container of str, optional
        Lemmas to drop.  Empty by default.
    min_length, max_length : int, optional
        Inclusive bounds on the lemma length (before the tag suffix is added).

    Returns
    -------
    list
        The kept ``lemma/TAG`` tokens, in parse order; empty if nothing was
        parsed or nothing passed the filters.
    """
    cleaned = line.lower()
    for regexp, replacement in TOKEN_MAPPINGS:
        cleaned = regexp.sub(replacement, cleaned)

    # Assumes .split() yields a list of sentences, each a list of
    # (token, tag, lemma) triples -- that is what the unpacking below needs.
    sentences = parse(cleaned, tokenize=True, tags=False, chunks=False,
                      relations=False, lemmata=True).split()

    result = []
    # Walk every sentence.  Previously only the first one was used and any
    # further sentences on the same line were silently discarded.  An empty
    # parse simply produces no iterations.
    for sentence in sentences:
        for _token, tag, lemma in sentence:
            if not (min_length <= len(lemma) <= max_length):
                continue
            if lemma in stopwords:
                continue
            if allowed_tags.match(tag):
                result.append((lemma + "/" + tag[:2]).encode('utf8'))

    # Lazy %-formatting: same message, but only rendered if INFO is enabled.
    logging.info("That's how res looks %s", result)
    return result
    
def tokenize(lines, token_size_filter=2):
    """Tokenize every line and return the surviving tokens as one flat list.

    ``tokenize_line`` is applied to each element of ``lines`` and the
    per-line results are concatenated.  Only tokens whose length is strictly
    greater than ``token_size_filter`` are kept.

    NOTE(review): tokenize_line emits tokens with a "/XX" tag suffix on top
    of a lemma of at least two characters, so with the default threshold
    this filter looks like it never rejects anything -- confirm the intent.
    """
    kept = []
    for candidate in fp.mapcat(tokenize_line, lines):
        if len(candidate) > token_size_filter:
            kept.append(candidate)
    return kept
    

def load_doc(filename):
    """Read one document from disk and return it as a record dict.

    The file name is used as the document id and the name of its parent
    directory as the group.

    Parameters
    ----------
    filename : str
        Path of the form ``.../<group>/<doc_id>``.

    Returns
    -------
    dict
        ``{'group', 'doc', 'tokens', 'id'}`` where ``doc`` is the list of
        raw lines and ``tokens`` is ``tokenize(doc)``.
    """
    # os.path.split handles the running platform's separators, so this works
    # both with '/' (POSIX) and with '\\' or mixed separators (Windows).
    # Splitting on a literal backslash made the unpacking fail on POSIX paths.
    parent_dir, doc_id = os.path.split(filename)
    group = os.path.basename(parent_dir)
    # NOTE(review): open() uses the platform default encoding; confirm that
    # every corpus file decodes under it.
    with open(filename) as f:
        doc = f.readlines()
    logging.info("logging in %s in doc %s", group, doc_id)
    return {'group': group,
            'doc': doc,
            'tokens': tokenize(doc),
            'id': doc_id}


# glob() gives no ordering guarantee; sort so the row order is reproducible.
doc_paths = sorted(glob('data/20news-bydate-train/*/*'))
# One row per training document, indexed by (group, id).
docs = pd.DataFrame(list(map(load_doc, doc_paths))).set_index(['group','id'])

# save dataframe to csv file for later usage
# The target directory is not created by the download step, so make sure it
# exists before writing.
if not os.path.isdir("data/model"):
    os.makedirs("data/model")
docs.to_csv("data/model/docs.csv")

# Last expression of the cell so the preview is actually displayed.
docs.head()