## 1. Wikipedia Text

In [20]:
# The dataset provided (see the download link) is a compressed version of Wikipedia text.
# First download text8.zip as explained in download_links.md
import os
import zipfile
import numpy as np
import collections
import tensorflow as tf

In [14]:
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path (or file-like object) of the zip archive to open.

    Returns:
        A list of str: the content of the archive's first member, decoded
        as UTF-8 and split on runs of whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # A plain UTF-8 decode gives the same result as tf.compat.as_str on
    # bytes, without requiring TensorFlow just to load the corpus.
    return raw_bytes.decode("utf-8").split()

In [17]:
# `vocabulary` holds the tokenized corpus: every word of the document, in order
vocabulary = read_data(filename='text8.zip')

In [18]:
# show the first 5 words in the list
vocabulary[:5]

['anarchism', 'originated', 'as', 'a', 'term']

In [19]:
# Build the vocabulary index and the training sequence for a chosen vocabulary size
# (e.g. the 10,000 most frequent words).
# The training sequence is the list of words converted to integer indices.
def build_dataset(words, n_words):
    """Process raw inputs into a dataset.

    Index 0 is reserved for the 'UNK' placeholder; the ``n_words - 1`` most
    frequent remaining words receive indices 1, 2, ... in order of
    decreasing frequency. Every other word is encoded as 0.

    Args:
        words: Iterable of hashable tokens (typically str).
        n_words: Vocabulary size to keep, including the 'UNK' entry.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        data: list of int, one index per input word.
        count: ``[['UNK', unk_count], (word, frequency), ...]`` in index order.
        dictionary: dict mapping word -> index.
        reversed_dictionary: dict mapping index -> word.
    """
    unk_token = 'UNK'

    # The input is traversed twice (counting, then encoding), so a one-shot
    # iterator has to be materialized first or `data` would come out empty.
    if not isinstance(words, (list, tuple)):
        words = list(words)

    frequencies = collections.Counter(words)
    # A literal 'UNK' token in the corpus must not claim a second slot:
    # previously it re-assigned dictionary['UNK'] without growing the dict,
    # so the next word received the same index. Such tokens are now folded
    # into the reserved index 0.
    frequencies.pop(unk_token, None)

    count = [[unk_token, -1]]
    count.extend(frequencies.most_common(n_words - 1))

    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Out-of-vocabulary words fall back to index 0, i.e. dictionary['UNK'].
    data = [dictionary.get(word, 0) for word in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [21]:
# number of vocabulary entries to keep (the 'UNK' placeholder counts as one of them)
vocab_size = 10000
data, count, dictionary, reverse_dictionary = build_dataset(
    words=vocabulary,
    n_words=vocab_size
)

In [33]:
# vocabulary size as previously chosen 
len(dictionary)

10000

In [34]:
# show 10 (word, index) pairs from the vocabulary
list(dictionary.items())[:10]

[('hit', 1068),
 ('baptism', 3244),
 ('physical', 659),
 ('cross', 713),
 ('young', 527),
 ('byron', 6469),
 ('asking', 5590),
 ('dissolution', 6964),
 ('scales', 4805),
 ('baghdad', 7164)]

In [35]:
# total number of tokens in the corpus (length of the integer-encoded sequence)
len(data)

17005207

In [36]:
# integer (numerical) format as input of model
data[:5]

[5237, 3081, 12, 6, 195]

In [37]:
# reverse vocabulary: index -> word
list(reverse_dictionary.items())[:10]

[(0, 'UNK'),
 (1, 'the'),
 (2, 'of'),
 (3, 'and'),
 (4, 'one'),
 (5, 'in'),
 (6, 'a'),
 (7, 'to'),
 (8, 'zero'),
 (9, 'nine')]

In [40]:
# Reconstruct the word strings from the integer-encoded sequence
# (out-of-vocabulary words come back as 'UNK')
def indexToWords(vocab, data):
    """Look up every index of `data` in `vocab` and return the results as a list.

    Args:
        vocab: Mapping (or sequence) indexable by the items of `data`.
        data: Iterable of indices.

    Returns:
        A new list holding ``vocab[i]`` for each ``i`` in `data`, in order.
    """
    decoded = []
    for position in data:
        decoded.append(vocab[position])
    return decoded

In [41]:
# decode the integer sequence back into words via the index -> word mapping
strData = indexToWords(vocab=reverse_dictionary, data=data)

In [42]:
strData[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [44]:
import _pickle as cPickle
# save an object to a file in pickle format
def savePickle(dataToWrite, pickleFilename):
    """Serialize `dataToWrite` into the file `pickleFilename` with pickle.

    Args:
        dataToWrite: Any picklable object.
        pickleFilename: Path of the file to create or overwrite.
    """
    # The context manager closes the file even when dump() raises
    # (e.g. on an unpicklable object), so the handle is never leaked.
    with open(pickleFilename, 'wb') as f:
        cPickle.dump(dataToWrite, f)

In [45]:
# read an object back from a file in pickle format
def readPickle(pickleFilename):
    """Load and return the object stored in the pickle file `pickleFilename`.

    Args:
        pickleFilename: Path of a file containing pickled data.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the file even when load() raises
    # (e.g. on a truncated or corrupt file).
    with open(pickleFilename, 'rb') as f:
        return cPickle.load(f)

In [46]:
# Persist every preprocessing result as a pickle file so it can be reloaded
# later without repeating the preprocessing stage.
savePickle(dataToWrite=data, pickleFilename='wikipedia_trainset')                      # integer-encoded corpus
savePickle(dataToWrite=dictionary, pickleFilename='wikipedia_vocab')                   # word -> index
savePickle(dataToWrite=reverse_dictionary, pickleFilename='wikipedia_reversevocab')    # index -> word
savePickle(dataToWrite=count, pickleFilename='wikipedia_tf')                           # word frequencies

## 2. TED Talk 

In [7]:
# The dataset provided is the English transcript of TED Talk videos.
# A sample of the raw text follows:



```
<seekvideo id="835">Mark Twain summed up what I take to be</seekvideo>
<seekvideo id="2990">one of the fundamental problems of cognitive science</seekvideo>
<seekvideo id="6110">with a single witticism.</seekvideo>

```


## 3. PennTreeBank

In [9]:
# Unzip the downloaded file.
# The dataset is provided in data/; an example of the raw data follows:


```
he will continue to report to donald <unk> president and chief executive officer 
mr. stevens was executive vice president of this <unk> holding company 
arthur a. hatch N was named executive vice president of the company 
he was previously president of the company 's eastern edison co. unit 
```

## 4. IMDB

## 5. 20NewsGroups

## 6. Amazon Reviews

In [10]:
# For sentiment analysis:
# The dataset is provided with binary labels: negative (__label__1) and positive (__label__2)

```
__label__1 The Worst!: A complete waste of time. Typographical errors, poor grammar, and a totally pathetic plot add up to absolutely nothing. I'm embarrassed for this author and very disappointed I actually paid for this book.
__label__2 Great book: This was a great book,I just could not put it down,and could not read it fast enough. Boy what a book the twist and turns in this just keeps you guessing and wanting to know what is going to happen next. This book makes you fall in love and can heat you up,it can also make you so angery. this book can make you go throu several of your emotions. This is a quick read romance. It is something that you will want to end your day off with if you read at night.
```

## 7. Lingspam

## 8. Enron