# Build the dataset

---

## Define filepaths

In [1]:
# Base directory that the other relative paths in this notebook are joined onto.
root_dir = '../..'
# Corpus folder, relative to root_dir; the JSON files produced below are written under it.
data_dir = 'data/corpus'
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
newsgroups_dir = 'newsgroups'

---

## Load the dataset

### Fetch the dataset using sklearn

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Request the 'all' subset with headers, footers and quoted replies removed from each post.
# random_state is pinned for reproducibility (presumably it fixes the shuffle order —
# confirm against the scikit-learn documentation).
newsgroups_dataset = fetch_20newsgroups(subset='all',
                                        remove=('headers', 'footers', 'quotes'),
                                        random_state=3)

### Build a DataFrame out of the data

In [4]:
import pandas as pd

In [5]:
# One row per document: the post body and its numeric newsgroup label.
newsgroups_df = pd.DataFrame(
    {
        'text': newsgroups_dataset.data,
        'label': newsgroups_dataset.target,
    }
)

In [6]:
newsgroups_df.head()

Unnamed: 0,text,label
0,I have a mac plus with 2.5MB RAM. I have just ...,4
1,I was wondering if anyone knows of a graphics ...,1
2,----- News saved at 23 Apr 93 22:22:40 GMT\n ...,14
3,\nNot everyone should be trusted with tools. ;...,7
4,"\n\n\nWell, most hangings are very quick and, ...",0


In [7]:
newsgroups_df.shape

(18846, 2)

In [8]:
newsgroups_df['label'].value_counts()

10    999
15    997
8     996
9     994
11    991
13    990
7     990
5     988
14    987
2     985
12    984
3     982
6     975
1     973
4     963
17    940
16    910
0     799
18    775
19    628
Name: label, dtype: int64

## Make the text lowercase

In [9]:
# Lowercase every document through the vectorized string accessor.
newsgroups_df['text'] = newsgroups_df['text'].str.lower()

In [10]:
newsgroups_df.head()

Unnamed: 0,text,label
0,i have a mac plus with 2.5mb ram. i have just ...,4
1,i was wondering if anyone knows of a graphics ...,1
2,----- news saved at 23 apr 93 22:22:40 gmt\n ...,14
3,\nnot everyone should be trusted with tools. ;...,7
4,"\n\n\nwell, most hangings are very quick and, ...",0


---

## Preprocess data

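Remove leading and trailing asterisks from every whitespace-separated token of each text. As a side effect, runs of whitespace (including newlines) are collapsed into single spaces.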

In [11]:
def clear_asterisks(text):
    """Strip asterisks from both ends of every whitespace-separated token.

    Tokens that consist only of asterisks are dropped. The surviving tokens
    are re-joined with single spaces, so any run of whitespace (including
    newlines) is collapsed. Asterisks in the middle of a token are kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if no token survives.
    """
    stripped_tokens = (word.strip('*') for word in text.split())
    return ' '.join(word for word in stripped_tokens if word)

In [12]:
# Clean every document; the function is passed directly, no wrapper needed.
newsgroups_df['text'] = newsgroups_df['text'].map(clear_asterisks)

In [13]:
newsgroups_df.head()

Unnamed: 0,text,label
0,i have a mac plus with 2.5mb ram. i have just ...,4
1,i was wondering if anyone knows of a graphics ...,1
2,----- news saved at 23 apr 93 22:22:40 gmt wel...,14
3,not everyone should be trusted with tools. ;-),7
4,"well, most hangings are very quick and, i imag...",0


---

## Take a sample of the data

In [14]:
# Draw 500 documents from each label (fixed seed), then renumber the rows
# from zero because the index is used as the document ids further down.
newsgroups_df = (
    newsgroups_df
    .groupby('label')
    .sample(n=500, random_state=3)
    .reset_index(drop=True)
)

## Convert the data to the `TrainingCorpus` format

In [15]:
# Container for the corpus fields that are filled in by the cells below.
newsgroups_dict = dict()

### Define the `docs` field

In [16]:
# Document ids are the row positions of the sampled frame.
newsgroups_dict['docs'] = list(newsgroups_df.index)

### Define the `texts` field

In [17]:
# Cleaned document texts, in the same order as the document ids.
newsgroups_dict['texts'] = list(newsgroups_df['text'])

### Define the `tokens` field

In [18]:
# Folder under root_dir that is added to sys.path below so that `training` can be imported.
src_dir = 'src'

In [19]:
import os
import sys

In [20]:
# Make the project sources importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [21]:
from training import TrainingCorpus

In [22]:
# Tokenize each document with the project tokenizer, keeping document order.
newsgroups_dict['tokens'] = [
    TrainingCorpus.tokenize(document) for document in newsgroups_df['text']
]

### Define the `labels` field

In [23]:
# Distinct label ids present in the sample, in ascending order.
newsgroups_dict['labels'] = sorted(set(newsgroups_df['label'].tolist()))

### Define the `target` field

In [24]:
# Map each document id (row index) to a one-element list holding its label.
newsgroups_dict['target'] = {
    doc_id: [label] for doc_id, label in newsgroups_df['label'].items()
}

---

## Save to JSON file

In [25]:
import json

In [26]:
# Output file for the corpus dictionary, placed in the corpus folder under root_dir.
dataset_filename = 'newsgroups_corpus.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [27]:
# Write the corpus dictionary as JSON. The encoding is given explicitly so the
# file does not depend on the platform's default text encoding.
with open(dataset_filepath, 'w', encoding='utf-8') as fd:
    json.dump(newsgroups_dict, fd)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [28]:
# Load the JSON file written above into a fresh TrainingCorpus instance.
newsgroups_corpus = TrainingCorpus()
newsgroups_corpus.load(dataset_filepath)

In [29]:
newsgroups_corpus.get_text(0)

'[deleted stuff from andrew wrt which atheist myth is bill re: to] "counterfeit atheists". hmmmm. so, we\'re just cheap knock-offs of the true atheists. religion demonstrates itself to be absurd. constantly. personally, if someone asks, i\'m happy to point out how this is so. man, what is your pill wrt atheists? if you\'re going to make such contentious statements, back them up! at least, read news: time-and-time again, we\'ve hashed out the beliefs various religous doctrines hold. try debating reasonably with someone who makes a statement like, "...more accurately oxymoric is the a term like, reasonable atheist." then take a look at the responses we\'ve given tammy. seem pretty "reasonable", nay, even "polite" to me. [accusations of myths a-flyin\'] i saw your reference to "according to" in the original article. then you do such an excellent job of spewing dogma that, well, the implication was pretty clear (if wrong, in this case). [jeez, a misunderstanding. let it go.] [more statemen

In [30]:
newsgroups_corpus.get_tokens(0)[:5]

['deleted', 'stuff', 'andrew', 'wrt', 'atheist']

---

## Compute noun chunks

In [31]:
# File for the noun chunks, stored in the same corpus folder as the dataset JSON.
chunks_filename = 'newsgroups_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, chunks_filename)

In [32]:
chunks_filepath

'../../data/corpus/newsgroups_chunks.json'

In [33]:
# Slow step: the recorded run below took about six minutes for 10000 items.
newsgroups_corpus.detect_chunks()

100%|██████████| 10000/10000 [05:55<00:00, 28.16it/s]


In [34]:
# Presumably writes the chunks from detect_chunks to chunks_filepath — confirm against TrainingCorpus.
newsgroups_corpus.save_chunks(chunks_filepath)

---

## Load chunks

In [35]:
# Read the chunks back from chunks_filepath, the same path that was passed to save_chunks.
newsgroups_corpus.load_chunks(chunks_filepath)

In [36]:
list(newsgroups_corpus.noun_chunks.items())[:3]

[('stuff', 276), ('andrew_wrt', 1), ('atheist_myth', 1)]

In [37]:
newsgroups_corpus.get_chunk_document(21, threshold=0)

['sounds', 'though', 'confused', 'want', 'think', 'morally', 'right']

In [38]:
newsgroups_corpus.get_text(45)

"you should wear your nicest boxer shorts and bring plenty of spf 45+ sunscreen. i'll grab my bathing suit, towerl and some veggie hotdogs and we can have bonfire cookout!! does that sound good enough to you, dean? every a.a poster is invited!!!"

In [39]:
newsgroups_corpus.get_tokens(21)

['sounds', 'though', 'confused', 'want', 'think', 'morally', 'right']

In [40]:
newsgroups_corpus.get_text(21)

'sounds as though you are confused between "what i want" and "what i think is morally right".'

---