In [9]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

# Keep the version check as the cell's last expression: a bare expression in
# the middle of a cell is evaluated but never displayed.
sklearn.__version__

In [182]:
# Training split (subset='train' is the loader's default, spelled out here so
# it reads symmetrically with the test split below).
dataset = fetch_20newsgroups(subset='train')
# Held-out split of the same corpus.
test_data = fetch_20newsgroups(subset='test')

In [183]:
# Display the newsgroup names; dataset.target holds indices into this list.
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [184]:
# Raw posts (headers + body) and their label indices, kept under short names
# because the rest of the notebook refers to them.
raw_text, target = dataset.data, dataset.target

# Peek at one post together with the name of the newsgroup it belongs to.
sample_post = raw_text[500]
sample_label = dataset.target_names[target[500]]
print(sample_post, sample_label)

From: bjorndahl@augustana.ab.ca
Subject: Re: document of .RTF
Organization: Augustana University College, Camrose, Alberta
Lines: 10

In article <1993Mar30.113436.7339@worak.kaist.ac.kr>, tjyu@eve.kaist.ac.kr (Yu TaiJung) writes:
> Does anybody have document of .RTF file or know where I can get it?
> 
> Thanks in advance. :)

I got one from Microsoft tech support.

-- 
Sterling G. Bjorndahl, bjorndahl@Augustana.AB.CA or bjorndahl@camrose.uucp
Augustana University College, Camrose, Alberta, Canada      (403) 679-1100
 comp.os.ms-windows.misc


## Preprocess text

- split metadata and text.
- For metadata

    - Pick only Subject from metadata.

- For text
    1. split into sentences
    2. lower case each sentence
    3. tokenize into words
    
- Tokenizer: 
    1. replace each email address with the token EMAIL
    2. replace each run of digits with the token NUM
    3. keep \$
    4. delete all other punctuation

- Maintain a word list for words that appear more than once.

In [90]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

# English stop words as a set, so the membership test in process_text is O(1).
stop_words = set(stopwords.words("english"))
# Lemmatizer applied to every token kept by process_text.
lemmatizer = WordNetLemmatizer()
# NOTE(review): stemmer is not referenced by any cell visible here — confirm
# whether it is still needed.
stemmer = PorterStemmer()
def split_metadata(data):
    """Split a raw post into its header block and its body.

    The header block ends at the first blank line (two consecutive newlines).

    Args:
        data: Full raw text of one post.

    Returns:
        A ``(metadata, text)`` tuple. ``metadata`` is the header block with a
        single trailing newline appended, so that every header line, including
        the last one, is newline-terminated. ``text`` is everything after the
        blank line. If the post contains no blank line, the whole input is
        treated as metadata and ``text`` is the empty string.
    """
    # str.partition handles the "separator missing" case explicitly. The
    # previous str.find version got -1 there, which silently dropped the last
    # header character and returned almost the whole input as the body.
    metadata, _separator, text = data.partition("\n\n")
    return metadata + "\n", text
def get_subject(metadata):
    """Extract the value of the ``Subject:`` header from a header block.

    Args:
        metadata: Header block of a post, one header per line.

    Returns:
        The text following ``Subject: `` on the first line that starts with
        that prefix, or the empty string when no such line exists.
    """
    # Anchor at the start of a line (MULTILINE) so headers that merely end in
    # "Subject:" (for example "X-Subject:") are not picked up by mistake.
    match = re.search(r'^Subject: (.*)$', metadata, flags=re.MULTILINE)
    # A post without a Subject header yields "" instead of raising
    # AttributeError on a None match.
    return match.group(1) if match else ""
def del_email(text):
    """Replace every email-like token with the placeholder ``EMAIL``.

    Any whitespace-delimited run of characters that contains ``@`` counts as
    an email address. The placeholder is padded with spaces so that it always
    becomes a token of its own.

    Args:
        text: Text to clean.

    Returns:
        ``text`` with each matching run replaced by ``" EMAIL "``.
    """
    # Raw string: "\S" in a plain string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return re.sub(r'\S*@\S*', " EMAIL ", text)
def replace_num(text):
    """Collapse every run of ASCII digits in ``text`` into the token ``NUM``.

    The token is surrounded by spaces so that it is separated from any
    adjacent characters.
    """
    digit_run = re.compile('[0-9]+')
    return digit_run.sub(" NUM ", text)
def remove(text):
    """Blank out punctuation and other special characters.

    Every character that is not a word character, not whitespace and not
    ``$`` is replaced by a single space.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text, the same length as the input.
    """
    # Raw string: "\w" and "\s" in a plain string literal are invalid escape
    # sequences and trigger a warning on current Python versions.
    return re.sub(r'[^\w\s$]', " ", text)
def process_text(text):
    """Turn a piece of text into a list of cleaned, lemmatized tokens.

    The text is lower-cased, then passed through del_email, replace_num and
    remove (in that order), tokenized with word_tokenize, filtered against
    stop_words, and finally each surviving token is lemmatized.
    """
    cleaned = text.lower()
    for normalize in (del_email, replace_num, remove):
        cleaned = normalize(cleaned)

    tokens = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

def process_data(data):
    """Tokenize one raw post into subject tokens and body tokens.

    The post is split into header block and body with split_metadata, the
    subject line is taken from the headers with get_subject, and both the
    subject and the body are split into sentences and tokenized sentence by
    sentence with process_text.

    Args:
        data: Full raw text of one post.

    Returns:
        A ``(tokenized_subject, tokenized_text)`` tuple of flat token lists.
    """
    metadata, text = split_metadata(data)
    subject = get_subject(metadata)

    # process_text is the tokenizer defined alongside this function; the
    # previously called name `process` is not defined in the visible notebook
    # and fails with NameError on a fresh kernel.
    tokenized_text = []
    for sent in sent_tokenize(text):
        tokenized_text += process_text(sent)

    tokenized_subject = []
    for sent in sent_tokenize(subject):
        tokenized_subject += process_text(sent)
    return tokenized_subject, tokenized_text

In [92]:
# Smoke-test the pipeline on one post: prints the (subject tokens, body tokens) tuple.
print(process_data(raw_text[1]))

(['si', 'clock', 'poll', 'final', 'call'], ['fair', 'number', 'brave', 'soul', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experience', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'NUM', 'NUM', 'NUM', 'floppy', 'especially', 'requested', 'summarizing', 'next', 'two', 'day', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'answered', 'poll', 'thanks', 'guy', 'kuo', 'EMAIL'])


In [93]:
# Run every training post through process_data, collecting subject tokens and
# body tokens in two parallel lists (same index = same post).
tokenized_subject, tokenized_text = [], []
for document in raw_text:
    subject_tokens, body_tokens = process_data(document)
    tokenized_subject.append(subject_tokens)
    tokenized_text.append(body_tokens)

# Spot-check one subject and one body.
print(tokenized_subject[0])
print(tokenized_text[1])

['car']
['fair', 'number', 'brave', 'soul', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experience', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'NUM', 'NUM', 'NUM', 'floppy', 'especially', 'requested', 'summarizing', 'next', 'two', 'day', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'answered', 'poll', 'thanks', 'guy', 'kuo', 'EMAIL']


In [None]:
# Tokenize the held-out posts with the same pipeline as the training posts.
# (The cell previously held an unfinished assignment, which is a SyntaxError.)
tokenized_test_subject = []
tokenized_test_text = []
for test_post in test_data.data:
    test_subject_tokens, test_body_tokens = process_data(test_post)
    tokenized_test_subject.append(test_subject_tokens)
    tokenized_test_text.append(test_body_tokens)

In [76]:
class Language:
    """Vocabulary that maps words to integer ids and counts occurrences.

    The placeholder tokens 'NUM' and 'EMAIL' are pre-registered with ids 0
    and 1 and a count of zero; every other word receives the next free id the
    first time it is added.
    """

    def __init__(self):
        reserved = ('NUM', 'EMAIL')
        self.word2index = {token: idx for idx, token in enumerate(reserved)}
        self.index2word = {idx: token for idx, token in enumerate(reserved)}
        self.word2count = {token: 0 for token in reserved}
        # Next id to hand out; also the current vocabulary size.
        self.vocab_count = len(reserved)

    def addWord(self, word):
        """Count one occurrence of ``word``, registering it first if unseen."""
        if word not in self.word2index:
            new_id = self.vocab_count
            self.word2index[word] = new_id
            self.index2word[new_id] = word
            self.word2count[word] = 0
            self.vocab_count = new_id + 1
        self.word2count[word] += 1

In [77]:
# Build the vocabulary from every subject token and every body token.
language = Language()
for subject in tokenized_subject:
    for word in subject:
        language.addWord(word)
# Each entry of tokenized_text is already a flat list of tokens for one post,
# so two loop levels are enough; a third level would iterate over the
# characters of each token and register single letters as words.
for text in tokenized_text:
    for word in text:
        language.addWord(word)

In [160]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
# Build the gensim vocabulary from the tokenized post bodies.
dictionary = Dictionary(tokenized_text)
# Drop tokens that occur in fewer than 3 posts; the other filter_extremes
# thresholds are left at their defaults.
dictionary.filter_extremes(no_below=3)
# Reverse lookup (token id -> token) used when printing topic terms.
id2token = {token_id: token for token, token_id in dictionary.token2id.items()}
# Bag-of-words representation of every post: a list of (token id, count) pairs.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_text]

In [161]:
# Tokens of the first post, for comparison with its bag-of-words entry corpus[0].
print(tokenized_text[0])

['wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'NUM', 'door', 'sport', 'car', 'looked', 'late', 'NUM', 'early', 'NUM', 'called', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'e', 'mail', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']


In [177]:
# Train the topic model on the bag-of-words corpus.
# - id2word ties the model to the dictionary instead of letting gensim infer
#   the vocabulary size from the corpus.
# - random_state fixes the random initialisation so that topic numbering and
#   weights are reproducible across kernel restarts.
lda = LdaModel(corpus, num_topics=40, id2word=dictionary, random_state=0)

In [179]:
# Print the 10 highest-weighted terms of every topic. The topic count is read
# from the model so this cell cannot drift out of sync with the training cell.
for topicid in range(lda.num_topics):
    words = lda.get_topic_terms(topicid, 10)
    print("\nTOPIC", topicid)
    for token_id, weight in words:
        print(id2token[token_id], weight)


TOPIC 0
law 0.006431582
people 0.0063895327
president 0.006371348
would 0.0062407856
mr 0.005817471
homosexual 0.00553603
gay 0.0047088563
american 0.004490718
one 0.004413492
u 0.004280061

TOPIC 1
god 0.017337106
would 0.010128813
people 0.009740007
one 0.009711972
say 0.0076309047
jesus 0.0071131466
think 0.0069431816
christian 0.0066268863
time 0.005566497
article 0.0053566946

TOPIC 2
space 0.0144479405
would 0.008123416
one 0.006169648
article 0.005750418
year 0.005576694
moon 0.004814346
earth 0.0047900146
disease 0.0043016686
launch 0.00424096
time 0.004181001

TOPIC 3
motif 0.01399266
software 0.010708763
version 0.008588853
r 0.0073932535
type 0.0059639467
graphic 0.0057806023
window 0.005261542
anyone 0.0048559792
run 0.004808031
sun 0.0046218676

TOPIC 4
orbit 0.010910105
mission 0.010855371
db 0.01016462
space 0.00946086
nasa 0.008483583
spacecraft 0.008471987
hawk 0.007453851
data 0.006903461
probe 0.0068511157
b 0.0063499

TOPIC 5
scsi 0.04312879
ide 0.018520363
drive 0

In [181]:
# Bag-of-words entry of the first post: (token id, count) pairs.
print(corpus[0])

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 4), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)]


In [171]:
# Top 10 terms of topic 1 (tokens only, weights omitted).
for token_id, _weight in lda.get_topic_terms(1, 10):
    print(id2token[token_id])

$
drive
scsi
b
mb
ide
controller
jumper
bus
offer


In [172]:
# Top 10 terms of topic 5 together with their weights.
for token_id, weight in lda.get_topic_terms(5, 10):
    print(id2token[token_id], weight)

gun 0.021801889
law 0.010995762
government 0.009796374
weapon 0.009748626
state 0.009195521
right 0.008972493
would 0.008389489
firearm 0.0075030485
crime 0.007343184
stephanopoulos 0.006955789
