In [9]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

# A bare expression is only echoed when it is the last statement of a cell,
# so the version check goes after the imports.
sklearn.__version__

In [203]:
# Load both predefined splits of the 20 Newsgroups corpus. 'train' is the
# default subset; it is spelled out here to mirror the test call.
dataset = fetch_20newsgroups(subset='train')
test_data = fetch_20newsgroups(subset='test')

In [183]:
# Display the list of newsgroup category names in the training split.
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [204]:
# Unpack the documents and their integer labels for both splits.
raw_text, target = dataset.data, dataset.target
test_text, test_target = test_data.data, test_data.target

# Show one training document followed by the name of its newsgroup.
print(raw_text[500], dataset.target_names[target[500]])

From: bjorndahl@augustana.ab.ca
Subject: Re: document of .RTF
Organization: Augustana University College, Camrose, Alberta
Lines: 10

In article <1993Mar30.113436.7339@worak.kaist.ac.kr>, tjyu@eve.kaist.ac.kr (Yu TaiJung) writes:
> Does anybody have document of .RTF file or know where I can get it?
> 
> Thanks in advance. :)

I got one from Microsoft tech support.

-- 
Sterling G. Bjorndahl, bjorndahl@Augustana.AB.CA or bjorndahl@camrose.uucp
Augustana University College, Camrose, Alberta, Canada      (403) 679-1100
 comp.os.ms-windows.misc


## Preprocess text

- split metadata and text.
- For metadata

    - Pick only Subject from metadata.

- For text
    1. split into sentences
    2. lower case each sentence
    3. tokenize into words
    
- Tokenizer:
    1. replace each email address with the token EMAIL
    2. replace each run of digits with the token NUM
    3. keep \$
    4. delete all other punctuation

- Maintain a word list for words that appear more than once.

In [205]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

# Shared NLP resources used by the preprocessing helpers below.
# English stop words as a set, for fast membership tests while filtering tokens.
stop_words = set(stopwords.words("english"))
# WordNet lemmatizer applied to every token that survives stop-word filtering.
lemmatizer = WordNetLemmatizer()
# NOTE(review): the stemmer is not used by any helper visible in this section.
stemmer = PorterStemmer()
def split_metadata(data):
    """Split a raw newsgroup message into its header block and its body.

    The header block ends at the first blank line of the message.

    Args:
        data: full text of one message.

    Returns:
        A ``(metadata, text)`` tuple. ``metadata`` is the header block with a
        newline appended so that every header line is newline-terminated;
        ``text`` is everything after the blank line. A message without a blank
        line is treated as header-only and yields an empty ``text``.
    """
    # partition() yields an empty body when the separator is missing. The
    # previous find()-based slicing used the -1 "not found" result as an index
    # and silently dropped characters from both ends of the message.
    metadata, _, text = data.partition("\n\n")
    return metadata + "\n", text
def get_subject(metadata):
    """Return the value of the Subject header found in ``metadata``.

    Args:
        metadata: header block of a message, each line newline-terminated.

    Returns:
        The text between ``Subject: `` and the end of that line, or an empty
        string when the header block has no Subject line.
    """
    # Raw string keeps the pattern free of string-escape processing.
    match = re.search(r'Subject: (.*)\n', metadata)
    if match is None:
        # No Subject header: return an empty subject instead of raising
        # AttributeError on None.
        return ""
    return match.group(1)
def del_email(text):
    """Replace every email-like token in ``text`` with the placeholder " EMAIL ".

    Any maximal run of non-whitespace characters that contains an '@' counts
    as an email address.

    Args:
        text: input string.

    Returns:
        ``text`` with each such run replaced by " EMAIL " (padded with spaces
        so the placeholder tokenizes as its own word).
    """
    # Raw string: '\S' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\S*@\S*', " EMAIL ", text)
def replace_num(text):
    """Substitute each maximal run of ASCII digits in ``text`` with " NUM "."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub(" NUM ", text)
def remove(text):
    """Blank out punctuation and other special characters in ``text``.

    Every character that is not a word character, whitespace, or '$' is
    replaced by a single space.

    Args:
        text: input string.

    Returns:
        The cleaned string, the same length as the input.
    """
    # Raw string: '\w' and '\s' in a plain literal are invalid escape
    # sequences and trigger a warning on recent Python versions.
    return re.sub(r'[^\w\s$]', " ", text)
def process_text(text):
    """Turn one piece of text into a list of cleaned, lemmatized tokens.

    The text is lowercased, emails and digit runs are swapped for their
    placeholder tokens, special characters are blanked, and the result is
    word-tokenized. Stop words are dropped; every remaining token is
    lemmatized.
    """
    cleaned = remove(replace_num(del_email(text.lower())))
    tokens = []
    for word in word_tokenize(cleaned):
        # The stop-word test is applied to the token before lemmatization.
        if word in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(word))
    return tokens

def process_data(data):
    """Tokenize one raw message into subject tokens and body tokens.

    Args:
        data: full text of one message (header block, blank line, body).

    Returns:
        A ``(tokenized_subject, tokenized_text)`` tuple of flat token lists,
        each produced by running ``process_text`` over every sentence.
    """
    metadata, text = split_metadata(data)
    subject = get_subject(metadata)

    # The helper defined in this notebook is process_text; the previous call
    # to an undefined `process` raised NameError on a fresh kernel.
    tokenized_text = [
        token for sent in sent_tokenize(text) for token in process_text(sent)
    ]
    tokenized_subject = [
        token for sent in sent_tokenize(subject) for token in process_text(sent)
    ]
    return tokenized_subject, tokenized_text

In [207]:
# Tokenize every training document. The two lists are parallel to raw_text:
# entry i holds the subject tokens / body tokens of raw_text[i].
tokenized_subject = []
tokenized_text = []
for data in raw_text:
    subject, text = process_data(data)
    tokenized_subject.append(subject)
    tokenized_text.append(text)

# Spot-check one tokenized subject and one tokenized body.
print(tokenized_subject[0])
print(tokenized_text[1])

['car']
['fair', 'number', 'brave', 'soul', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experience', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'NUM', 'NUM', 'NUM', 'floppy', 'especially', 'requested', 'summarizing', 'next', 'two', 'day', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'answered', 'poll', 'thanks', 'guy', 'kuo', 'EMAIL']


In [208]:
# Tokenize every test document with the same pipeline as the training split.
# The two lists are parallel to test_text.
tokenized_test_subject = []
tokenized_test_text = []
for data in test_text:
    subject, text = process_data(data)
    tokenized_test_subject.append(subject)
    tokenized_test_text.append(text)

# Spot-check one tokenized subject and one tokenized body.
print(tokenized_test_subject[0])
print(tokenized_test_text[1])

['need', 'info', 'NUM', 'NUM', 'bonneville']
['familiar', 'format', 'x', 'face', 'thingies', 'seeing', 'folk', 'header', 'got', 'see', 'maybe', 'make', 'one', 'got', 'dpg', 'view', 'linux', 'box', 'display', 'uncompressed', 'x', 'face', 'managed', 'compile', 'un', 'compface', 'looking', 'seem', 'find', 'x', 'face', 'anyones', 'news', 'header', 'could', 'would', 'please', 'send', 'x', 'face', 'header', 'know', 'probably', 'get', 'little', 'swamped', 'handle', 'hope', 'rick', 'miller', 'EMAIL', 'EMAIL', 'ricxjo', 'muelisto', 'send', 'postcard', 'get', 'one', 'back', 'enposxtigu', 'bildkarton', 'kaj', 'vi', 'ricevos', 'alion', 'rick', 'miller', 'NUM', 'wood', 'muskego', 'wi', 'NUM', 'usa']


In [209]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
# Build the vocabulary from the training bodies, then drop tokens that occur
# in fewer than 3 documents (the other filter_extremes thresholds keep their
# defaults).
dictionary = Dictionary(tokenized_text)
dictionary.filter_extremes(no_below=3)

# Reverse lookup: integer token id -> token string.
id2token = {token_id: token for token, token_id in dictionary.token2id.items()}

# Bag-of-words encoding of every training document.
corpus = [dictionary.doc2bow(doc) for doc in tokenized_text]

In [211]:
# LDA training is stochastic; a fixed seed keeps the learned topics (and every
# output derived from them) reproducible across kernel restarts.
lda = LdaModel(corpus, num_topics=40, random_state=0)

In [212]:
# Print the 10 highest-weighted terms of every topic. Iterating over
# lda.num_topics (rather than a repeated literal) keeps this loop in sync with
# however many topics the model was trained with.
for topicid in range(lda.num_topics):
    print("\nTOPIC", topicid)
    for token_id, weight in lda.get_topic_terms(topicid, 10):
        print(id2token[token_id], weight)


TOPIC 0
key 0.027637761
phone 0.00797428
pgp 0.0062342826
get 0.006120797
division 0.0056556957
chip 0.0054952595
know 0.0052064387
bit 0.0050913943
rsa 0.005068707
session 0.004716287

TOPIC 1
israel 0.01053801
law 0.009119934
one 0.008596661
people 0.008404893
israeli 0.007079925
jew 0.0064443494
food 0.0052900016
msg 0.0051528616
state 0.005066714
article 0.004532835

TOPIC 2
would 0.008945552
article 0.007668558
get 0.0071309935
one 0.006503978
like 0.0060583386
time 0.005537521
use 0.0044212183
think 0.004137352
make 0.0036717732
people 0.003663801

TOPIC 3
people 0.012566865
militia 0.008943163
amendment 0.008261151
right 0.008076568
bear 0.007920167
state 0.00783085
article 0.007772535
arm 0.0073593035
would 0.0071598855
second 0.006968814

TOPIC 4
ax 0.599545
max 0.043476943
f 0.03361659
q 0.03059271
$ 0.02973423
g 0.029133486
v 0.025313847
r 0.013342343
p 0.013195112
b 0.01280684

TOPIC 5
leaf 0.007704196
player 0.0068196985
$ 0.006501871
article 0.0060061654
would 0.00570362

In [213]:
# Encode the test documents as bags of words using the training vocabulary.
test_corpus = [dictionary.doc2bow(doc) for doc in tokenized_test_text]

In [217]:
# Topic mixture inferred for the first test document, shown as
# (topic_id, probability) pairs.
lda.get_document_topics(test_corpus[0])

[(6, 0.4316437), (28, 0.46394077), (30, 0.08185455)]

In [219]:
# NOTE(review): this assignment had no right-hand side, which is a SyntaxError
# and stops Restart & Run All. Presumably it was meant to hold the per-document
# topic mixtures of the training corpus — confirm the intended representation.
latent_training_feature = [lda.get_document_topics(bow) for bow in corpus]

From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd.cc.buffalo.edu


 I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy.

			Neil Gandler

