In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/josephtanner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/josephtanner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
data = fetch_20newsgroups(remove = ('headers', 'footers', 'qutes'))

In [3]:
news_df = pd.DataFrame({'Article': data.data})

In [4]:
news_df.head()

Unnamed: 0,Article
0,I was wondering if anyone out there could enli...
1,A fair number of brave souls who upgraded thei...
2,"well folks, my mac plus finally gave up the gh..."
3,Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...
4,"From article <C5owCB.n3p@world.std.com>, by to..."


In [5]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [6]:
def cleaner(text):
    text = re.sub('@\S+', '', str(text).lower()) # remove user mentions
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub(r'http\S+', '', text) # remove http links
    tokens = word_tokenize(text) # tokenize the text
    tokens = [token for token in tokens if not token in stopwords and len(token) > 2]
    words = ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens])
    
    return words

In [7]:
news_df['cleaned_text'] = news_df['Article'].apply(lambda x: cleaner(x))

In [8]:
news_df.head()

Unnamed: 0,Article,cleaned_text
0,I was wondering if anyone out there could enli...,wonder anyone could enlighten car saw day 2doo...
1,A fair number of brave souls who upgraded thei...,fair number brave soul upgraded clock oscillat...
2,"well folks, my mac plus finally gave up the gh...",well folk mac plus finally give ghost weekend ...
3,Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...,robert kyanko rob write abraxis writes article...
4,"From article <C5owCB.n3p@world.std.com>, by to...",article c5owcbn3p tombaker tom baker article c...


In [9]:
news_df.reset_index(inplace=True)

In [10]:
news_df.rename(columns={'index': 'num'}, inplace=True)

In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_df.cleaned_text)
words = np.array(vectorizer.get_feature_names())

In [12]:
nmf = NMF(n_components=20, random_state=1, solver='mu')
W = nmf.fit_transform(X)
H = nmf.components_



In [13]:
for i, topic in enumerate(H):
    print('Topic {}: {}'.format(i, ', '.join([str(x) for x in words[topic.argsort()[-5:]]])))

Topic 0: get, say, dont, people, one
Topic 1: do, use, run, program, window
Topic 2: play, win, player, team, game
Topic 3: escrow, clipper, encryption, chip, key
Topic 4: floppy, hard, scsi, disk, drive
Topic 5: believe, bible, christian, jesus, god
Topic 6: mail, anyone, email, please, thanks
Topic 7: palestinian, jew, arab, israeli, israel
Topic 8: color, monitor, video, driver, card
Topic 9: mile, price, dealer, engine, car
Topic 10: turk, turkey, armenia, turkish, armenian
Topic 11: include, new, price, offer, sale
Topic 12: firearm, crime, weapon, law, gun
Topic 13: ftp, directory, image, format, file
Topic 14: article, orbit, launch, nasa, space
Topic 15: help, like, anyone, appreciate, would
Topic 16: value, system, moral, morality, objective
Topic 17: eat, restaurant, chinese, food, msg
Topic 18: dog, rider, ride, motorcycle, bike
Topic 19: port, simms, speed, use, mac


In [14]:
def topic_table(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        t = (topic_idx)
        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
    
    return pd.DataFrame(topics)


def top_words(topic, n_top_words):
    return topic.argsort()[:-n_top_words -1:-1]


def whitespace_tokenizer(text):
    pattern = r'(?u)\b\w\w+\b'
    tokenizer_regex = RegexpTokenizer(pattern)
    tokens = tokenizer_regex.tokenize(text)
    
    return tokens

def unique_words(text):
    ulist = []
    [ulist.append(x) for x in text if x not in ulist]
    
    return ulist

In [15]:
docweights = nmf.transform(vectorizer.transform(news_df.cleaned_text))

In [16]:
n_top_words = 8

topic_df = topic_table(nmf, words, n_top_words).T

topic_df['topics'] = topic_df.apply(lambda x: [' '.join(x)], axis = 1) # join each word into a list
topic_df['topics'] = topic_df['topics'].str[0] # remove list brackets
topic_df['topics'] = topic_df['topics'].apply(lambda x: whitespace_tokenizer(x)) # tokenize
topic_df['topics'] = topic_df['topics'].apply(lambda x: unique_words(x)) # remove duplicate words
topic_df['topics'] = topic_df['topics'].apply(lambda x: [' '.join(x)]) # rejoin into list
topic_df['topics'] = topic_df['topics'].str[0] # remove list brackets

In [17]:
topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,topics
0,one,people,dont,say,get,think,writes,article,one people dont say get think writes article
1,window,program,run,use,do,application,problem,manager,window program run use do application problem ...
2,game,team,player,win,play,season,year,score,game team player win play season year score
3,key,chip,encryption,clipper,escrow,government,phone,system,key chip encryption clipper escrow government ...
4,drive,disk,scsi,hard,floppy,ide,controller,boot,drive disk scsi hard floppy ide controller boot
5,god,jesus,christian,bible,believe,faith,christ,belief,god jesus christian bible believe faith christ...
6,thanks,please,email,anyone,mail,post,address,advance,thanks please email anyone mail post address a...
7,israel,israeli,arab,jew,palestinian,lebanese,lebanon,state,israel israeli arab jew palestinian lebanese l...
8,card,driver,video,monitor,color,vga,mode,diamond,card driver video monitor color vga mode diamond
9,car,engine,dealer,price,mile,model,driver,oil,car engine dealer price mile model driver oil


In [18]:
topic_df = topic_df['topics'].reset_index()

In [19]:
topic_df.columns = ['topic_num', 'topics']

In [20]:
topic_df

Unnamed: 0,topic_num,topics
0,0,one people dont say get think writes article
1,1,window program run use do application problem ...
2,2,game team player win play season year score
3,3,key chip encryption clipper escrow government ...
4,4,drive disk scsi hard floppy ide controller boot
5,5,god jesus christian bible believe faith christ...
6,6,thanks please email anyone mail post address a...
7,7,israel israeli arab jew palestinian lebanese l...
8,8,card driver video monitor color vga mode diamond
9,9,car engine dealer price mile model driver oil


In [21]:
num = news_df['num'].tolist()

In [22]:
df_temp = pd.DataFrame({'num': num,
                      'topic_num': docweights.argmax(axis=1)})

In [23]:
merged_topic = df_temp.merge(topic_df, on='topic_num', how='left')

In [24]:
df_topics = pd.merge(news_df, merged_topic, on='num', how='left')

In [25]:
df_topics

Unnamed: 0,num,Article,cleaned_text,topic_num,topics
0,0,I was wondering if anyone out there could enli...,wonder anyone could enlighten car saw day 2doo...,9,car engine dealer price mile model driver oil
1,1,A fair number of brave souls who upgraded thei...,fair number brave soul upgraded clock oscillat...,6,thanks please email anyone mail post address a...
2,2,"well folks, my mac plus finally gave up the gh...",well folk mac plus finally give ghost weekend ...,0,one people dont say get think writes article
3,3,Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...,robert kyanko rob write abraxis writes article...,3,key chip encryption clipper escrow government ...
4,4,"From article <C5owCB.n3p@world.std.com>, by to...",article c5owcbn3p tombaker tom baker article c...,0,one people dont say get think writes article
...,...,...,...,...,...
11309,11309,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,nyeda david nye neurology consultation cheaper...,0,one people dont say get think writes article
11310,11310,"I have a (very old) Mac 512k and a Mac Plus, b...",old mac 512k mac plus problem screen blank som...,4,drive disk scsi hard floppy ide controller boot
11311,11311,I just installed a DX2-66 CPU in a clone mothe...,instal dx266 cpu clone motherboard try mount c...,19,mac use speed simms port scsi modem memory
11312,11312,In article <1qkgbuINNs9n@shelley.u.washington....,article 1qkgbuinns9n bolson edward bolson writ...,0,one people dont say get think writes article


In [26]:
df_topics.topic_num.value_counts()

0     1976
2     1008
6      838
11     788
5      629
1      560
19     544
14     533
15     524
12     483
3      461
9      448
8      402
18     377
13     374
4      362
7      316
17     302
16     230
10     159
Name: topic_num, dtype: int64