In [1]:
import pandas as pd;
import numpy as np;
import scipy as sp;
import sys;

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords;
import nltk;

from gensim.models import ldamodel
#from gensim.models.nmf import NMF
from gensim.utils import simple_preprocess
import gensim.corpora;

import sklearn;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;

import pickle;

In [2]:
data = pd.read_csv('/Users/kdwoo/Documents/Jupyter/2019_CAU_NLP/abcnews-date-text.csv', error_bad_lines=False);

In [3]:
stop_words = stopwords.words('english')

In [4]:
data['publish_date'] = pd.to_datetime(data['publish_date'].astype(str), format = '%Y%m%d')
data['publish_date'] = pd.DatetimeIndex(data['publish_date']).year

In [5]:
data_text = data[['headline_text']];

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data['headline_text']))

In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [9]:
data_words_nostops = pd.Series(data_words_nostops)
data_lemmatized = data_words_nostops.apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

In [10]:
data_words_nostops

0           [aba, decides, community, broadcasting, licence]
1            [act, fire, witnesses, must, aware, defamation]
2                [calls, infrastructure, protection, summit]
3                  [air, nz, staff, aust, strike, pay, rise]
4          [air, nz, strike, affect, australian, travellers]
5                    [ambitious, olsson, wins, triple, jump]
6                [antic, delighted, record, breaking, barca]
7          [aussie, qualifier, stosur, wastes, four, memp...
8             [aust, addresses, un, security, council, iraq]
9                   [australia, locked, war, timetable, opp]
10               [australia, contribute, million, aid, iraq]
11         [barca, take, record, robson, celebrates, birt...
12                           [bathhouse, plans, move, ahead]
13           [big, hopes, launceston, cycling, championship]
14                [big, plan, boost, paroo, water, supplies]
15                 [blizzard, buries, united, states, bills]
16         [brigadier, d

In [11]:
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])  # 불용어처리

#  빈도수 높은 키워드 처리
stop_words.extend([' court', 'home',' council', 'hunter', 'help', 'time', 'injure', 'national', 'build', 'end', 'bid', 'cup', 'un', 'come', 'security', 'volunteer', 'ship', 'crew', 'crowd', 'join', 'helicopter', 'across', 'museum', 'Italy', 'grind', 'asian', 'sa', 'miss', 'one', 'die', 'use', 'three', 'Darwin', 'vic', 'number', 'may', 'start', 'law', 'way', 'communities', 'order', 'check', 'major', 'india', 'focus', 'form', 'journalist', 'milk', 'nz', 'rank', 'cook', 'egypt', 'New', 'year', 'force', 'fail', 'dead', 'was', 'farmer', 'fruit', 'philippines', 'injury', 'nick'])
stop_words.extend(['fire', 'new', 'hobart', 'rural', 'world', 'boat', 'turn', 'flight', 'around', 'well', 'Find', 'two', 'adelaide', 'murder', 'first', 'make', 'body', 'probe', 'outback', 'tourism', 'baby', 'David', 'street', 'mass', 'hotel', 'Police', 'say', 'open', 'dog', 'go', 'welcome', 'president', 'announce', 'level', 'allow', 'highest','queensland', 'kill', 'crash', 'road', 'record', 'nt', 'hit', 'plane', 'toll', 'suspend', 'peninsula', 'afghan', 'recovery','man', 'perth', 'flood', 'people', 'prison', 'still', 'supply', 'siege', 'spark', 'summer', 'Michael', 'ops', 'large', 'flash', 'view', 'attack', 'back', 'mine', 'deal', 'fan', 'celebrate', 'target', 'hill', 'party', 'reveal', 'terrorism', 'video', 'pressure', 'remember', 'korea', 'indian', 'millions', 'drill', 'country', 'hour', 'podcast', 'leaders', 'thursday', 'abbott', 'tony', 'policy', 'agricultural', 'shorten', 'sach', 'day', 'years', 'show', 'teen', 'heat', 'sport', 'issue', 'free', 'australias', 'asbestos', 'compete','South', 'china', 'talk', 'appeal', 'labor', 'plant', 'peter', 'allegedly', 'begin', 'try', 'ice', 'native', 'alcohol', 'Australia', 'league', 'live', 'launch', 'campaign', 'benefit', 'update', 'stream', 'cabinet', 'document', 'bob','Test', 'drug', 'brisbane', 'international', 'british', 'double', 'treat', 'patient', 'ebola', 'Wa', 'bushfire', 'research', 'expansion', 'ready', 'old', 'release', 'paper', 'see'])

# 그 아래에서  빈도수 높은 키워드 처리
stop_words.extend(['call',  'queensland', 'melbourne', 'perth', 'thousands', 'alert', 'reveal', 'spark', 'amid', 'illegal', 'australian', 'price',  'brisbane', 'western', 'high', 'fan', 'prepare', 'british', 'battle', 'beach', 'wa', 'take',  'box', 'could',  'search', 'black', 'michael', 'week','man', 'day' ,'country', 'new', 'old', 'police', 'test',  'force', 'release', 'hobart', 'council', 'die', 'miss','say', 'south', 'was','fire', 'victoria', 'build','australia', 'court','find', 'fall','mine','attack', 'darwin', 'break', 'record', 'david', 'reflect', 'remember','adelaide', 'show'])

#stop_words

In [12]:
data_lemmatized = remove_stopwords(data_lemmatized)

In [13]:
temp = data
temp['lemmatize'] = data_lemmatized
for i in range(15):
    globals()['trend{}'.format(i+2003)] = temp.loc[temp.publish_date == i+2003]

In [14]:
trend_list = [trend2003, trend2004, trend2005, trend2006, trend2007, trend2008, trend2009, trend2010,
              trend2011, trend2012, trend2013, trend2014, trend2015, trend2016, trend2017]
#print(trend_list)

In [15]:
#pickle.dump(data_text, open('2003_data_text.dat', 'wb'))

In [16]:
#train_headlines = [value[0] for value in data_text.iloc[0:].values];
#print(train_headlines[0]);

In [30]:
num_topics = 10;
lemmatized = list(trend2017['lemmatize'])

In [31]:
#train_headlines_sentences = [' '.join(text) for text in train_headlines]
#print(train_headlines_sentences[0]);

In [32]:
vectorizer = CountVectorizer(analyzer='word', max_features=5000);
x_counts = vectorizer.fit_transform(' '.join(text) for text in list(trend2003['lemmatize']));

In [33]:
transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

In [34]:
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [35]:
#obtain a NMF model.
model = NMF(n_components=num_topics, init='nndsvd');

In [40]:
#fit the model
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=10, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [52]:
model.components_[0].argsort()[:-20 - 1:-1]

array([4999, 4998, 4997, 4996, 4995, 4994, 4993, 4992, 4991, 4990, 4989,
       4988, 4987, 4986, 4985, 4984, 4983, 4982, 4981, 4980], dtype=int64)

In [49]:
model.components_[0][:-20 - 1:-1]

array([3.37014285, 0.22685139, 0.19492771, 0.12986697, 0.09437241,
       0.0908397 , 0.07772541, 0.06861983, 0.06785875, 0.06509755,
       0.06496493, 0.06108526, 0.06028505, 0.05443715, 0.05275734,
       0.05229953, 0.05213447, 0.04990521, 0.04848382, 0.04642985])

In [37]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    vocabulary = vectorizer.vocabulary_
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict);

In [38]:
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,charge,us,plan,govt,face,win,continue,iraq,car,fund
1,stab,soldier,development,nsw,trial,award,death,war,woman,boost
2,assault,iraqi,reject,urge,death,claim,rise,soldier,accident,seek
3,attempt,troop,consider,qld,pair,top,sydney,troop,hospital,water
4,sex,baghdad,house,claim,tough,lead,investigation,bush,fatal,get
5,lay,shoot,group,consider,accuse,title,fight,report,shoot,warn
6,sydney,military,water,reject,ban,tour,protest,howard,investigate,health
7,fraud,trade,management,feed,water,england,strike,pm,sydney,service
8,shoot,warn,protest,accuse,future,set,house,bomb,house,concern
9,bail,bomb,power,local,restrictions,stage,clean,downer,stab,report
