In [91]:
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines =False)

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [92]:
data['publish_date'] = pd.to_datetime(data['publish_date'].astype(str), format = '%Y%m%d')

In [93]:
data['publish_date'] = pd.DatetimeIndex(data['publish_date']).year

In [94]:
print(data.head(5))

   publish_date                                      headline_text
0          2003  aba decides against community broadcasting lic...
1          2003     act fire witnesses must be aware of defamation
2          2003     a g calls for infrastructure protection summit
3          2003           air nz staff in aust strike for pay rise
4          2003      air nz strike to affect australian travellers


In [95]:
data['headline_text'] = data.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

In [97]:
stop = stopwords.words('english')
stop.extends(['interview', 'weather', 'abc', 'australia', 'australian'])
data['headline_text'] = data['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])

In [99]:
print(data.head(5))

   publish_date                                      headline_text
0          2003   [aba, decides, community, broadcasting, licence]
1          2003    [act, fire, witnesses, must, aware, defamation]
2          2003     [g, calls, infrastructure, protection, summit]
3          2003          [air, nz, staff, aust, strike, pay, rise]
4          2003  [air, nz, strike, affect, australian, travellers]


In [102]:
data['headline_text'] = data['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

In [103]:
print(data.head(5))

   publish_date                                      headline_text
0          2003       [aba, decide, community, broadcast, licence]
1          2003      [act, fire, witness, must, aware, defamation]
2          2003      [g, call, infrastructure, protection, summit]
3          2003          [air, nz, staff, aust, strike, pay, rise]
4          2003  [air, nz, strike, affect, australian, travellers]


In [104]:
tokenized_doc = data['headline_text'].apply(lambda x: [word for word in x if len(word) > 2])

In [105]:
print(tokenized_doc[:5])

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [111]:
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
data['headline_text'] = detokenized_doc

In [115]:
data['headline_text'][:5]

0     aba decide community broadcast licence
1     act fire witness must aware defamation
2      call infrastructure protection summit
3             air staff aust strike pay rise
4    air strike affect australian travellers
Name: headline_text, dtype: object

In [117]:
temp = data

In [127]:
for i in range(15):
    globals()['trend{}'.format(i+2003)] = temp.loc[temp.publish_date == i+2003]

In [142]:
trend_list = [trend2003, trend2004, trend2005, trend2006, trend2007, trend2008, trend2009, trend2010, trend2011, trend2012, trend2013, trend2014, trend2015, trend2016, trend2017]

In [148]:
def get_topics(year, components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(str(year)+" Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])

In [149]:
for i, v in enumerate(trend_list):
    vectorizer = TfidfVectorizer(stop_words='english', 
    max_features= len(v)) # 상위 1,000개의 단어를 보존 
    X = vectorizer.fit_transform(v['headline_text'])
     # TF-IDF 행렬의 크기 확인
#     print("trend"+str(i+2003)+": ",X.shape)
    lda_model=LatentDirichletAllocation(n_components=5,learning_method='online')
    lda_top=lda_model.fit_transform(X)
    terms = vectorizer.get_feature_names()
    get_topics(i+2003, lda_model.components_,terms)

2003 Topic 1: [('charge', 361.87), ('warn', 277.09), ('cup', 249.3), ('world', 241.23), ('set', 241.0)]
2003 Topic 2: [('police', 249.95), ('home', 203.29), ('open', 199.97), ('drug', 188.48), ('probe', 184.98)]
2003 Topic 3: [('police', 445.67), ('seek', 262.58), ('qld', 247.23), ('say', 243.67), ('group', 240.08)]
2003 Topic 4: [('face', 352.53), ('court', 254.1), ('fund', 233.28), ('health', 231.18), ('concern', 223.18)]
2003 Topic 5: [('kill', 415.53), ('claim', 315.65), ('report', 295.58), ('attack', 262.86), ('miss', 237.02)]
2004 Topic 1: [('plan', 652.15), ('govt', 394.64), ('council', 382.67), ('urge', 344.86), ('water', 300.03)]
2004 Topic 2: [('fear', 222.92), ('strike', 217.86), ('child', 200.69), ('return', 187.58), ('push', 183.6)]
2004 Topic 3: [('police', 380.73), ('miss', 268.99), ('continue', 268.74), ('drug', 248.65), ('lead', 238.85)]
2004 Topic 4: [('man', 541.35), ('boost', 339.13), ('charge', 317.37), ('court', 258.09), ('minister', 248.5)]
2004 Topic 5: [('labor

In [141]:
lda_model=LatentDirichletAllocation(n_components=5,learning_method='online')

In [138]:
lda_top=lda_model.fit_transform(X)

In [139]:
terms = vectorizer.get_feature_names()

In [140]:
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

Topic 1: [('police', 791.27), ('new', 631.75), ('plan', 568.03), ('man', 534.98), ('govt', 490.47)]
