In [64]:
import numpy as np
import pandas as pd
# Notebook display settings: show full cell contents and wide/long frames.
# None is the supported "no truncation" value for max_colwidth; the old -1
# sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.width', 999)
import glob
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
import re

In [87]:
# Alphabetically sorted paths of the raw article files, one list per category.
businessFileList, entertainmentFileList, politicsFileList, techFileList = (
    sorted(glob.glob(pattern))
    for pattern in (
        'data/BBC News Summary/News Articles/business/*.txt',
        'data/BBC News Summary/News Articles/entertainment/*.txt',
        'data/BBC News Summary/News Articles/politics/*.txt',
        'data/BBC News Summary/News Articles/tech/*.txt',
    )
)

In [128]:
# Alphabetically sorted paths of the reference summary files, one list per category.
businessSummFileList, entertainmentSummFileList, politicsSummFileList, techSummFileList = (
    sorted(glob.glob(pattern))
    for pattern in (
        'data/BBC News Summary/Summaries/business/*.txt',
        'data/BBC News Summary/Summaries/entertainment/*.txt',
        'data/BBC News Summary/Summaries/politics/*.txt',
        'data/BBC News Summary/Summaries/tech/*.txt',
    )
)

In [76]:
def readData(fileList, encoding=None, errors=None):
    """Read every file in ``fileList`` and return their contents in order.

    Parameters
    ----------
    fileList : iterable of str
        Paths of the text files to read.
    encoding : str, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform default encoding, which is what this function always used.
        Pass e.g. ``'latin-1'`` when files contain bytes that the platform
        default cannot decode.
    errors : str, optional
        Decoding error policy handed to ``open`` (e.g. ``'replace'``).
        ``None`` keeps Python's default strict behaviour.

    Returns
    -------
    list of str
        One string per file, in the same order as ``fileList``; an empty
        list when ``fileList`` is empty.

    Raises
    ------
    OSError
        If a file cannot be opened.
    UnicodeDecodeError
        If a file cannot be decoded with the chosen encoding/error policy.
    """
    corpus = []

    for filePath in fileList:
        # The context manager guarantees the handle is closed even if read() fails.
        with open(filePath, encoding=encoding, errors=errors) as f_input:
            corpus.append(f_input.read())
    return corpus

In [84]:
# Load the raw article text for each category (one string per file).
business_corpus, entertainment_corpus, politics_corpus, tech_corpus = map(
    readData,
    (businessFileList, entertainmentFileList, politicsFileList, techFileList),
)

In [130]:
# Load the summary text for each category (one string per file).
businesssumm_corpus, entertainmentsumm_corpus, politicssumm_corpus, techsumm_corpus = map(
    readData,
    (businessSummFileList, entertainmentSummFileList, politicsSummFileList, techSummFileList),
)

In [85]:
# Wrap each article corpus in a DataFrame with a single 'text' column.
businessDf, entertainmentDf, politicsDf, techDf = (
    pd.DataFrame(np.array(corpus), columns=['text'])
    for corpus in (business_corpus, entertainment_corpus, politics_corpus, tech_corpus)
)

In [131]:
# Wrap each summary corpus in a DataFrame with a single 'text' column.
businessSummDf, entertainmentSummDf, politicsSummDf, techSummDf = (
    pd.DataFrame(np.array(corpus), columns=['text'])
    for corpus in (businesssumm_corpus, entertainmentsumm_corpus, politicssumm_corpus, techsumm_corpus)
)

In [86]:
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language="english"):
    """Return NLTK's stop words for ``language`` as a set, loading them only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def review_to_wordlist(text, remove_stopwords=True):
    """Normalise a raw document into a single space-separated string of words.

    Steps: strip HTML markup, replace every non-ASCII-letter character with
    a space, lower-case, split on whitespace and (optionally) drop English
    stop words.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML.
    remove_stopwords : bool, default True
        When true, words found in NLTK's English stop-word list are removed.

    Returns
    -------
    str
        The remaining words joined by single spaces, or the placeholder
        ``'NULL'`` when no word survives the cleaning.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers (lxml, html5lib) happen to be installed.
    review_text = BeautifulSoup(text, "html.parser").get_text()
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 4. Optionally remove stop words (enabled by default)
    if remove_stopwords:
        # Cached: this function is called once per document.
        stops = _stopword_set("english")
        words = [w for w in words if w not in stops]
    # Never return an empty string; downstream code gets a placeholder token instead.
    if not words:
        words = ['NULL']
    return " ".join(words)

In [140]:
# Add a 'cleaned_text' column to every article frame, not just politicsDf:
# the LDA cell further down reads businessDf.cleaned_text, so cleaning a single
# frame here only works if this cell is manually edited and re-run per category.
for article_df in (businessDf, entertainmentDf, politicsDf, techDf):
    article_df['cleaned_text'] = article_df['text'].apply(review_to_wordlist)

In [142]:
# Add a 'cleaned_text' column to every summary frame, not just techSummDf:
# the business and entertainment summaries are also exported to *_Cleaned.csv
# below, so they must be cleaned in a fresh top-to-bottom run as well.
for summary_df in (businessSummDf, entertainmentSummDf, politicsSummDf, techSummDf):
    summary_df['cleaned_text'] = summary_df['text'].apply(review_to_wordlist)

In [133]:
# Export the business summaries frame to CSV (row index included).
businessSummDf.to_csv(path_or_buf='BusinessSummary_Cleaned.csv')

In [141]:
# Export the politics articles frame to CSV (row index included).
politicsDf.to_csv(path_or_buf='Politics_Cleaned.csv')

In [135]:
# Export the entertainment summaries frame to CSV (row index included).
entertainmentSummDf.to_csv(path_or_buf='EntertainmentSummary_Cleaned.csv')

In [143]:
# Export the tech summaries frame to CSV (row index included).
techSummDf.to_csv(path_or_buf='TechSummary_Cleaned.csv')

In [121]:
# Topic-model settings: vocabulary size cap, number of topics to fit,
# and how many top words to list per topic.
no_features, no_topics, no_top_words = 10000, 20, 30

# Cleaned business articles as a NumPy unicode array, ready for the vectorizer.
documents = businessDf['cleaned_text'].values.astype('U')

In [122]:
"""
Summary:
    Builds a table listing the top-weighted words of every topic in a fitted topic model
"""
def display_relevant_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted features of each topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, an iterable with one
        weight vector per topic.
    feature_names : sequence of str
        Vocabulary; position ``i`` names the feature behind weight ``i``.
    no_top_words : int
        Number of top features to list for each topic.

    Returns
    -------
    pandas.DataFrame
        Columns ``Topic_ID`` (0-based topic position) and ``Topics``
        (the top features joined by ``", "``, heaviest first).
    """
    # argsort() is ascending, so the reversed tail slice walks the
    # heaviest `no_top_words` indices from largest weight downwards.
    top_words_per_topic = [
        ", ".join(feature_names[idx] for idx in weights.argsort()[:-no_top_words - 1:-1])
        for weights in model.components_
    ]
    topic_ids = list(range(len(top_words_per_topic)))
    return pd.DataFrame({'Topic_ID': topic_ids, 'Topics': top_words_per_topic})

# Bag-of-words term counts: drop terms appearing in more than 95% of the documents
# or in fewer than 2 documents, and cap the vocabulary at `no_features` terms.
tf_vectorizer = CountVectorizer(max_df=0.95, max_features=no_features, stop_words='english', min_df=2)
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Online variational LDA; random_state is fixed so the topics are reproducible.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=1)
lda_new = lda.fit(tf)
# Summarise each topic by its top words and persist the table for later cells.
lda_topic_df = display_relevant_topics(lda_new, tf_feature_names, no_top_words)
lda_topic_df.to_csv('Business_topics.csv')

In [126]:
# Reload the saved topic table and display only the word lists:
# 'Unnamed: 0' is the row index that to_csv wrote out, 'Topic_ID' the topic number.
business_topics = pd.read_csv('Business_topics.csv')
business_topics.drop(columns=['Unnamed: 0', 'Topic_ID'])

Unnamed: 0,Topics
0,"food, ethiopia, tonnes, assistance, agriculture, crop, fao, crops, said, emergency, million, year, cereals, indirectly, prolonged, company, recommend, car, production, southern, calculated, india, domestic, locally, survival, new, contributed, recommended, mr, needed"
1,"mini, bmw, rover, cars, factory, ferrari, produced, cooper, oxford, maserati, car, launched, model, mg, cheapest, longbridge, fiat, automotive, new, invested, initially, said, brand, waiting, producing, mercedes, bn, euros, romeo, machinery"
2,"boeing, orange, brokers, airbus, telecom, lr, planes, said, fsa, contracts, commissions, aircraft, pentagon, lloyd, plane, customers, france, colour, ms, force, mr, orders, new, centres, bn, insurance, company, air, airlines, contract"
3,"fao, dollar, said, agricultural, developing, bn, subsidies, euro, countries, commodity, trade, current, situated, reasonably, deficit, slide, tariffs, firm, market, analysts, economy, growth, record, conclude, fundamentals, liberalise, barriers, lies, yen, hands"
4,"said, jobs, year, growth, economy, new, financial, economic, japan, january, low, month, president, world, job, unemployment, fall, card, deal, firm, bn, rise, firms, bank, oil, japanese, added, december, turkcell, high"
5,"beer, said, alternatives, breweries, draft, drink, drinks, market, year, bn, brewers, profits, brewery, turns, alcohol, launches, seawater, derived, prices, spirits, low, cost, mainstay, sweet, new, established, combines, taxed, jobs, nature"
6,"said, year, bn, fuel, quarter, rise, revenues, tax, government, reuters, airlines, ba, company, sales, economy, group, yukos, profits, cost, eu, chief, growth, firm, mr, costs, executive, long, ministers, new, aid"
7,"said, year, nigeria, people, government, work, quarter, million, growth, rates, economy, germany, figures, rate, production, rise, new, announced, german, jobless, unemployment, crop, total, rose, world, told, country, sector, statistics, hopes"
8,"fiat, gm, car, motors, engines, alfa, romeo, marchionne, auto, outright, alliance, agnelli, opel, said, recall, sell, maserati, rates, buy, wagoner, stake, sergio, chevrolet, vehicles, saab, ferrari, rick, carmaker, safety, gianni"
9,"lending, rupee, year, mortgage, bn, figures, said, bank, india, december, rating, november, grade, upgrade, market, approvals, currency, basically, rose, remittances, association, bba, sessions, rise, analysts, home, january, increase, level, romania"
