In [34]:
import os
import urllib.request

from bs4 import BeautifulSoup
import pandas as pd, numpy as np
from inscriptis import get_text
import warnings, matplotlib.pyplot as plt
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import preprocess_string

In [7]:
def getsublink(link):
    """Scrape a page and every absolute http(s) page it links to.

    The page at ``link`` is downloaded once; its text becomes the first row
    and its ``<a href>`` targets are then fetched one by one.

    Parameters
    ----------
    link : str
        URL of the root page.

    Returns
    -------
    pandas.DataFrame
        Columns ``link`` (URL) and ``data`` (plain text produced by
        ``get_text``). Row 0 is the root page, followed by one row per
        sub-link that could be fetched, in document order (duplicate links
        are kept).

    Raises
    ------
    urllib.error.URLError, UnicodeDecodeError
        If the root page itself cannot be fetched or is not valid UTF-8.
        Failures on sub-links are reported as warnings and skipped.
    """
    # Download the root page once and reuse the bytes for both the text
    # extraction and the link discovery (the old code requested it twice).
    with urllib.request.urlopen(link) as resp:
        raw = resp.read()
        charset = resp.info().get_param('charset')
    records = [{'link': link, 'data': get_text(raw.decode('utf-8'))}]

    soup = BeautifulSoup(raw, "lxml", from_encoding=charset)
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # Only absolute web links can be opened; startswith is also safe on
        # empty or one-character hrefs, which used to raise IndexError.
        if not href.startswith(('http://', 'https://')):
            continue
        try:
            with urllib.request.urlopen(href) as sub_resp:
                text = get_text(sub_resp.read().decode('utf-8'))
        except Exception as exc:
            # Best effort: one dead or non-UTF-8 link must not end the crawl,
            # but it should not vanish silently either.
            warnings.warn('skipping %s: %s' % (href, exc))
            continue
        records.append({'link': href, 'data': text})

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    return pd.DataFrame(records, columns=['link', 'data'])

# Smoke-test the crawler on a single article and preview the first rows.
output = getsublink(link="https://edition.cnn.com/2018/07/16/asia/india-whatsapp-lynching-intl/index.html")
output.head(n=5)

Unnamed: 0,link,data
0,https://edition.cnn.com/2018/07/16/asia/india-...,Breaking News\nWorld+\nU.S.AfricaAmericasAsiaA...
1,http://money.cnn.com/INTERNATIONAL/,* Markets\n * Economy\n * Companies\n * T...
2,http://money.cnn.com/technology/,\nPlease enable JavaScript to view CNN Tech.\n...
3,http://go.cnn.com/?stream=cnni&sr=watchLiveHPb...,
4,http://money.cnn.com/INTERNATIONAL/,* Markets\n * Economy\n * Companies\n * T...


In [23]:
# Seed pages to crawl, one URL per line.
hyperlinks = [
    'https://www.ci.san-bernardino.ca.us/',
    'http://www.sbcounty.gov/main/default.aspx',
    'https://www.sbsun.com/local-news/',
    'https://www.redlandsdailyfacts.com/location/california/san-bernardino-county/',
    'https://www.ci.san-bernardino.ca.us/news/',
    'https://www.sbcusd.com/news/',
    'http://abc30.com/place/san-bernardino-county/',
    'https://www.dailybulletin.com/location/california/san-bernardino-county/',
]

#get all dataframes for each hyperlink...
def get_dataframes(hyperlinks):
    """Crawl each seed URL and pair it with the frame ``getsublink`` returns.

    Parameters
    ----------
    hyperlinks : iterable of str
        Seed URLs, crawled in the order given.

    Returns
    -------
    list of tuple
        One ``(hyperlink, result_of_getsublink)`` pair per seed URL.
    """
    print('before list comprehension')
    pairs = []
    for url in hyperlinks:
        pairs.append((url, getsublink(url)))
    print('after list comprehension')
    return pairs

# Crawl every seed URL; each entry of `dataframes` is a (hyperlink, DataFrame) pair.
dataframes = get_dataframes(hyperlinks)

before list comprehension
after list comprehension


In [24]:
def append_dataframes(dataframes, num_topics=100, random_state=None):
    """Fit one LDA model per crawled site and attach per-page topic columns.

    For every ``(hyperlink, dataframe)`` pair the ``data`` column is
    tokenised with ``preprocess_string``, an ``LdaModel`` is trained on the
    resulting bag-of-words corpus, and two columns are added:

    * ``topics`` - for each page, the most probable word of every topic
      gensim assigns to that page;
    * ``probabilities`` - the matching topic probabilities, same order.

    Pages with no usable text get empty lists in both columns.

    Parameters
    ----------
    dataframes : iterable of (str, pandas.DataFrame)
        Pairs as produced by ``get_dataframes``; each frame needs a ``data``
        column of strings.
    num_topics : int, optional
        Number of LDA topics (default 100, gensim's own default).
    random_state : int or None, optional
        Seed forwarded to ``LdaModel``; pass an int for repeatable topics.

    Returns
    -------
    list of (str, pandas.DataFrame)
        Same hyperlinks, each paired with a *copy* of its frame carrying the
        two new columns. The input frames are left untouched so the cell can
        be re-run safely.
    """
    new_dataframes = []
    for hyperlink, dataframe in dataframes:
        token_lists = [preprocess_string(text) for text in dataframe['data']]
        c_dict = Dictionary(token_lists)
        c_corp = [c_dict.doc2bow(tokens) for tokens in token_lists]

        topic_list = [[] for _ in c_corp]
        prob_list = [[] for _ in c_corp]
        # LdaModel raises on an empty vocabulary, so only fit when at least
        # one page produced tokens; otherwise every page keeps empty lists.
        if len(c_dict) > 0:
            # id2word ties the model to the real vocabulary so that
            # show_topic returns words instead of stringified token ids.
            lda = LdaModel(c_corp, id2word=c_dict, num_topics=num_topics,
                           random_state=random_state)
            top_word = {}  # topic id -> label, looked up once per topic
            for row, doc in enumerate(c_corp):
                for topic_id, prob in lda.get_document_topics(doc):
                    if topic_id not in top_word:
                        # A topic id is NOT a dictionary token id; the label
                        # must come from the topic's own word distribution.
                        top_word[topic_id] = lda.show_topic(topic_id, topn=1)[0][0]
                    topic_list[row].append(top_word[topic_id])
                    prob_list[row].append(prob)

        enriched = dataframe.copy()
        # Index explicitly so the new columns line up even when the frame
        # does not carry a default RangeIndex.
        enriched['topics'] = pd.Series(topic_list, index=enriched.index, dtype=object)
        enriched['probabilities'] = pd.Series(prob_list, index=enriched.index, dtype=object)
        new_dataframes.append((hyperlink, enriched))
    return new_dataframes
        
# Add the 'topics' and 'probabilities' columns to every crawled frame.
new_dataframes = append_dataframes(dataframes)

In [44]:
##output all pandas data into '.csv' file - plot relevant data and output to '.png' file
def output_csv(new_dataframes, output_dir='hyperlinkOutput'):
    """Write each site's frame to CSV and its topic bar chart to PNG.

    For every ``(hyperlink, dataframe)`` pair a file stem is derived from the
    hyperlink by dropping characters listed in ``bad_chars``; the stem is
    printed, the frame is saved as ``<output_dir>/<stem>.csv`` and a bar
    chart of all pages' topics against their probabilities is saved as
    ``<output_dir>/<stem>.png``.

    Parameters
    ----------
    new_dataframes : iterable of (str, pandas.DataFrame)
        Pairs as returned by ``append_dataframes``; each frame needs the
        ``topics`` and ``probabilities`` columns.
    output_dir : str, optional
        Folder for the generated files (default ``'hyperlinkOutput'``). It is
        created when missing.

    Returns
    -------
    None
    """
    bad_chars = {'*', '[', ']', ':', ';', '|', '=', ',', '/', '\\'}
    # A fresh checkout has no output folder; create it instead of letting
    # to_csv fail on the first site.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for hyperlink, dataframe in new_dataframes:
        l_hyper = ''.join(char for char in hyperlink if char not in bad_chars)
        print(l_hyper)
        f_name = os.path.join(output_dir, l_hyper + '.csv')
        p_name = os.path.join(output_dir, l_hyper + '.png')
        dataframe.to_csv(f_name)

        # Own figure per site: nothing is drawn onto a leftover pyplot
        # figure, and the figure is released even if saving fails.
        fig, ax = plt.subplots()
        try:
            for topic, prob in zip(dataframe['topics'], dataframe['probabilities']):
                ax.bar(topic, prob)
            ax.set_xlabel('topics')
            ax.set_ylabel('frequency index')
            fig.savefig(p_name)
        finally:
            plt.close(fig)

# Write one CSV and one PNG per seed URL; the sanitised file stems are printed below.
output_csv(new_dataframes)

httpswww.ci.san-bernardino.ca.us
httpwww.sbcounty.govmaindefault.aspx
httpswww.sbsun.comlocal-news
httpswww.redlandsdailyfacts.comlocationcaliforniasan-bernardino-county
httpswww.ci.san-bernardino.ca.usnews
httpswww.sbcusd.comnews
httpabc30.complacesan-bernardino-county
httpswww.dailybulletin.comlocationcaliforniasan-bernardino-county
