In [33]:
import json
import re
import sqlite3
import time
from urllib.parse import urljoin

import bs4
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Auxiliary functions

In [25]:
def get_soup(url, timeout=30, parser=None):
    """
    Return a BeautifulSoup from the input `url`.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to `requests.get`.  Without a timeout a stalled connection
        would block the whole scrape indefinitely.
    parser : str, optional
        Parser name forwarded to BeautifulSoup (e.g. 'html.parser').  The
        default None leaves the choice to BeautifulSoup, as before.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed body of the response.  The HTTP status code is not checked.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds `timeout`.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The proxy mapping only has an entry for the 'http' scheme.
    proxies = {'http': 'http://206.189.157.23'}
    response = requests.get(url, headers=headers, proxies=proxies,
                            timeout=timeout)
    return bs4.BeautifulSoup(response.content, features=parser)

def get_lifestyle(url, cat):
    """
    Scrape the articles listed on the page at `url` and label them `cat`.

    Every ``<h3 class="title">`` inside the first ``<div id="primary">`` of
    the listing page is treated as one article.  Each article page is
    downloaded once and used both for the publish time (searched for in the
    page's ``<div class="meta">`` elements) and for the text (all ``<p>``
    elements, stripped and joined by single spaces).  Articles without a
    recognisable publish time, or without a link, are skipped.

    Parameters
    ----------
    url : str
        Address of the listing page.
    cat : str
        Value stored in the `category` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `link`, `publish_time`, `text`, `category`, one row
        per kept article.  Empty (but with these columns) when the listing
        has no ``div#primary`` or no usable article.
    """
    # hh:mm AM|PM Month dd, yyyy.  Both AM and PM stamps are accepted;
    # matching only "PM" would silently drop every morning article.
    time_pattern = r'\d\d:\d\d\s[AP]M\s\w+\s\d\d,\s\d{4}'

    time.sleep(0.01)
    containers = get_soup(url).find_all('div', id='primary')
    headings = (containers[0].find_all('h3', class_='title')
                if containers else [])

    records = []
    for heading in headings:
        anchor = heading.find('a')
        if anchor is None:
            continue
        url_article = anchor.get('href')
        # One download per article serves both the timestamp and the text.
        article = get_soup(url_article)
        stamps = re.findall(time_pattern,
                            str(article.find_all('div', class_="meta")))
        if not stamps:
            continue
        records.append({
            'title': anchor.get_text(),
            'link': url_article,
            'publish_time': stamps[0],
            'text': ' '.join([p.text.strip()
                              for p in article.find_all('p')]),
        })

    df = pd.DataFrame(records,
                      columns=['title', 'link', 'publish_time', 'text'])
    df['category'] = cat
    return df

def get_all_lifestyle(url, df, cat):
    """
    Get the article information based on a `url` and concatenate to `df` in
    the given category `cat` from the most recent until the start of 2018.

    Starting at `url`, the ``<a class="next page-numbers">`` link is followed
    page by page.  Each page is scraped with `get_lifestyle`, appended to the
    rows gathered so far and de-duplicated.  Paging stops when

    * the last row of the combined frame has a publish year before 2018
      (that page is still included in the result), or
    * anything goes wrong, e.g. there is no "next" link or a download fails.

    Parameters
    ----------
    url : str
        Address of the listing page whose "next" link is followed first.
    df : pandas.DataFrame
        Rows already scraped; needs a `publish_time` column whose values end
        in a four-digit year.
    cat : str
        Category label forwarded to `get_lifestyle`.

    Returns
    -------
    pandas.DataFrame
        `df` extended by the rows of the pages that were visited.  If the
        very first step fails, `df` itself is returned.
    """
    # Start from the caller's rows so there is always a frame to return,
    # even when the first page has no "next" link or cannot be fetched.
    df_nextnext = df
    while True:
        try:
            time.sleep(0.01)
            next_link = get_soup(url).find('a', class_='next page-numbers')
            url_next = next_link.get('href')
            df_next = get_lifestyle(url_next, cat)
            df_nextnext = (pd.concat([df, df_next], ignore_index=True)
                           .drop_duplicates())
            last_year = int(df_nextnext['publish_time'].values[-1][-4:])
            if last_year < 2018:
                break
            df = df_nextnext.copy()
            url = url_next
        except Exception:
            # Best effort: a missing link or failed download ends paging.
            # KeyboardInterrupt is deliberately not swallowed.
            break
    return df_nextnext

def get_lifestyle_df(lifestyle):
    """
    Scrape every category in `lifestyle` and stack the results.

    `lifestyle` maps a category label to the address of that category's
    listing page.  For each entry the first page is scraped with
    `get_lifestyle`, the following pages with `get_all_lifestyle`, and the
    rows are appended to one DataFrame that is returned at the end.  An
    empty mapping yields an empty DataFrame.
    """
    combined = pd.DataFrame()
    time.sleep(0.01)
    for cat, url in lifestyle.items():
        first_page = get_lifestyle(url, cat)
        whole_category = get_all_lifestyle(url, first_page, cat)
        combined = pd.concat([combined, whole_category], ignore_index=True)
    return combined

def get_all_lifestyle_search(url):
    """
    Collect all titles of articles from `url`.

    The page at `url` is downloaded, the text of every
    ``<h3 class="title">`` inside the first results block is collected, and
    the "load more" link is followed to the next page.  This repeats until a
    page lacks the results block or the link, or a download fails.

    Parameters
    ----------
    url : str
        Address of the first results page.

    Returns
    -------
    list of str
        Titles in the order they were encountered; empty if the first page
        could not be processed.
    """
    block_class = ("block-wrap block-wrap-24 block-css-0"
                   " block-wrap-classic block-wrap-no-9 elements"
                   "-design-1 block-skin-0 tipi-box block-mason"
                   "ry-style block-masonry-no-v block-masonry-"
                   "design-1 block-masonry-wrap clearfix")
    more_class = ('inf-load-more block-loader'
                  ' tipi-button tipi-button-border')

    title_list = []
    time.sleep(0.01)
    while True:
        try:
            # One download per page serves both the titles and the link.
            page = get_soup(url)
            block = page.find_all('div', class_=block_class)[0]
            more_title_list = [heading.get_text()
                               for heading in block
                               .find_all('h3', class_="title")]
            title_list.extend(more_title_list)
            url = page.find('a', class_=more_class).get('href')
        except Exception:
            # Best effort: the first page that cannot be processed ends the
            # crawl.  KeyboardInterrupt is deliberately not swallowed.
            break
    return title_list

def get_lifestyle_search(soup):
    """
    Extract the relevant information on the articles listed in `soup`.

    Every ``<h3 class="title">`` found in `soup` is treated as one article.
    Each article page is downloaded once and used both for the publish time
    (searched for in the page's ``<div class="meta">`` elements) and for the
    text (all ``<p>`` elements, stripped and joined by single spaces).
    Articles without a recognisable publish time, or without a link, are
    skipped.

    Parameters
    ----------
    soup : bs4.BeautifulSoup or bs4.element.Tag
        Parsed listing; only its `find_all` method is used.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `link`, `publish_time`, `text`, one row per kept
        article.
    """
    # hh:mm AM|PM Month dd, yyyy.  Both AM and PM stamps are accepted;
    # matching only "PM" would silently drop every morning article.
    time_pattern = r'\d\d:\d\d\s[AP]M\s\w+\s\d\d,\s\d{4}'

    records = []
    time.sleep(0.01)
    for heading in soup.find_all('h3', class_='title'):
        anchor = heading.find('a')
        if anchor is None:
            continue
        url_article = anchor.get('href')
        # One download per article serves both the timestamp and the text.
        article = get_soup(url_article)
        stamps = re.findall(time_pattern,
                            str(article.find_all('div', class_="meta")))
        if not stamps:
            continue
        records.append({
            'title': anchor.get_text(),
            'link': url_article,
            'publish_time': stamps[0],
            'text': ' '.join([p.text.strip()
                              for p in article.find_all('p')]),
        })

    return pd.DataFrame(records,
                        columns=['title', 'link', 'publish_time', 'text'])

def get_mentalhealth_search(url):
    """
    Get the relevant information of mental health related articles and
    return the corresponding DataFrame.

    The page at `url` is downloaded, the articles in its first results block
    are scraped with `get_lifestyle_search` and appended to the rows
    gathered so far, and the "load more" link is followed to the next page.
    Paging stops when

    * the last gathered row has a publish year before 2018 (that page is
      still included in the result), or
    * anything goes wrong, e.g. the results block or the link is missing or
      a download fails.

    Parameters
    ----------
    url : str
        Address of the first results page.

    Returns
    -------
    pandas.DataFrame
        All rows gathered before paging stopped; an empty DataFrame if the
        first page could not be processed.
    """
    block_class = ("block-wrap block-wrap-24 block-css-0"
                   " block-wrap-classic block-wrap-no-9 elements"
                   "-design-1 block-skin-0 tipi-box block-mason"
                   "ry-style block-masonry-no-v block-masonry-"
                   "design-1 block-masonry-wrap clearfix")
    more_class = ('inf-load-more block-loader'
                  ' tipi-button tipi-button-border')

    df = pd.DataFrame()
    time.sleep(0.01)
    while True:
        try:
            # One download per page serves both the articles and the link.
            page = get_soup(url)
            block = page.find_all('div', class_=block_class)[0]
            df = pd.concat([df, get_lifestyle_search(block)],
                           ignore_index=True)
            if int(df['publish_time'].values[-1][-4:]) < 2018:
                break
            url = page.find('a', class_=more_class).get('href')
        except Exception:
            # Best effort: the first page that cannot be processed ends the
            # crawl.  KeyboardInterrupt is deliberately not swallowed.
            break
    return df

def create_database():
    """
    Create a SQL Database of the data in inquirer.csv as a table.

    The rows of inquirer.csv are written to a table named `inquirer` inside
    the SQLite file inquirer.db.  An existing `inquirer` table is dropped
    and recreated, so no separate CREATE TABLE statement is needed.

    Returns
    -------
    sqlite3.Connection
        Open connection to inquirer.db with the write already committed.
        The caller is responsible for closing it.

    Raises
    ------
    FileNotFoundError
        If inquirer.csv does not exist.
    """
    # Read first so an unreadable CSV does not leave an empty database
    # file (and an open connection) behind.
    inq = pd.read_csv('inquirer.csv')

    conn = sqlite3.connect("inquirer.db")
    try:
        # The table is called `inquirer`; passing the file name here would
        # create a second table literally named "inquirer.db".
        inq.to_sql('inquirer', conn, if_exists='replace', index=False)
        conn.commit()
    except Exception:
        conn.close()
        raise
    return conn

# Web scraping

In [8]:
# Get category keys.
with open('inquirer_domains.json') as file:
    inq_dict = json.load(file)

lifestyle = inq_dict['lifestyle']

# Generate a dataframe of lifestyle articles in inquirer.
# The scrape is slow, so its result is cached in lifestyle.csv and reused on
# later runs.  Only a missing or empty cache triggers a new scrape; any other
# read error is left to surface instead of being hidden.
try:
    df_lifestyle = pd.read_csv('lifestyle.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Keep exactly the rows that go into the cache (de-duplicated, 2018 or
    # later) so a fresh scrape and a cached run give the same frame.
    df_lifestyle = (
        get_lifestyle_df(lifestyle)
        .drop_duplicates()
        .assign(year=lambda d: d['publish_time'].str[-4:].astype(int))
        .loc[lambda d: d['year'] >= 2018]
        .reset_index(drop=True)
    )
    df_lifestyle.to_csv('lifestyle.csv', index=False)

In [6]:
url_search = 'https://lifestyle.inquirer.net/?s=mental+health'

# Get list of articles related to mental health.
# The result is cached in mental_health_articles.csv; only a missing or empty
# cache triggers a new scrape.
# NOTE: the variable is spelled `df_mentalheath` because the following cells
# refer to it by that name.  The scrape result must be bound to the same
# name, otherwise a cache miss ends in a NameError.
try:
    df_mentalheath = pd.read_csv('mental_health_articles.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    df_mentalheath = get_mentalhealth_search(url_search)
    df_mentalheath.to_csv('mental_health_articles.csv', index=False)

In [9]:
# Flag the lifestyle articles whose title also shows up in the mental health
# search results: 1 for a match, 0 for everything else.
_title_in_search = df_lifestyle['title'].isin(df_mentalheath['title'].tolist())
df_lifestyle.loc[_title_in_search, 'mh_related'] = 1
df_lifestyle['mh_related'] = df_lifestyle['mh_related'].replace(np.nan, 0)

# Every article returned by the search itself counts as related.
df_mentalheath['mh_related'] = 1

In [10]:
# Concatenate the dataframes and construct the complete corpus.
# Steps: stack both sources, drop the helper columns, make the flag an
# integer, remove exact duplicate rows, parse the publish time
# (e.g. "05:10 PM March 03, 2019") and order the rows newest first.
inq = (
    pd.concat([df_lifestyle, df_mentalheath])
    .drop(columns=['category', 'year'])
    .astype({'mh_related': int})
    .drop_duplicates()
    .assign(publish_time=lambda d: pd.to_datetime(
        d['publish_time'], format='%I:%M %p %B %d, %Y'))
    [['publish_time', 'title', 'link', 'text', 'mh_related']]
    .sort_values('publish_time', ascending=False)
    .reset_index(drop=True)
)

# Let pandas open the file itself: it then controls encoding and line
# endings.  A text handle opened without newline='' can produce doubled line
# endings, and the platform default encoding may not cover the article text.
inq.to_csv('inquirer.csv', index=False)