# Setup

After collecting URLs (e.g. via 2.1_Requests_Peoples_Daily), this script collects the text content for each URL.

It uses multi-threading, so it runs much faster than other approaches.

It also contains some very basic preprocessing: two functions that are essential at a foundational level. Still, I am considering moving those elsewhere.

NEXT STEP: turn this into a .py file as well, for future seamless running of the URL requests followed by this text collection.

In [1]:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd
from fake_useragent import UserAgent
import pandas as pd
import codecs

import concurrent.futures # for multi-threading
import time

import os
from os import listdir
from os.path import isfile, join

from urllib3.exceptions import ReadTimeoutError
from requests.exceptions import ProxyError, HTTPError, ConnectionError


In [2]:
# Folder the collected-article CSV is written to
OUTPUT_PATH =  './DATA/'
# Folder the URL-list CSV is read from
INPUT_PATH = './INPUT/'

In [3]:
# Make the output folder if it does not exist (no need to do this for input: it must exist, create that elsewhere).
# exist_ok=True makes this a single call instead of a separate exists-check followed by a create.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
# Utility cell: lists every file in INPUT_PATH with its index, to help figure out which one to use for URL_FILE when there are lots
csvfiles = [name for name in listdir(INPUT_PATH) if isfile(join(INPUT_PATH, name))]
print('Index, Filename')
print(list(enumerate(csvfiles)))

In [5]:
# Name of the CSV inside INPUT_PATH that lists the pages to scrape (its 'title' and 'url' columns are used below)
URL_FILE = 'peoples_request_data.csv'

In [6]:
# Name of the CSV inside OUTPUT_PATH that scraped rows are appended to
OUTPUT_FILE = "peoples_daily_collected.csv"

In [7]:
# Initializing an empty df with column names for continuous saving during collection.
# Only done when the output file does not exist yet, so an existing file is left untouched.

if not os.path.exists(OUTPUT_PATH + OUTPUT_FILE):
    df = pd.DataFrame(columns=['title','url', 'date','text'])
    df.to_csv(OUTPUT_PATH + OUTPUT_FILE, index=False)  # the frame has no rows, so only the header line is written

# Main collection function

In [8]:
def _clean_field(value, comma_replacement):
    """Flatten *value* to a single comma-free line so it fits in one CSV cell."""
    text = str(value).strip().replace(',', comma_replacement).strip()
    return text.replace('\r', ' ').replace('\n', ' ')


def _append_row(title, url, date, body):
    """Append one ``title,url,date,text`` record to OUTPUT_PATH + OUTPUT_FILE."""
    import csv  # local import: nothing else in this notebook needs it

    # csv.writer quotes any field that still contains a quote or separator
    # (e.g. inside the URL), so one record always stays one parseable row.
    with open(OUTPUT_PATH + OUTPUT_FILE, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow([title, url, date, body])


def _extract_body(page_soup):
    """Return the text of the first known article container, or 'EMPTY'."""
    classes = ['box_con', 'text', 'show_text', 'rm_txt_con cf', 'content', 'rwb_zw',
               'content clear clearfix']

    for css_class in classes:
        node = page_soup.find('div', class_=css_class)
        if node is not None:
            return node.text

    return 'EMPTY'


def sub_scrape(url, title):
    """Download one article page and append its text to the output CSV.

    Exactly one row ``title,url,date,text`` is appended to
    OUTPUT_PATH + OUTPUT_FILE per call:

    * If the page cannot be fetched, the date and text columns both hold a
      marker (``TIMEOUT``, ``HTTPError``, ``DEAD`` or ``CONNECTION_ERROR``)
      and the function returns straight away.
    * Otherwise the date column holds ``PLACEHOLDER`` and the text column
      holds the article body, ``404`` when the site redirected to its error
      page or front page, or ``EMPTY`` when no known container matched.

    Commas and line breaks are removed from the title and body before writing.

    Args:
        url: Address of the article page.
        title: Title from the URL list; written next to the scraped text.

    Returns:
        None. The result is the appended CSV row.
    """
    # NOTE(review): elsewhere in this notebook this function is submitted to a
    # thread pool, and the append in _append_row takes no lock. Consider
    # guarding the write with a shared lock if interleaved rows ever show up.
    main = 'http://www.people.com.cn/'
    print('Scraping ' + url)

    headers = {'User-Agent': UserAgent().random}
    title = _clean_field(title, '')

    response = None
    page_soup = None
    failure = None

    try:
        with requests.get(url, headers=headers, timeout=10) as response:
            page_soup = soup(response.content, 'lxml', from_encoding=response.encoding)  # "iso-8859-1"

    # Timeouts are checked before ConnectionError because requests' connect
    # timeout is a subclass of both.
    except (requests.exceptions.Timeout, ReadTimeoutError):
        print('TIMEOUT. Passing...')
        failure = 'TIMEOUT'

    except HTTPError:
        print('HTTPError. Passing...')
        failure = 'HTTPError'

    except ConnectionError as exc:
        if 'Errno 113' in str(exc):
            print('Dead link. Passing...')
            failure = 'DEAD'
        else:
            # The original called a connection_retry() helper here that is not
            # defined in this notebook. The row is marked instead so these
            # URLs can be found and re-run later.
            print('Connection error. Passing...')
            failure = 'CONNECTION_ERROR'

    if failure is not None:
        _append_row(title, url, failure, failure)
        return

    time.sleep(1)
    print('Response good from ' + response.url)

    if response.url in ('http://www.people.cn/404/error.html', main):
        # A redirect to the error page or the front page means the article is gone.
        print('Page deleted by website owner. Passing...')
        body = '404'
    else:
        body = _extract_body(page_soup)

    # Title is already in the original request data, and the date is derived
    # from the URL after the scrape, so the date column is just a filler here.
    _append_row(title, url, 'PLACEHOLDER', _clean_field(body, ' '))

    print('Thread done')
    time.sleep(2)

In [9]:
## For testing
# for index, row in df.iterrows():
#     sub_scrape(row.url, row.title)

# Load url data

In [10]:
# Read the list of pages to scrape from INPUT_PATH + URL_FILE
df = pd.read_csv(INPUT_PATH + URL_FILE, encoding='utf-8')

In [11]:
# We don't need all the columns.
# Some could have value for your research though, so now is the point to check and decide whether to include them.
# TODO: add subtitle; merge it with the title column though

df = df[['title', 'url']]

In [13]:
## For testing
#df = df[:20]

# Main Scrape

In [14]:
def batch(df, size=10):
    """Split *df* into consecutive chunks of at most *size* rows.

    Args:
        df: DataFrame to split (sliced with ``.iloc``).
        size: Maximum number of rows per chunk; must be positive.

    Returns:
        List of DataFrame slices in the original row order. The last chunk
        holds whatever remains; an empty *df* gives an empty list.

    Raises:
        ValueError: If *size* is not positive.
    """
    if size <= 0:
        raise ValueError('size must be a positive integer')

    # Slicing straight off the range also covers frames with `size` rows or
    # fewer, which the earlier max()-based boundary list could not handle.
    return [df.iloc[start:start + size] for start in range(0, len(df), size)]


In [15]:
# Split the URL table into consecutive chunks (batch's default size is 10 rows)
list_of_dfs = batch(df)

In [None]:
# Scrape batch by batch. Each batch gets its own thread pool; leaving the
# `with` block waits for every submitted job before the next batch starts.
no_threads = 10
counter = 1

# The loop variable is deliberately not called `df`, so the full URL table
# stays available under that name after the loop.
for batch_df in list_of_dfs:

    with concurrent.futures.ThreadPoolExecutor(max_workers=no_threads) as executor:

        # Keep every future: an exception raised inside a worker is stored on
        # its future and would otherwise never be seen.
        jobs = {}
        for index, row in batch_df.iterrows():
            print(f"Thread starting for: {row.url}")

            jobs[executor.submit(sub_scrape, row.url, row.title)] = row.url

        for job in concurrent.futures.as_completed(jobs):
            error = job.exception()
            if error is not None:
                print(f"Scrape failed for {jobs[job]}: {error!r}")

    print('Batch done: ' + str(counter))

    counter += 1

    time.sleep(10)  # pause before starting the next batch


# Processing - Maybe move to other script, but this is pretty basic and essential

### Load collected data

In [17]:
# Read back everything that has been written to the output file
df = pd.read_csv(OUTPUT_PATH + OUTPUT_FILE) 

### Create Dates

In [20]:
# Peoples Daily has the date in the url - this extracts it, saving a step elsewhere
# Note: widen the year range through start_year / end_year as needed! Recall end_year is exclusive, so it needs to be +1

def date_from_url(url, start_year=2000, end_year=2025):
    """Extract an ``MM-DD-YYYY`` date from a URL containing ``/YYYY/MMDD/``.

    Args:
        url: Article URL. Anything that is not a string (e.g. NaN from a
            missing cell) gives ``None``.
        start_year: First year to look for.
        end_year: Year to stop before (exclusive, as in ``range``).

    Returns:
        ``'MM-DD-YYYY'`` for the first year in range whose ``/YYYY/`` segment
        is followed by a four-digit segment. If a year segment is found but
        is not followed by four digits, the failsafe ``'01-01-YYYY'`` for the
        first such year. ``None`` when no year in range occurs in the URL.
    """
    if not isinstance(url, str):
        return None

    failsafe = None

    for year in range(start_year, end_year):
        year = str(year)
        split_on = '/' + year + '/'

        if split_on not in url:
            continue

        # The path segment right after the last '/YYYY/' should be 'MMDD'.
        month_day = url.split(split_on)[-1].split('/')[0]

        if len(month_day) == 4 and month_day.isdigit():
            return month_day[:2] + '-' + month_day[2:] + '-' + year

        # Year found but no usable month/day. Remember the failsafe, but keep
        # looking in case a later year yields a complete date.
        if failsafe is None:
            failsafe = '01-01-' + year

    return failsafe

In [None]:
# Derive the date column from each row's url
df['date'] = df['url'].apply(date_from_url)

### Convert Encoding

In [23]:
def convert(x):
    """Re-decode text that was read as ISO-8859-1 but is really EUC-CN.

    The string is turned back into bytes via ISO-8859-1 and those bytes are
    decoded as EUC-CN, dropping any byte sequence that is not valid EUC-CN.

    Args:
        x: Value from the text column.

    Returns:
        The re-decoded string. *x* is returned unchanged when it is not a
        string (e.g. NaN) or contains characters outside ISO-8859-1, which
        means it is not in the mis-decoded form this function repairs.
    """
    try:
        return x.encode('iso-8859-1').decode('EUC-CN', 'ignore')
    except (AttributeError, UnicodeError):
        return x

    # Legacy: will delete later after I recall why I checked or used this at one time
    #body = body.decode(response.encoding, "strict").encode("utf8", "strict")

In [24]:
# Run convert over the text column; titles are fine as they are, unless maybe you recollect

df['text'] = df['text'].apply(convert)

In [None]:
# leaving this final check in here: shows the last five rows

df.tail(5)

### Final save

In [29]:
# Overwriting file here! But this is a rare exception: original file without data and encoding is useless
# (CLEANED_OUTPUT is the same file name as OUTPUT_FILE, which is why saving under it overwrites the collected file)

CLEANED_OUTPUT = "peoples_daily_collected.csv"

In [None]:
# Save the processed table to OUTPUT_PATH + CLEANED_OUTPUT, without the index column
df.to_csv(OUTPUT_PATH + CLEANED_OUTPUT, index=False)