In [1]:
import requests
import os
from bs4 import BeautifulSoup
import hashlib
import pandas as pd

In [2]:
pathSM = "data/sitemap.xml"
if os.path.isfile(pathSM):
    print(pathSM, ": already done")
else:
    url = 'https://www.mottmac.com/sitemap.xml'
    r = requests.get(url, allow_redirects=True)
    open(pathSM, 'wb').write(r.content)

pathRobot = "data/robots.txt"
if os.path.isfile(pathRobot):
    print(pathRobot, ": already done")
else:
    url = 'https://www.mottmac.com/robots.txt'
    r = requests.get(url, allow_redirects=True)
    open(pathRobot, 'wb').write(r.content)

data/sitemap.xml : already done
data/robots.txt : already done


In [3]:

with open(pathSM, 'r') as f:
    data = f.read()
pages = BeautifulSoup(data, "xml")

In [4]:
urlList = []
URLs = pages.find_all('url')
for url in URLs:
    ref = url.find_all('xhtml:link', {'rel':'alternate'})[0]
    urlList.append(ref.get('href'))
urlList = list(set(urlList)) 
urlList = [x for x in urlList if not "/job" in x ]
urlList = [x for x in urlList if not "/download/" in x ]
urlList

['https://www.mottmac.com/releases/mott-macdonald-signs-placemaking-wales-charter',
 'https://www.mottmac.com/releases/mott-macdonald-supervising-construction-of-major-infrastructure-works-bahrain',
 'https://www.mottmac.com/article/11553/projects/empire-connector-pipeline',
 'https://www.mottmac.com/article/137/mott-macdonald-wins-two-new-contracts-on-us28',
 'https://www.mottmac.com/releases/mott-macdonald-response-to-uks-comprehensive-spending-review',
 'https://www.mottmac.com/article/13640/tisur-amarradero-f-port-expansion',
 'https://www.mottmac.com/article/41145/healthy-direction-for-medical-learning',
 'https://www.mottmac.com/en-US/releases/no-dig-show',
 'https://www.mottmac.com/article/12134/projects/rt-hon-herb-gray-parkway-project',
 'https://www.mottmac.com/en-US/views/moving-into-the-service-economy-with-a-systems-approach',
 'https://www.mottmac.com/article/722/mott-macdonald-appointed-design-engineer-for',
 'https://www.mottmac.com/views/climate-change-in-the-developin

In [6]:
def getTitle(row):
    if not os.path.exists("data/cache/"+row["hash"]):
        return ""
    try:
        with open("data/cache/"+row["hash"]) as fp:
            soup = BeautifulSoup(fp, 'html.parser')
        result = soup.title.text
    except:
        result = ""
    return result

def getKeywords(row):
    if not os.path.exists("data/cache/"+row["hash"]):
        return ""
    try:
        with open("data/cache/"+row["hash"]) as fp:
            soup = BeautifulSoup(fp, 'html.parser')
        result = soup.select_one('meta[name=keywords]')['content']
    except:
        result = ""
    return result

def getDescription(row):
    if not os.path.exists("data/cache/"+row["hash"]):
        return ""
    try:
        with open("data/cache/"+row["hash"]) as fp:
            soup = BeautifulSoup(fp, 'html.parser')
        result = soup.select_one('meta[name=description]')['content']
    except:
        result = ""
    return result

In [7]:
df = dict()
for url in urlList:
    df[url] = hashlib.md5(url.encode()).hexdigest()
df = pd.DataFrame.from_dict(df, orient='index').reset_index()
df.columns = ["url","hash"]
df

Unnamed: 0,url,hash
0,https://www.mottmac.com/about-us/working-towar...,612e735d41c5f43aa890819fa28ddbd6
1,https://www.mottmac.com/article/9910/beat-the-...,c7a858d66a637ca5062c80897b8bb6ec
2,https://www.mottmac.com/releases/mott-macdonal...,cb851e13e98ff6b196c066ec7cd50ac5
3,https://www.mottmac.com/releases/walney-offsho...,6cb3721effa98e06c54022839d10e569
4,https://www.mottmac.com/pakistan,beb04e7969a0b392ce88af2549827bdb
...,...,...
3980,https://www.mottmac.com/article/54099/making-s...,1ea5232d41ac2fac8fe8077196ec51bb
3981,https://www.mottmac.com/south-sudan,5d7d3e69a5c19867d215b1acf1b36352
3982,https://www.mottmac.com/en-US/article/24795/mo...,712ecc8b0e4c3605cadb7ff749af4601
3983,https://www.mottmac.com/releases/construction-...,95056e1e57a7389dd5a61c0b87b4edb8


In [8]:
for ix,row in df.iterrows():
    name = row["hash"]
    url = row["url"]

    if not os.path.exists("data/cache/"+name):
        page_sourced = requests.get(url).content 
        html_content = BeautifulSoup(page_sourced, "html.parser")
        with open("data/cache/"+name, 'w') as f:
            f.write(str(html_content))
        print(name,"saved")

07f2bdd50eb5e25caf25e3f7aca13c58 saved
728543718e7b6ba20349d2642406678e saved
a1bd1a9e467d8fe1f28b82d131302c15 saved
4b4a2658f69b7e818f925a96c9b9a357 saved
485d2a4bfffc03ef11fa6ca152fd2566 saved
850ab2d51cce80e7486afe7b3c9078c7 saved
c64e302e2986d3f6dec40dfd515ac188 saved
3f8df21a418858fbbaa6925817f3486c saved
4fd58cb636147970c043bdd0dd7ef8d4 saved
f5783aecfcade83a85bf5e366f712cfd saved
d62ea74b8d7b72353562a4c08912d020 saved
0fa14efa64c01cd3fc44f37518237290 saved
2d7e099af8d15f8a590ca0d7e2bb2b79 saved
53ae9843396e28cccf95975bd158a56c saved
e1acb55fa5295056f47aa926d540695e saved
c207c1b2065088293e0d582f15844fc8 saved
e6814a6331e25294e10d02680bec07d7 saved
1fc77c144e852b2db133853857542601 saved
2d57f7c4090201044d42d3571e7c3700 saved
469afcbea17a00f349f13bc00e9aec19 saved
92cb4fe23bbb72e6c49e623d9d676954 saved
f8307b91aa55ba0a730e38851b07407e saved
971e825bc1fd2ae8b8b3143dddb438c0 saved
7bdd6ba4ed856b20a897a63c175ae10a saved
74907cb90baf1c9bcf3aa894f11b7c61 saved
5b29ae55a091b1bab662fb3fd

In [None]:
if "title" not in list(df.columns):
    df["title"] = df.apply(lambda row:getTitle(row), axis=1)
    df["description"] = df.apply(lambda row:getDescription(row), axis=1)
    df["keywords"] = df.apply(lambda row:getKeywords(row), axis=1)
    df.to_parquet("data/pages.parquet.gzip")
df

# Cleaning the text

In [5]:
import os

import numpy as np
import pandas as pd
import io
from lxml import html
from bs4 import BeautifulSoup
import trafilatura

In [6]:
files = os.listdir('data/cache/')
file_names = []
for name in files:
    if not ('type' in name):
        file_names.append(name)

In [7]:
articles = []
errors = []
# Takes around 16s for 456 articles
# 2Min19 for 3000
for file_name in file_names:
    with io.open(f'data/cache/{file_name}', mode="r", encoding="utf-8") as f:
        try:
            mytree = html.fromstring("".join(f.readlines()))
        except Exception as e:
            print(e)
            errors.append(file_name)
            continue
        try:
            content = trafilatura.extract(mytree)
            articles.append((file_name, content))
        except Exception as e:
            print(e)
            errors.append(file_name)

Document is empty


In [8]:
len(dfContent)

NameError: name 'dfContent' is not defined

In [126]:
import re
from typing import Set
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """count the number of tokens in a string"""
    return len(tokenizer.encode(text))

def reduce_long(
    long_text: str, long_text_tokens: bool = False, max_len: int = 590
) -> str:
    """
    Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens > max_len:
        sentences = sent_tokenize(long_text.replace("\n", " "))
        ntokens = 0
        for i, sentence in enumerate(sentences):
            ntokens += 1 + count_tokens(sentence)
            if ntokens > max_len:
                return ". ".join(sentences[:i][:-1]) + "."

    return long_text

In [10]:
dfContent = pd.DataFrame(articles, columns = ['hash', 'content'])
len(dfContent)

4126

In [11]:
dfContent.to_excel("pages.xlsx")

In [130]:
dfContent["content"] = dfContent["content"].apply(lambda x: reduce_long(x, 750))
dfContent["tokens"] = dfContent["content"].apply(lambda x: count_tokens(x))

In [131]:
dfContent.to_parquet("data/content.parquet.gzip")
dfContent

Unnamed: 0,hash,content,tokens
0,a8414fff120d9c5f9073376b27f52dcf,"Mott MacDonald divisional director, Anne Kerr,...",111
1,3b1ca0a60662c15c1cd6e6fa0206f4a0,A consortium including Cheung Kong Infrastruct...,341
2,fb558366bffeb1834d6fea64b41236ba,"Mott MacDonald, the global engineering, manage...",503
3,b0b1a2444987ebf09f8fe22a170cc6dd,We were proud to be platinum sponsor of The Ec...,584
4,e8f484cc2df87052b1617e79b3dee16a,Mott MacDonald offers a wide variety of traffi...,98
...,...,...,...
4001,24438270208be5df0f6c89e56b2b7926,"Mott MacDonald, assisted by Rand Europe, has b...",416
4002,7f41c6131fb54bc4522d9f760dee5c34,"Phil Vigor, principal airport planner The stea...",523
4003,d81b9d188f526726d84381fa7b041bff,Additional growth potential has been identifie...,140
4004,8397cf71f1d5a56f2815b41bf2a61088,Mott MacDonald and ADP have been appointed by ...,559
