In [1]:
import gc
import gzip
import time
import json
import shutil
import os
import sys
import tldextract
import collections
import pandas as pd
from tqdm import tqdm
import urllib.request

# had to add:
from multiprocessing import Pool

In [3]:
storage_folder = '../data/raw/index_paths/'
file_prefix = 'https://commoncrawl.s3.amazonaws.com/'

`cc-index.paths` file has already been downloaded & extracted

In [4]:
index_file_name = 'cc-index.paths.gz'
file_name = storage_folder + index_file_name
file_unzipped = file_name.split('.gz')[0]

In [5]:
file_unzipped

'../data/index_paths/cc-index.paths'

In [7]:
def read_every_line(fname,
                    max_lines=-1):
    lines = []
    with open(fname, encoding='utf-8') as f:
        for i, l in enumerate(f):
            lines.append(l)
            if i>max_lines and max_lines>0:
                break
    return lines

In [8]:
lines = read_every_line(file_unzipped, 1e8)

In [9]:
print('{} lines extracted'.format(len(lines)))
lines = [line.replace('\n','') for line in lines]

302 lines extracted


`lines` is a list of url suffixes for compressed cdx files:

In [10]:
lines[0]

'cc-index/collections/CC-MAIN-2019-47/indexes/cdx-00000.gz'

Code to generate url and filename from this list:

In [11]:
file_dict = collections.OrderedDict()

n = 1

# iterate over the index files
for i, cc_index in enumerate(lines[:n]):
    cc_index_file = cc_index.split('/')[-1]
    file_dict[os.path.join(storage_folder, cc_index_file)] = file_prefix + cc_index

Code to extract data from the lines of a cdx index file:

In [13]:
def process_index_file_line(line):
    assert type(line)==str
    
    try:
        lst = line.replace('\n','').split()
        ts = lst[1] # some sort of identifier
        data = json.loads(line.replace('\n','').split(ts)[-1].strip())
    except:
        return ()
    
    if data['status'] != '200':
        return ()
    else:
        try:
            language = data['languages']
        except:
            language = 'none'
            
        try:
            _tldextract = tldextract.extract(data['url'])
            tup = (ts,
                   data['url'],
                   _tldextract.suffix,
                   data['length'],
                   data['offset'],
                   data['filename'],
                   language              
                )
            return tup
        except:
            return ()

Let's process the first 100 lines 

In [18]:
file_name = 'cdx-00000.gz'
df_name = file_name.replace('.gz','.feather')
file_unzipped = file_name.split('.gz')[0]

In [20]:
file_unzipped_path = storage_folder + file_unzipped

In [21]:
file_unzipped_path

'../data/index_paths/cdx-00000'

In [22]:
lines = read_every_line(file_unzipped_path, 100)

In [23]:
lines[0]

'0,0,1)/ 20191118114721 {"url": "http://1.0.0/", "mime": "text/html", "mime-detected": "text/html", "status": "403", "digest": "BSLJKUCKYURNLM3IWKP45ZYNTS3AISL6", "length": "1902", "offset": "14531", "filename": "crawl-data/CC-MAIN-2019-47/segments/1573496669755.17/crawldiagnostics/CC-MAIN-20191118104047-20191118132047-00107.warc.gz"}\n'

In [24]:
print('{} lines extracted'.format(len(lines)))

102 lines extracted


In [28]:
out = []

for line in lines:
    out.append(process_index_file_line(line))

In [31]:
out =  [_ for _ in out if _ != ()]

In [33]:
out[:5]

[('20191121011733',
  'http://13.126.102.0:8081/',
  '',
  '3914',
  '939069',
  'crawl-data/CC-MAIN-2019-47/segments/1573496670643.58/warc/CC-MAIN-20191121000300-20191121024300-00085.warc.gz',
  'eng'),
 ('20191119195841',
  'http://18.222.110.0/bruxism/',
  '',
  '30118',
  '1127143',
  'crawl-data/CC-MAIN-2019-47/segments/1573496670255.18/warc/CC-MAIN-20191119195450-20191119223450-00094.warc.gz',
  'eng'),
 ('20191119211958',
  'http://18.222.110.0/children/',
  '',
  '28678',
  '1353987',
  'crawl-data/CC-MAIN-2019-47/segments/1573496670255.18/warc/CC-MAIN-20191119195450-20191119223450-00465.warc.gz',
  'eng'),
 ('20191119195846',
  'http://18.222.110.0/facial-aesthetics/',
  '',
  '29722',
  '1025626',
  'crawl-data/CC-MAIN-2019-47/segments/1573496670255.18/warc/CC-MAIN-20191119195450-20191119223450-00406.warc.gz',
  'eng'),
 ('20191119202823',
  'http://18.222.110.0/general-dentistry/',
  '',
  '32010',
  '1108906',
  'crawl-data/CC-MAIN-2019-47/segments/1573496670255.18/warc/CC-

In [34]:
len(out)

71

In [35]:
ts_list       = [_[0] for _ in out]
url_list      = [_[1] for _ in out]
tld           = [_[2] for _ in out]
length_list   = [_[3] for _ in out]
offset_list   = [_[4] for _ in out]
warc_list     = [_[5] for _ in out]
language_list = [_[6] for _ in out]

cols = ['ts','url','tld','length','offset','warc','language']
df = pd.DataFrame(data={
    'ts':ts_list,
    'url':url_list,
    'tld':tld,
    'length':length_list,
    'offset':offset_list,
    'warc':warc_list,
    'language':language_list}
                  ,columns=cols)

df['wet'] = df.warc.apply(lambda x: x.replace('/warc/','/wet/').replace('.warc.','.warc.wet.'))
df['wet'] = df['wet'].apply(lambda x: file_prefix + x)

In [36]:
df.head()

Unnamed: 0,ts,url,tld,length,offset,warc,language,wet
0,20191121011733,http://13.126.102.0:8081/,,3914,939069,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
1,20191119195841,http://18.222.110.0/bruxism/,,30118,1127143,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
2,20191119211958,http://18.222.110.0/children/,,28678,1353987,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
3,20191119195846,http://18.222.110.0/facial-aesthetics/,,29722,1025626,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
4,20191119202823,http://18.222.110.0/general-dentistry/,,32010,1108906,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...


In [37]:
lines = read_every_line(file_unzipped_path, 1e8)
print('{} lines extracted'.format(len(lines)))

out = []

for line in lines:
    out.append(process_index_file_line(line))

out =  [_ for _ in out if _ != ()]
print('{} lines extracted'.format(len(out)))

ts_list       = [_[0] for _ in out]
url_list      = [_[1] for _ in out]
tld           = [_[2] for _ in out]
length_list   = [_[3] for _ in out]
offset_list   = [_[4] for _ in out]
warc_list     = [_[5] for _ in out]
language_list = [_[6] for _ in out]

cols = ['ts','url','tld','length','offset','warc','language']
df = pd.DataFrame(data={
    'ts':ts_list,
    'url':url_list,
    'tld':tld,
    'length':length_list,
    'offset':offset_list,
    'warc':warc_list,
    'language':language_list}
                  ,columns=cols)

df['wet'] = df.warc.apply(lambda x: x.replace('/warc/','/wet/').replace('.warc.','.warc.wet.'))
df['wet'] = df['wet'].apply(lambda x: file_prefix + x)

9815044 lines extracted
8113086 lines extracted


In [38]:
df.head()

Unnamed: 0,ts,url,tld,length,offset,warc,language,wet
0,20191121011733,http://13.126.102.0:8081/,,3914,939069,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
1,20191119195841,http://18.222.110.0/bruxism/,,30118,1127143,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
2,20191119211958,http://18.222.110.0/children/,,28678,1353987,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
3,20191119195846,http://18.222.110.0/facial-aesthetics/,,29722,1025626,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
4,20191119202823,http://18.222.110.0/general-dentistry/,,32010,1108906,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...


In [40]:
df.shape

(8113086, 8)

Same # of lines extracted after removing blanks

Save df:

In [39]:
df_name = file_name.replace('.gz','.feather')
df = df.dropna().drop_duplicates().reset_index(drop=True)
df.to_feather(df_name)

Curious that the first 5 all contain IP address as domain name

Check how many URLs this is true for:

In [42]:
import re

In [48]:
d = 'http://18.222.110.0/general-dentistry/'
re.search(r'\d', d) is not None

True

In [49]:
mini_df = df[:10]

In [52]:
import numpy as np

In [56]:
df['url_has_numbers'] = np.where(df['url'].str.contains(r'\d'), 1, 0)

In [58]:
df['url_has_numbers'].sum()

5422212

In [59]:
df['url_has_numbers'].sum() / len(df)

0.668329165991831

66% of URLs have a number in them.

Ultimately, decided to choose an index file at random (vs. taking the first one)