In [1]:
import gc
import gzip
import time
import json
import shutil
import os
import sys
import tldextract
import collections
import pandas as pd
from tqdm import tqdm
import urllib.request

# had to add:
from multiprocessing import Pool

In [None]:
import random

In [36]:
storage_folder = '../data/raw/'
remote_file_prefix = 'https://commoncrawl.s3.amazonaws.com/'

In [10]:
index_file_name = 'cc-index.paths.gz'
file_name = storage_folder + index_file_name
file_unzipped = file_name.split('.gz')[0]

In [11]:
file_unzipped

'../data/raw/cc-index.paths'

In [12]:
os.path.isfile(file_unzipped)

True

In [13]:
if not os.path.isfile(file_unzipped):
    with gzip.open(file_name, 'rb') as f_in:
        with open(file_unzipped, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [14]:
def read_every_line(fname,
                    max_lines=-1):
    lines = []
    with open(fname, encoding='utf-8') as f:
        for i, l in enumerate(f):
            lines.append(l)
            if i>max_lines and max_lines>0:
                break
    return lines

In [39]:
idx_lines = read_every_line(file_unzipped, 1e8)
num_lines = len(idx_lines)
print('{} lines extracted'.format(num_lines))
idx_lines = [line.replace('\n','') for line in idx_lines]

302 lines extracted


In [29]:
cdx_lines[71]

'cc-index/collections/CC-MAIN-2019-47/indexes/cdx-00071.gz'

In [46]:
i = random.choice(range(num_lines))
idx_root = cdx_lines[i]
idx_file = idx.split('/')[-1]
# file_dict[os.path.join(storage_folder, cc_index_file)] = file_prefix + cc_index
local_file_name = os.path.join(storage_folder, idx_file)
url = remote_file_prefix + idx_root

In [47]:
idx_root

'cc-index/collections/CC-MAIN-2019-47/indexes/cdx-00047.gz'

In [45]:
idx_file

'cdx-00144.gz'

In [48]:
# file_dict
local_file_name, url

('../data/raw/cdx-00144.gz',
 'https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-47/indexes/cdx-00047.gz')

In [35]:
file_dict

OrderedDict([('../data/raw/cdx-00097.gz',
              'https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-47/indexes/cdx-00097.gz')])

In [50]:
def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
                    (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()

def save(url, filename):
    urllib.request.urlretrieve(url, filename, reporthook)

In [51]:
# print('PROCESSING INDEX FILE [{}]/[{}] ...'.format(i,len(file_dict)))
print('PROCESSING INDEX FILE ...')
print('Downloading index file {} ...'.format(local_file_name))
save(url, local_file_name)

PROCESSING INDEX FILE ...
Downloading index file ../data/raw/cdx-00144.gz ...
...100%, 776 MB, 635 KB/s, 1252 seconds passed

In [52]:
def process_index_file(file_name):
    print('Unzipping index file ... ')

    df_name = file_name.replace('.gz','.feather')
    file_unzipped = file_name.split('.gz')[0]

    if not os.path.isfile(file_unzipped):
        with gzip.open(file_name, 'rb') as f_in:
            with open(file_unzipped, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    lines = read_every_line(file_unzipped, 1e8)
    print('{} lines extracted'.format(len(lines)))

    print('Pre-processing index lines ... ')
    out = list_multiprocessing(lines, process_index_file_line, workers=4)
#     out = []

#     for line in lines:
#         out.append(process_index_file_line(line))

    # filter out blank lines
    out =  [_ for _ in out if _ != ()]

    print('Index pre-processed!')

    print('Processing index dataframe ... ')

    ts_list       = [_[0] for _ in out]
    url_list      = [_[1] for _ in out]
    tld           = [_[2] for _ in out]
    # length_list   = [_[3] for _ in out]
    # offset_list   = [_[4] for _ in out]
    # warc_list     = [_[5] for _ in out]
    # language_list = [_[6] for _ in out]

    cols = ['ts','url']#,'tld','length','offset','warc','language']
    df = pd.DataFrame(data={
        'ts':ts_list,
        'url':url_list,
#         'tld':tld,
#         'length':length_list,
#         'offset':offset_list,
#         'warc':warc_list,
#         'language':language_list
    }
                      ,columns=cols)

#     df['wet'] = df.warc.apply(lambda x: x.replace('/warc/','/wet/').replace('.warc.','.warc.wet.'))
#     df['wet'] = df['wet'].apply(lambda x: file_prefix + x)


#     os.remove(file_name)
#     os.remove(file_unzipped)
#     print('Files removed ... ')

    df = df.dropna().drop_duplicates().reset_index(drop=True)
    print('Index dataframe is ready!')

    print('Saving Dataframe ... ')    
    df.to_feather(df_name)
    print('Dataframe saved ... ')

In [53]:
def process_index_file_line(line):
    assert type(line)==str

    try:
        lst = line.replace('\n','').split()
        ts = lst[1]
        data = json.loads(line.replace('\n','').split(ts)[-1].strip())
    except:
        return ()

    if data['status'] != '200':
        return ()
    else:
#         try:
#             language = data['languages']
#         except:
#             language = 'none'

        try:
            _tldextract = tldextract.extract(data['url'])
            tup = (ts,
                   data['url'],
                   _tldextract.suffix,
                   # data['length'],
                   # data['offset'],
                   # data['filename'],
#                    language
                )
            return tup
        except:
            return ()

In [57]:
def list_multiprocessing(param_lst,
                         func,
                         **kwargs):

    workers = kwargs.pop('workers')

    with Pool(workers) as p:
        apply_lst = [([params], func, i, kwargs) for i,params in enumerate(param_lst)]
        result = list(tqdm(p.imap(_apply_lst, apply_lst), total=len(apply_lst)))

    # lists do not need such sorting, but this can be useful later
    result=sorted(result,key=lambda x:x[0])
    return [_[1] for _ in result]


def _apply_lst(args):
    params, func, num, kwargs = args
    return num, func(*params,**kwargs)

In [58]:
start_time = time.time()
# print('Starting at ', start_time)
print('Processing index file...')
process_index_file(local_file_name)
gc.collect()
duration = int((time.time() - start_time) / 60)
print('Elapsed time {} min'.format(duration))

Starting at  1575747070.7111099
Processing index file...
Unzipping index file ... 
12032727 lines extracted
Pre-processing index lines ... 


100%|██████████| 12032727/12032727 [10:52<00:00, 18438.18it/s]


Index pre-processed!
Processing index dataframe ... 
Index dataframe is ready!
Saving Dataframe ... 
Dataframe saved ... 
Elapsed time 106 min


In [40]:
df.shape

(8113086, 8)

In [42]:
import re

In [48]:
d = 'http://18.222.110.0/general-dentistry/'
re.search(r'\d', d) is not None

True

In [49]:
mini_df = df[:10]

In [50]:
mini_df

Unnamed: 0,ts,url,tld,length,offset,warc,language,wet
0,20191121011733,http://13.126.102.0:8081/,,3914,939069,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
1,20191119195841,http://18.222.110.0/bruxism/,,30118,1127143,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
2,20191119211958,http://18.222.110.0/children/,,28678,1353987,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
3,20191119195846,http://18.222.110.0/facial-aesthetics/,,29722,1025626,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
4,20191119202823,http://18.222.110.0/general-dentistry/,,32010,1108906,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
5,20191119211031,http://18.222.110.0/orthodontics/,,31933,1382548,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
6,20191119195855,http://18.222.110.0/smile-makeovers/,,29301,1805231,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...
7,20191121143531,http://39.106.117.0/2019/09/22/hello-world/,,20320,2568454,crawl-data/CC-MAIN-2019-47/segments/1573496670...,"zho,eng",https://commoncrawl.s3.amazonaws.com/crawl-dat...
8,20191121143530,http://39.106.117.0/robots.txt,,623,22456,crawl-data/CC-MAIN-2019-47/segments/1573496670...,none,https://commoncrawl.s3.amazonaws.com/crawl-dat...
9,20191111224255,http://45.163.117.0:5661/sipweb/trabalhador/lo...,,2933,2471191,crawl-data/CC-MAIN-2019-47/segments/1573496664...,por,https://commoncrawl.s3.amazonaws.com/crawl-dat...


In [52]:
import numpy as np

In [54]:
mini_df['url_has_numbers'] = np.where(mini_df['url'].str.contains(r'\d'), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [55]:
mini_df

Unnamed: 0,ts,url,tld,length,offset,warc,language,wet,url_has_numbers
0,20191121011733,http://13.126.102.0:8081/,,3914,939069,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
1,20191119195841,http://18.222.110.0/bruxism/,,30118,1127143,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
2,20191119211958,http://18.222.110.0/children/,,28678,1353987,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
3,20191119195846,http://18.222.110.0/facial-aesthetics/,,29722,1025626,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
4,20191119202823,http://18.222.110.0/general-dentistry/,,32010,1108906,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
5,20191119211031,http://18.222.110.0/orthodontics/,,31933,1382548,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
6,20191119195855,http://18.222.110.0/smile-makeovers/,,29301,1805231,crawl-data/CC-MAIN-2019-47/segments/1573496670...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
7,20191121143531,http://39.106.117.0/2019/09/22/hello-world/,,20320,2568454,crawl-data/CC-MAIN-2019-47/segments/1573496670...,"zho,eng",https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
8,20191121143530,http://39.106.117.0/robots.txt,,623,22456,crawl-data/CC-MAIN-2019-47/segments/1573496670...,none,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
9,20191111224255,http://45.163.117.0:5661/sipweb/trabalhador/lo...,,2933,2471191,crawl-data/CC-MAIN-2019-47/segments/1573496664...,por,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1


In [56]:
df['url_has_numbers'] = np.where(df['url'].str.contains(r'\d'), 1, 0)

In [57]:
df.sample(10)

Unnamed: 0,ts,url,tld,length,offset,warc,language,wet,url_has_numbers
1665204,20191117011531,https://cws.agency/robots.txt,agency,888,2422723,crawl-data/CC-MAIN-2019-47/segments/1573496668...,none,https://commoncrawl.s3.amazonaws.com/crawl-dat...,0
449162,20191119124630,https://www.superu.ad/artiach-galeta-sense-suc...,ad,47081,1003821030,crawl-data/CC-MAIN-2019-47/segments/1573496670...,cat,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
8044329,20191120021437,https://catalogo.biblio.unc.edu.ar/Search/Resu...,edu.ar,7375,343077730,crawl-data/CC-MAIN-2019-47/segments/1573496670...,"eng,spa",https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
2659709,20191122191105,https://xelta.do.am/index/europai_unio_tanacsa...,am,13002,1067358257,crawl-data/CC-MAIN-2019-47/segments/1573496671...,hun,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
745156,20191114122217,https://easyshopping.ae/4-pcs-hair-brush-black...,ae,15707,386762379,crawl-data/CC-MAIN-2019-47/segments/1573496668...,eng,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
4138891,20191112181007,https://www.bumeran.com.ar/empleos/jr-para-aud...,com.ar,22152,757494735,crawl-data/CC-MAIN-2019-47/segments/1573496665...,spa,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
4174606,20191115054759,https://www.cachitadeco.com.ar/productos/50-of...,com.ar,13664,713241060,crawl-data/CC-MAIN-2019-47/segments/1573496668...,spa,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
6251484,20191117111510,https://iluminacionda.mercadoshops.com.ar/sist...,com.ar,14074,483936286,crawl-data/CC-MAIN-2019-47/segments/1573496668...,spa,https://commoncrawl.s3.amazonaws.com/crawl-dat...,1
349682,20191115051119,https://omgv.academy/heroku/,academy,18853,558554563,crawl-data/CC-MAIN-2019-47/segments/1573496668...,deu,https://commoncrawl.s3.amazonaws.com/crawl-dat...,0
8105238,20191122235549,http://odo.biblio.unc.edu.ar/cgi-bin/koha/opac...,edu.ar,38786,124561181,crawl-data/CC-MAIN-2019-47/segments/1573496672...,"eng,spa",https://commoncrawl.s3.amazonaws.com/crawl-dat...,1


In [58]:
df['url_has_numbers'].sum()

5422212

In [59]:
df['url_has_numbers'].sum() / len(df)

0.668329165991831

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8113086 entries, 0 to 8113085
Data columns (total 9 columns):
ts                 object
url                object
tld                object
length             object
offset             object
warc               object
language           object
wet                object
url_has_numbers    int32
dtypes: int32(1), object(8)
memory usage: 526.1+ MB
