In [1]:
import json
from pathlib import Path
from urllib.parse import urlparse

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

import pandas as pd

In [2]:
from time import sleep
from requests.exceptions import HTTPError
import re
from multiprocessing import Pool

In [3]:
# Show up to 100 characters per cell so long URLs stay readable (pandas default is 50).
pd.set_option('display.max_colwidth', 100)

### Get validation results file

In [4]:
# Path of the URL-validation results; the loader reads it line by line, one JSON object per line.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
VALIDATION_RESULTS_FILE = '/home/jupyter/data/url_validation_results.json'

In [8]:
# The file holds one JSON object per line (JSON Lines), so it is decoded line by line.
# The encoding is explicit so the result does not depend on the platform default.
with open(VALIDATION_RESULTS_FILE, encoding='utf-8') as f:
    # Blank lines would make json.loads() fail, so they are skipped.
    valid_results = [json.loads(line) for line in f if line.strip()]

print(valid_results[:10])

[{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_valid': True, 'domain': 'habr.com'}, {'url': 'https://vimeo.com/91371852', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://habr.com/ru/post/193844/', 'is_valid': True, 'domain': 'habr.com'}, {'url': 'https://vimeo.com/4085044', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://vimeo.com/33717000', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=wW9trOm303g', 'is_valid': True, 'domain': 'youtube.com'}]


In [9]:
def filter_urls(valid_results, domain, is_valid=True):
    """Return the URLs of the validation records that match a domain and validity flag.

    Args:
        valid_results: Iterable of dicts with the keys ``url``, ``is_valid``
            and ``domain``.
        domain: Value the record's ``domain`` must equal.
        is_valid: Validity flag the record's ``is_valid`` must be (identity
            comparison, so pass a real ``True``/``False``).

    Returns:
        list: The ``url`` values of the matching records, in input order.
    """
    return [
        record['url']
        for record in valid_results
        if record['is_valid'] is is_valid and record['domain'] == domain
    ]

In [10]:
def clean_views_count(raw_views_count):
    """Convert a scraped view counter into an integer.

    Handles plain integers (``4273``), digit groups with separators
    (``'708 821'``, ``'22,327'``) and abbreviated values with a unit suffix
    (``'89,5k'``, ``'5528k'``, ``'1.2M'``).

    Without a unit suffix every non-digit character is treated as a grouping
    separator and dropped. With a unit suffix the last ``.`` or ``,`` is
    treated as the decimal separator.

    Args:
        raw_views_count: Counter as scraped (string or number).

    Returns:
        int: The number of views (fractions are truncated).

    Raises:
        ValueError: If the value contains no digits.
    """
    units_map = {'k': 1000, 'm': 1000000}
    text = str(raw_views_count).lower().strip()

    multiplier = 1
    if text and text[-1] in units_map:
        multiplier = units_map[text[-1]]
        text = text[:-1]

    digits = re.sub(r'\D', '', text)
    if not digits:
        raise ValueError(f'no digits found in views count: {raw_views_count!r}')

    if multiplier == 1:
        return int(digits)

    # Count only the digits after the decimal separator, so stray characters
    # such as the space in '89,5 k' do not shift the decimal point.
    sep_pos = max(text.rfind('.'), text.rfind(','))
    fraction_len = len(re.sub(r'\D', '', text[sep_pos + 1:])) if sep_pos >= 0 else 0

    # Integer arithmetic keeps the result exact.
    return int(digits) * multiplier // 10 ** fraction_len

### Parse sites

**habr.com**

In [101]:
# Keep the valid URLs whose domain is habr.com and preview the first ten.
habr_urls = filter_urls(valid_results, 'habr.com')
habr_urls[:10]

['https://habr.com/ru/company/wrike/blog/506928/',
 'https://habr.com/ru/post/193844/',
 'https://habr.com/ru/post/496612/',
 'https://habr.com/ru/post/481488/',
 'https://habr.com/ru/post/491974/',
 'https://habr.com/ru/company/toshibarus/blog/462185/',
 'https://habr.com/ru/post/461037/',
 'https://habr.com/ru/post/450807/',
 'https://habr.com/ru/company/tuturu/blog/474688/',
 'https://habr.com/ru/company/dcmiran/blog/504414/']

In [102]:
def get_habr_views_count(url, timeout=10):
    """Fetch a habr.com post page and extract its view counter.

    Args:
        url: Address of the post page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` and ``is_parsed``. On success it also
        holds ``raw_views_count`` (counter text from the page) and
        ``views_count`` (result of ``clean_views_count``); on failure it holds
        ``error_code`` instead. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    try:
        # Without a timeout a stalled connection would block the worker forever.
        req = requests.get(url, headers=headers, timeout=timeout)
        req.raise_for_status()
    except HTTPError as e:
        # Error code looks like '404_not_found'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results

    soup = BeautifulSoup(req.text, 'lxml')

    try:
        views_count = soup.find('span', class_="post-stats__views-count").text
    except AttributeError:
        # soup.find() returned None: the counter element is not on the page.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    parse_results['raw_views_count'] = views_count
    parse_results['views_count'] = clean_views_count(views_count)

    print(parse_results)

    return parse_results

In [103]:
%%time
# Sequential check on the first five URLs; `parse_results` ends up holding only the last result.
for url in habr_urls[:5]:
    parse_results = get_habr_views_count(url)

{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_parsed': True, 'raw_views_count': '528', 'views_count': 528}
{'url': 'https://habr.com/ru/post/193844/', 'is_parsed': False, 'error_code': '404_not_found'}
{'url': 'https://habr.com/ru/post/496612/', 'is_parsed': True, 'raw_views_count': '89,5k', 'views_count': 89500}
{'url': 'https://habr.com/ru/post/481488/', 'is_parsed': True, 'raw_views_count': '47,4k', 'views_count': 47400}
{'url': 'https://habr.com/ru/post/491974/', 'is_parsed': True, 'raw_views_count': '5528k', 'views_count': 5528000}
CPU times: user 910 ms, sys: 43 ms, total: 953 ms
Wall time: 6.4 s


In [104]:
%%time
# Same five URLs through map(), this time keeping every result in a list.
parse_results = list(map(get_habr_views_count, habr_urls[:5]))

{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_parsed': True, 'raw_views_count': '528', 'views_count': 528}
{'url': 'https://habr.com/ru/post/193844/', 'is_parsed': False, 'error_code': '404_not_found'}
{'url': 'https://habr.com/ru/post/496612/', 'is_parsed': True, 'raw_views_count': '89,5k', 'views_count': 89500}
{'url': 'https://habr.com/ru/post/481488/', 'is_parsed': True, 'raw_views_count': '47,4k', 'views_count': 47400}
{'url': 'https://habr.com/ru/post/491974/', 'is_parsed': True, 'raw_views_count': '5528k', 'views_count': 5528000}
CPU times: user 924 ms, sys: 13 ms, total: 937 ms
Wall time: 8.06 s


In [105]:
%%time
# Scrape the full URL list in parallel with a default-sized process pool.
with Pool() as P:
    parse_results = P.map(get_habr_views_count, habr_urls)

{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_parsed': True, 'raw_views_count': '528', 'views_count': 528}
{'url': 'https://habr.com/ru/post/459947/', 'is_parsed': False, 'error_code': '404_not_found'}
{'url': 'https://habr.com/ru/post/193844/', 'is_parsed': False, 'error_code': '404_not_found'}
{'url': 'https://habr.com/ru/post/496612/', 'is_parsed': True, 'raw_views_count': '89,5k', 'views_count': 89500}
{'url': 'https://habr.com/ru/company/habr/blog/503774/', 'is_parsed': True, 'raw_views_count': '14,1k', 'views_count': 14100}
{'url': 'https://habr.com/ru/post/772408/', 'is_parsed': False, 'error_code': '404_not_found'}{'url': 'https://habr.com/ru/post/494512/', 'is_parsed': True, 'raw_views_count': '86,5k', 'views_count': 86500}
{'url': 'https://habr.com/ru/post/480162/', 'is_parsed': True, 'raw_views_count': '47,8k', 'views_count': 47800}

{'url': 'https://habr.com/ru/company/skyeng/blog/487764/', 'is_parsed': True, 'raw_views_count': '126k', 'views_count': 126000}

In [132]:
# Number of result records currently held in parse_results.
len(parse_results)

430

430 URLs  
CPU times: user 718 ms, sys: 295 ms, total: 1.01 s  
Wall time: 1min 41s  

1 URL missed  
'https://habr.com/ru/company/skillfactory/blog/507058/'  

**pikabu.ru**

In [12]:
# Keep the valid URLs whose domain is pikabu.ru and preview the first ten.
pikabu_urls = filter_urls(valid_results, 'pikabu.ru')
pikabu_urls[:10]

['https://pikabu.ru/story/moderator_vs_zombies_2177316',
 'https://pikabu.ru/story/kak_ya_uznal_ob_izmene_ili_lyogkiy_sposob_brosit_shutit_7471791',
 'https://pikabu.ru/story/budet_li_otvet_ot_kospleyshchikov_7366977',
 'https://pikabu.ru/story/moskovskogo_gaishnika_zastavili_pisat_obyasnitelnuyu__on_ostanovil_zamglavyi_politsii_moskvyi_obezzhavshego_probku_po_vstrechke_7184069',
 'https://pikabu.ru/story/mashkina_radost_6977120',
 'https://pikabu.ru/story/spasibo_za_shaurmu_7519042',
 'https://pikabu.ru/story/layk_esli_tozhe_ne_lyubish_rabotat_rukami_7508494',
 'https://pikabu.ru/story/ne_smog_proyti_mimo_pervyikh_oduvanchikov_7398755',
 'https://pikabu.ru/story/zhitel_saratova_mikhail_volkov_vyirastil_sosnovyiy_les_na_meste_svalki_tam_uzhe_zhivut_zaytsyi_i_fazanyi_6879615',
 'https://pikabu.ru/story/khristiane_rossii_trebuyut_otmenit_statyu_148_uk_predpolagayushchuyu_ugolovnoe_presledovanie_sa_oskorblenie_chuvstv_veruyushchikh_7092228']

In [13]:
def get_pikabu_views_count(url, timeout=10):
    """Fetch the view counter of a pikabu.ru story from its stats endpoint.

    The story id is taken from the part of the URL after the last underscore
    and looked up at ``https://d.pikabu.ru/stat/story/<id>``.

    Args:
        url: Address of the story page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` and ``is_parsed``. On success it also
        holds ``raw_views_count`` (value of ``data.v`` in the JSON payload)
        and ``views_count`` (result of ``clean_views_count``); on failure it
        holds ``error_code`` instead. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    story_id = url.split('_')[-1]
    stat_url = f'https://d.pikabu.ru/stat/story/{story_id}'

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    try:
        # Without a timeout a stalled connection would block the worker forever.
        req = requests.get(stat_url, headers=headers, timeout=timeout)
        req.raise_for_status()
    except HTTPError as e:
        # Error code looks like '404_not_found'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results

    try:
        views_count = req.json()['data']['v']
    except (ValueError, KeyError, TypeError):
        # ValueError: the body is not JSON. KeyError/TypeError: the payload
        # does not have the expected data.v structure.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    parse_results['raw_views_count'] = views_count
    parse_results['views_count'] = clean_views_count(views_count)

    print(parse_results)

    return parse_results

In [14]:
%%time
# Sequential check on the first five URLs; `parse_results` ends up holding only the last result.
for url in pikabu_urls[:5]:
    parse_results = get_pikabu_views_count(url)

{'url': 'https://pikabu.ru/story/moderator_vs_zombies_2177316', 'is_parsed': True, 'raw_views_count': 4273, 'views_count': 4273}
{'url': 'https://pikabu.ru/story/kak_ya_uznal_ob_izmene_ili_lyogkiy_sposob_brosit_shutit_7471791', 'is_parsed': True, 'raw_views_count': 713535, 'views_count': 713535}
{'url': 'https://pikabu.ru/story/budet_li_otvet_ot_kospleyshchikov_7366977', 'is_parsed': True, 'raw_views_count': 617433, 'views_count': 617433}
{'url': 'https://pikabu.ru/story/moskovskogo_gaishnika_zastavili_pisat_obyasnitelnuyu__on_ostanovil_zamglavyi_politsii_moskvyi_obezzhavshego_probku_po_vstrechke_7184069', 'is_parsed': True, 'raw_views_count': 76290, 'views_count': 76290}
{'url': 'https://pikabu.ru/story/mashkina_radost_6977120', 'is_parsed': True, 'raw_views_count': 8103, 'views_count': 8103}
CPU times: user 81.9 ms, sys: 0 ns, total: 81.9 ms
Wall time: 6.5 s


In [13]:
%%time
# Same five URLs through map(), this time keeping every result in a list.
parse_results = list(map(get_pikabu_views_count, pikabu_urls[:5]))

{'url': 'https://pikabu.ru/story/moderator_vs_zombies_2177316', 'is_parsed': True, 'raw_views_count': 4248, 'views_count': 4248}
{'url': 'https://pikabu.ru/story/kak_ya_uznal_ob_izmene_ili_lyogkiy_sposob_brosit_shutit_7471791', 'is_parsed': True, 'raw_views_count': 713420, 'views_count': 713420}
{'url': 'https://pikabu.ru/story/budet_li_otvet_ot_kospleyshchikov_7366977', 'is_parsed': True, 'raw_views_count': 617406, 'views_count': 617406}
{'url': 'https://pikabu.ru/story/moskovskogo_gaishnika_zastavili_pisat_obyasnitelnuyu__on_ostanovil_zamglavyi_politsii_moskvyi_obezzhavshego_probku_po_vstrechke_7184069', 'is_parsed': True, 'raw_views_count': 76230, 'views_count': 76230}
{'url': 'https://pikabu.ru/story/mashkina_radost_6977120', 'is_parsed': True, 'raw_views_count': 8074, 'views_count': 8074}
CPU times: user 66.7 ms, sys: 11.1 ms, total: 77.8 ms
Wall time: 3.01 s


In [14]:
%%time
# Scrape the full URL list in parallel with a default-sized process pool.
with Pool() as P:
    parse_results = P.map(get_pikabu_views_count, pikabu_urls)

{'url': 'https://pikabu.ru/story/moderator_vs_zombies_2177316', 'is_parsed': True, 'raw_views_count': 4248, 'views_count': 4248}
{'url': 'https://pikabu.ru/story/poborol_skromnost_6418931', 'is_parsed': True, 'raw_views_count': 4413, 'views_count': 4413}
{'url': 'https://pikabu.ru/story/izbalovannost_pokupatelya_v_raznyikh_stranakh_6478602', 'is_parsed': True, 'raw_views_count': 2791, 'views_count': 2791}
{'url': 'https://pikabu.ru/story/vnezapnaya_fotosessiya_4928301', 'is_parsed': True, 'raw_views_count': 4413, 'views_count': 4413}{'url': 'https://pikabu.ru/story/zhivyot_takoy_paren_6860996', 'is_parsed': True, 'raw_views_count': 8224, 'views_count': 8224}{'url': 'https://pikabu.ru/story/mne_strashno_6635809', 'is_parsed': True, 'raw_views_count': 3089, 'views_count': 3089}


{'url': 'https://pikabu.ru/story/layk_esli_tozhe_ne_lyubish_rabotat_rukami_7508494', 'is_parsed': True, 'raw_views_count': 792303, 'views_count': 792303}
{'url': 'https://pikabu.ru/story/obeshchannyiy_post_ochen

**pornhub.com**

In [11]:
# Keep the valid URLs whose domain is pornhub.com and preview the first ten.
pornhub_urls = filter_urls(valid_results, 'pornhub.com')
pornhub_urls[:10]

['https://rt.pornhub.com/view_video.php?viewkey=ph5c66c76a1dc69',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed519f129f7d',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed77e4eb5c46',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5de9edb65e918',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed62c5f32084',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5e779536e48cf',
 'https://rt.pornhub.com/view_video.php?viewtey=ph5e9da3f6d3f7d',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5ce1f627770af',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5eb312e5b7187',
 'https://rt.pornhub.com/view_video.php?viewkey=ph5eb47970ac1ba']

In [12]:
def get_pornhub_views_count(url, timeout=10):
    """Fetch a pornhub.com video page and extract its view counter.

    Args:
        url: Address of the video page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` and ``is_parsed``. On success it also
        holds ``raw_views_count`` (counter text from the page) and
        ``views_count`` (result of ``clean_views_count``); on failure it holds
        ``error_code`` instead. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    try:
        # Without a timeout a stalled connection would block the worker forever.
        req = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results
    finally:
        # Pause after every request attempt (presumably to go easy on the site).
        sleep(3)

    try:
        req.raise_for_status()
    except HTTPError as e:
        # Error code looks like '404_not_found'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results

    soup = BeautifulSoup(req.text, 'lxml')

    try:
        views_count = soup.find('span', class_="count").text
    except AttributeError:
        # soup.find() returned None: the counter element is not on the page.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    parse_results['raw_views_count'] = views_count
    parse_results['views_count'] = clean_views_count(views_count)

    print(parse_results)

    return parse_results

In [13]:
%%time
# Sequential check on the first five URLs; `parse_results` ends up holding only the last result.
for url in pornhub_urls[:5]:
    parse_results = get_pornhub_views_count(url)

{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5c66c76a1dc69', 'is_parsed': True, 'raw_views_count': '708 821', 'views_count': 708821}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed519f129f7d', 'is_parsed': True, 'raw_views_count': '249 413', 'views_count': 249413}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed77e4eb5c46', 'is_parsed': True, 'raw_views_count': '1 881 614', 'views_count': 1881614}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5de9edb65e918', 'is_parsed': True, 'raw_views_count': '182 736', 'views_count': 182736}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed62c5f32084', 'is_parsed': True, 'raw_views_count': '43 633', 'views_count': 43633}
CPU times: user 1.27 s, sys: 39.7 ms, total: 1.31 s
Wall time: 21.7 s


In [54]:
%%time
# Same five URLs through map(), this time keeping every result in a list.
parse_results = list(map(get_pornhub_views_count, pornhub_urls[:5]))

{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5c66c76a1dc69', 'is_parsed': True, 'raw_views_count': '698 134', 'views_count': 698134}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed519f129f7d', 'is_parsed': True, 'raw_views_count': '245 474', 'views_count': 245474}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed77e4eb5c46', 'is_parsed': True, 'raw_views_count': '1 806 316', 'views_count': 1806316}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5de9edb65e918', 'is_parsed': True, 'raw_views_count': '181 704', 'views_count': 181704}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed62c5f32084', 'is_parsed': True, 'raw_views_count': '43 360', 'views_count': 43360}
CPU times: user 1.48 s, sys: 18.6 ms, total: 1.5 s
Wall time: 21.5 s


In [None]:
%%time
# Scrape the full URL list in parallel with a default-sized process pool.
with Pool() as P:
    parse_results = P.map(get_pornhub_views_count, pornhub_urls)

{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5e7cdcdd158d7', 'is_parsed': False, 'error_code': 'element_not_found'}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ecd9cb0f1e2c', 'is_parsed': True, 'raw_views_count': '55 220', 'views_count': 55220}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed215da0b76a', 'is_parsed': True, 'raw_views_count': '325 212', 'views_count': 325212}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5ed1cfc0420c0', 'is_parsed': True, 'raw_views_count': '40 668', 'views_count': 40668}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5da790a858901', 'is_parsed': True, 'raw_views_count': '408 219', 'views_count': 408219}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5c66c76a1dc69', 'is_parsed': True, 'raw_views_count': '698 134', 'views_count': 698134}
{'url': 'https://rt.pornhub.com/view_video.php?viewkey=ph5c673113cceb0', 'is_parsed': True, 'raw_views_count': '173 281', 'views_count': 173281}
{'url'

**rutube.ru**

In [22]:
# Keep the valid URLs whose domain is rutube.ru and preview the first ten.
rutube_urls = filter_urls(valid_results, 'rutube.ru')
rutube_urls[:10]

['https://rutube.ru/video/8ba9b27f20c3a361bb02e91b7e74df05/',
 'https://rutube.ru/video/fd0f357686152efe46d0c6a9eb9f8e8e/',
 'https://rutube.ru/video/56868c5e19b6959a12748de745764414/',
 'https://rutube.ru/video/4a625fc2a5af8295d0262cbcb2323177/',
 'https://rutube.ru/video/89466964caf92d941cebe8c3b230889c/',
 'https://rutube.ru/video/a4b850188e6b067eab43a2fe423c2d62/',
 'https://rutube.ru/video/989a1a55f0ed4d0c429e55cc0a1c6891/',
 'https://rutube.ru/video/0b26cbc38eaeb21f271643bf7108feb9/',
 'https://rutube.ru/video/c4984e8092731ae81870684726b5f7ae/',
 'https://rutube.ru/video/84a41a731d3090f67a3c7f91d32cf351/']

In [26]:
def get_rutube_views_count(url, timeout=10):
    """Fetch a rutube.ru video page and extract its view counter.

    Args:
        url: Address of the video page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` and ``is_parsed``. On success it also
        holds ``raw_views_count`` (counter text from the page) and
        ``views_count`` (result of ``clean_views_count``); on failure it holds
        ``error_code`` instead. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    try:
        # Without a timeout a stalled connection would block the worker forever.
        req = requests.get(url, headers=headers, timeout=timeout)
        req.raise_for_status()
    except HTTPError as e:
        # Error code looks like '404_not_found'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results

    soup = BeautifulSoup(req.text, 'lxml')

    try:
        views_count = soup.find('span', class_="video-info-card__view-count").text
    except AttributeError:
        # soup.find() returned None: the counter element is not on the page.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    parse_results['raw_views_count'] = views_count
    parse_results['views_count'] = clean_views_count(views_count)

    print(parse_results)

    return parse_results

In [24]:
%%time
# Sequential check on the first five URLs; `parse_results` ends up holding only the last result.
for url in rutube_urls[:5]:
    parse_results = get_rutube_views_count(url)

{'url': 'https://rutube.ru/video/8ba9b27f20c3a361bb02e91b7e74df05/', 'is_parsed': True, 'raw_views_count': '22,327', 'views_count': 22327}
{'url': 'https://rutube.ru/video/fd0f357686152efe46d0c6a9eb9f8e8e/', 'is_parsed': True, 'raw_views_count': '504', 'views_count': 504}
{'url': 'https://rutube.ru/video/56868c5e19b6959a12748de745764414/', 'is_parsed': True, 'raw_views_count': '24,066', 'views_count': 24066}
{'url': 'https://rutube.ru/video/4a625fc2a5af8295d0262cbcb2323177/', 'is_parsed': True, 'raw_views_count': '137', 'views_count': 137}
{'url': 'https://rutube.ru/video/89466964caf92d941cebe8c3b230889c/', 'is_parsed': True, 'raw_views_count': '68', 'views_count': 68}
CPU times: user 226 ms, sys: 9.2 ms, total: 236 ms
Wall time: 25.7 s


In [27]:
%%time
# Same five URLs through map(), this time keeping every result in a list.
parse_results = list(map(get_rutube_views_count, rutube_urls[:5]))

{'url': 'https://rutube.ru/video/8ba9b27f20c3a361bb02e91b7e74df05/', 'is_parsed': True, 'raw_views_count': '22,327', 'views_count': 22327}
{'url': 'https://rutube.ru/video/fd0f357686152efe46d0c6a9eb9f8e8e/', 'is_parsed': True, 'raw_views_count': '504', 'views_count': 504}
{'url': 'https://rutube.ru/video/56868c5e19b6959a12748de745764414/', 'is_parsed': True, 'raw_views_count': '24,066', 'views_count': 24066}
{'url': 'https://rutube.ru/video/4a625fc2a5af8295d0262cbcb2323177/', 'is_parsed': True, 'raw_views_count': '137', 'views_count': 137}
{'url': 'https://rutube.ru/video/89466964caf92d941cebe8c3b230889c/', 'is_parsed': True, 'raw_views_count': '68', 'views_count': 68}
CPU times: user 228 ms, sys: 2.84 ms, total: 231 ms
Wall time: 6.05 s


In [28]:
%%time
# Scrape the full URL list in parallel with a default-sized process pool.
with Pool() as P:
    parse_results = P.map(get_rutube_views_count, rutube_urls)

{'url': 'https://rutube.ru/video/8ba9b27f20c3a361bb02e91b7e74df05/', 'is_parsed': True, 'raw_views_count': '22,327', 'views_count': 22327}
{'url': 'https://rutube.ru/video/8b4ef6f3fe88fd37d7f68d0c5c49a0fe/', 'is_parsed': True, 'raw_views_count': '454', 'views_count': 454}
{'url': 'https://rutube.ru/video/a331246d4228ae9cba9161c9aaf119be/', 'is_parsed': True, 'raw_views_count': '906', 'views_count': 906}
{'url': 'https://rutube.ru/video/1128f7a0eb3a47b832fd0f641498b0b0/', 'is_parsed': True, 'raw_views_count': '414', 'views_count': 414}
{'url': 'https://rutube.ru/video/360b3ac638c7179122c0f09e37d349da/', 'is_parsed': True, 'raw_views_count': '21,933', 'views_count': 21933}
{'url': 'https://rutube.ru/video/54612a5d3a5abcaa382f8c693b261600/', 'is_parsed': True, 'raw_views_count': '200', 'views_count': 200}
{'url': 'https://rutube.ru/video/4ad7e599a118c119ca515d6ebafbc4d4/', 'is_parsed': True, 'raw_views_count': '60', 'views_count': 60}
{'url': 'https://rutube.ru/video/9c7957c02e6a22c7909c3

**vimeo.com**

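View counts are read from the video's stats endpoint, e.g. https://vimeo.com/91371852?action=load_stat_counts (requested with the `x-requested-with: XMLHttpRequest` header).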

In [8]:
# Keep the valid URLs whose domain is vimeo.com and preview the first ten.
vimeo_urls = filter_urls(valid_results, 'vimeo.com')
vimeo_urls[:10]

['https://vimeo.com/91371852',
 'https://vimeo.com/4085044',
 'https://vimeo.com/33717000',
 'https://vimeo.com/288392170',
 'https://vimeo.com/200162057',
 'https://vimeo.com/259970807',
 'https://vimeo.com/54521591',
 'https://vimeo.com/288392170',
 'https://vimeo.com/181117661',
 'https://vimeo.com/105155012']

In [46]:
def get_vimeo_views_count(url, timeout=10):
    """Fetch the play counter of a vimeo.com video from its stats endpoint.

    The counter is requested from ``<url>?action=load_stat_counts`` with the
    ``x-requested-with: XMLHttpRequest`` header and read from
    ``total_plays.raw`` in the JSON response.

    Args:
        url: Address of the video page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` (the original page URL) and
        ``is_parsed``. On success it also holds ``raw_views_count`` and
        ``views_count`` (result of ``clean_views_count``); on failure it holds
        ``error_code`` instead. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }

    stat_url = f'{url}?action=load_stat_counts'

    try:
        # Without a timeout a stalled connection would block the worker forever.
        req = requests.get(stat_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results
    finally:
        # Pause after every request attempt (presumably to go easy on the site).
        sleep(1)

    try:
        req.raise_for_status()
    except HTTPError as e:
        # Error code looks like '403_forbidden'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results

    try:
        views_count = req.json()['total_plays']['raw']
    except (ValueError, AttributeError, KeyError, IndexError, TypeError):
        # ValueError: the body is not JSON. The others: the payload does not
        # have the expected total_plays.raw structure.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    parse_results['raw_views_count'] = views_count
    parse_results['views_count'] = clean_views_count(views_count)

    print(parse_results)

    return parse_results

In [48]:
%%time
# Sequential check on the first five URLs; `parse_results` ends up holding only the last result.
for url in vimeo_urls[:5]:
    parse_results = get_vimeo_views_count(url)

{'url': 'https://vimeo.com/91371852', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/4085044', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/33717000', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/288392170', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/200162057', 'is_parsed': False, 'error_code': '403_forbidden'}
CPU times: user 64.5 ms, sys: 12.3 ms, total: 76.8 ms
Wall time: 5.38 s


In [20]:
%%time
# Same five URLs through map(), this time keeping every result in a list.
parse_results = list(map(get_vimeo_views_count, vimeo_urls[:5]))

{'url': 'https://vimeo.com/91371852', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/4085044', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/33717000', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/288392170', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/200162057', 'is_parsed': False, 'error_code': '403_forbidden'}
CPU times: user 66.6 ms, sys: 4.58 ms, total: 71.2 ms
Wall time: 384 ms


In [21]:
%%time
# Scrape the full URL list in parallel with a default-sized process pool.
with Pool() as P:
    parse_results = P.map(get_vimeo_views_count, vimeo_urls)

{'url': 'https://vimeo.com/25464003', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/59139723', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/91371852', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/151359255', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/39668547', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/105155012', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/25630712', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/100811895', 'is_parsed': False, 'error_code': '403_forbidden'}







{'url': 'https://vimeo.com/172343600', 'is_parsed': False, 'error_code': '403_forbidden'}{'url': 'https://vimeo.com/55760922', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vimeo.com/4085044', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://vi

**youtube.com**

In [10]:
# Keep the valid URLs whose domain is youtube.com and preview the first ten.
youtube_urls = filter_urls(valid_results, 'youtube.com')
youtube_urls[:10]

[]

In [9]:
def get_youtube_views_count(url, timeout=10):
    """Fetch a youtube.com watch page and extract its view counter.

    The counter is read from the first inline ``<script>`` that mentions
    ``viewCount``.

    Args:
        url: Address of the watch page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` and ``is_parsed``. On success it also
        holds ``raw_views_count`` (digits captured from the script) and
        ``views_count`` (result of ``clean_views_count``); on failure it holds
        ``error_code`` instead. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    try:
        # Without a timeout a stalled connection would block the worker forever.
        req = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results
    finally:
        # Pause after every request attempt (presumably to go easy on the site).
        sleep(1)

    try:
        req.raise_for_status()
    except HTTPError as e:
        # Error code looks like '404_not_found'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results

    soup = BeautifulSoup(req.text, 'lxml')

    try:
        # Inline scripts only: .string is None for tags without a single text child.
        count_scripts_str = [
            script.string
            for script in soup.find_all('script')
            if script.string is not None and 'viewCount' in script.string
        ]

        # Matches both "viewCount":"123" and the escaped \"viewCount\":\"123\".
        pattern = re.compile(r'viewCount[\\]{,1}":[\\]{,1}"(\d+)')

        views_count = re.findall(pattern, count_scripts_str[0])[0]
    except (AttributeError, IndexError):
        # No script mentions viewCount, or the pattern did not match.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    parse_results['raw_views_count'] = views_count
    parse_results['views_count'] = clean_views_count(views_count)

    print(parse_results)

    return parse_results

In [10]:
%%time
# Sequential check on the first five URLs; `parse_results` ends up holding only the last result.
for url in youtube_urls[:5]:
    parse_results = get_youtube_views_count(url)

{'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA', 'is_parsed': True, 'raw_views_count': '223440', 'views_count': 223440}
{'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI', 'is_parsed': True, 'raw_views_count': '349513', 'views_count': 349513}
{'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0', 'is_parsed': True, 'raw_views_count': '367990', 'views_count': 367990}
{'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4', 'is_parsed': True, 'raw_views_count': '54241', 'views_count': 54241}
{'url': 'https://www.youtube.com/watch?v=wW9trOm303g', 'is_parsed': True, 'raw_views_count': '64803', 'views_count': 64803}
CPU times: user 222 ms, sys: 27.2 ms, total: 249 ms
Wall time: 9.27 s


In [13]:
%%time
# Same five URLs through map(), this time keeping every result in a list.
parse_results = list(map(get_youtube_views_count, youtube_urls[:5]))

{'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA', 'is_parsed': True, 'raw_views_count': '223440', 'views_count': 223440}
{'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI', 'is_parsed': True, 'raw_views_count': '349459', 'views_count': 349459}
{'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0', 'is_parsed': True, 'raw_views_count': '367966', 'views_count': 367966}
{'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4', 'is_parsed': True, 'raw_views_count': '54241', 'views_count': 54241}
{'url': 'https://www.youtube.com/watch?v=wW9trOm303g', 'is_parsed': True, 'raw_views_count': '64803', 'views_count': 64803}
CPU times: user 240 ms, sys: 0 ns, total: 240 ms
Wall time: 4.3 s


**Common functionality**

In [16]:
def parse_habr_views_count(response):
    """Return the raw view-counter text of a habr.com post page.

    Args:
        response: HTTP response whose ``text`` is the page HTML.

    Returns:
        str: Text of the ``span.post-stats__views-count`` element.

    Raises:
        AttributeError: If the element is not present on the page.
    """
    page = BeautifulSoup(response.text, 'lxml')
    counter = page.find('span', class_="post-stats__views-count")
    return counter.text

In [32]:
def prepare_pikabu_request(url, headers):
    """Translate a pikabu.ru story URL into the request for its stats endpoint.

    Args:
        url: Story page URL; the story id is the part after the last underscore.
        headers: Request headers, passed through unchanged.

    Returns:
        tuple: ``(stats_url, headers)``.
    """
    pieces = url.split('_')
    story_id = pieces[-1]
    return f'https://d.pikabu.ru/stat/story/{story_id}', headers

In [17]:
def parse_pikabu_views_count(response):
    """Return the view counter from a pikabu.ru stats response.

    Args:
        response: HTTP response whose JSON body has the counter at ``data.v``.

    Returns:
        The value stored under ``data`` -> ``v``.

    Raises:
        KeyError: If the payload lacks ``data`` or ``v``.
    """
    return response.json()['data']['v']

In [33]:
def parse_pornhub_views_count(response):
    """Return the raw view-counter text of a pornhub.com video page.

    Args:
        response: HTTP response whose ``text`` is the page HTML.

    Returns:
        str: Text of the first ``span.count`` element.

    Raises:
        AttributeError: If the element is not present on the page.
    """
    page = BeautifulSoup(response.text, 'lxml')
    counter = page.find('span', class_="count")
    return counter.text

In [50]:
def parse_rutube_views_count(response):
    """Return the raw view-counter text of a rutube.ru video page.

    Args:
        response: HTTP response whose ``text`` is the page HTML.

    Returns:
        str: Text of the ``span.video-info-card__view-count`` element.

    Raises:
        AttributeError: If the element is not present on the page.
    """
    page = BeautifulSoup(response.text, 'lxml')
    counter = page.find('span', class_="video-info-card__view-count")
    return counter.text

In [51]:
def prepare_vimeo_request(url, headers):
    """Translate a vimeo.com video URL into the request for its stats endpoint.

    Args:
        url: Video page URL.
        headers: Base request headers. The mapping is not modified; ``None``
            is treated as no headers.

    Returns:
        tuple: ``(stats_url, headers)`` where ``stats_url`` is ``url`` with the
        ``action=load_stat_counts`` query parameter added and ``headers`` is a
        new dict that also carries ``x-requested-with: XMLHttpRequest``.
    """
    # Append with '&' when the URL already carries a query string.
    separator = '&' if '?' in url else '?'
    stat_url = f'{url}{separator}action=load_stat_counts'

    # Work on a copy so the caller's dict is not changed as a side effect.
    request_headers = dict(headers or {})
    request_headers['x-requested-with'] = 'XMLHttpRequest'
    return stat_url, request_headers

In [53]:
def parse_vimeo_views_count(response):
    """Return the play counter from a vimeo.com stats response.

    Args:
        response: HTTP response whose JSON body has the counter at
            ``total_plays.raw``.

    Returns:
        The value stored under ``total_plays`` -> ``raw``.

    Raises:
        KeyError: If the payload lacks ``total_plays`` or ``raw``.
    """
    return response.json()['total_plays']['raw']

In [91]:
def parse_youtube_views_count(response):
    """Return the view counter embedded in a youtube.com watch page.

    Args:
        response: HTTP response whose ``text`` is the page HTML.

    Returns:
        str: Digits captured after ``viewCount`` in the first inline script
        that mentions it.

    Raises:
        IndexError: If no script mentions ``viewCount`` or the pattern does
            not match in the first one that does.
    """
    page = BeautifulSoup(response.text, 'lxml')

    # Inline scripts only: .string is None for tags without a single text child.
    candidates = [
        tag.string
        for tag in page.find_all('script')
        if tag.string is not None and 'viewCount' in tag.string
    ]

    # Matches both "viewCount":"123" and the escaped \"viewCount\":\"123\".
    view_count_re = re.compile(r'viewCount[\\]{,1}":[\\]{,1}"(\d+)')
    matches = view_count_re.findall(candidates[0])
    return matches[0]

In [92]:
# Per-domain scraping configuration, keyed by the domain name that get_domain() returns.
#   'parse'           - callable(response) -> raw views count
#   'prepare_request' - optional callable(url, headers) -> (url, headers), for sites
#                       whose counter is fetched from a different endpoint
DOMAIN_PARSERS = {
    'habr.com': {
        'parse': parse_habr_views_count,
    },
    'pikabu.ru': {
        'parse': parse_pikabu_views_count,
        'prepare_request': prepare_pikabu_request
    },
    'pornhub.com': {
        'parse': parse_pornhub_views_count
    },
    'rutube.ru': {
        'parse': parse_rutube_views_count
    },
    'vimeo.com': {
        'parse': parse_vimeo_views_count,
        'prepare_request': prepare_vimeo_request
    },
    'youtube.com': {
        'parse': parse_youtube_views_count
    }
}

# Domains that have a parser, in declaration order.
DOMAINS = list(DOMAIN_PARSERS.keys())

In [93]:
def parse_views_count(response, domain):
    """Extract the raw view counter from a response with the parser of its domain.

    Args:
        response: HTTP response to parse.
        domain: Key into ``DOMAIN_PARSERS``.

    Returns:
        Whatever the domain's ``parse`` callable returns.

    Raises:
        KeyError: If ``domain`` has no entry in ``DOMAIN_PARSERS``.
    """
    parser = DOMAIN_PARSERS[domain]['parse']
    return parser(response)

In [94]:
def get_domain(url, levels=2):
    """Return the last ``levels`` labels of a URL's host name, lower-cased.

    For example ``https://www.youtube.com/watch?v=x`` gives ``youtube.com``.
    A port or ``user:password@`` prefix in the URL is ignored.

    Args:
        url: URL to inspect.
        levels: Number of trailing host-name labels to keep.

    Returns:
        str: The domain, or an empty string when the URL has no host part.
    """
    # .hostname (unlike .netloc) excludes port and credentials and is already lower-cased.
    hostname = urlparse(url).hostname or ''
    return '.'.join(hostname.split('.')[-levels:]).lower()

In [95]:
def get_views_count(url, sleep_time=1, timeout=10):
    """Fetch a page and extract its view counter with the parser of its domain.

    Args:
        url: Address of the page; its domain selects the entry in
            ``DOMAIN_PARSERS``.
        sleep_time: Seconds to pause after the request attempt.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        dict: Always contains ``url`` (the original URL) and ``is_parsed``.
        On success it also holds ``raw_views_count`` and ``views_count``; on
        failure it holds ``error_code``. The dict is also printed.
    """
    parse_results = {'url': url, 'is_parsed': False}

    domain = get_domain(url)
    domain_config = DOMAIN_PARSERS.get(domain)
    if domain_config is None:
        # Report unknown sites like any other failure instead of raising KeyError.
        parse_results['error_code'] = 'unsupported_domain'
        print(parse_results)
        return parse_results

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    # Some sites expose the counter at a different endpoint than the page itself.
    prepare_request = domain_config.get('prepare_request')
    if prepare_request is not None:
        url, headers = prepare_request(url, headers)

    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failed
        # URL instead of aborting the whole batch.
        parse_results['error_code'] = f'request_failed_{type(e).__name__}'
        print(parse_results)
        return parse_results
    finally:
        sleep(sleep_time)

    try:
        response.raise_for_status()
    except HTTPError as e:
        # Error code looks like '404_not_found'.
        status_code = str(e.response.status_code)
        reason = '_'.join(e.response.reason.lower().split())
        parse_results['error_code'] = '_'.join([status_code, reason])
        print(parse_results)
        return parse_results

    try:
        views_count = parse_views_count(response, domain)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # The element / JSON field is missing, or the body is not valid JSON.
        parse_results['error_code'] = 'element_not_found'
        print(parse_results)
        return parse_results

    parse_results['raw_views_count'] = views_count

    try:
        parse_results['views_count'] = clean_views_count(views_count)
    except (IndexError, ValueError):
        # The counter was found but could not be converted to a number.
        parse_results['error_code'] = 'invalid_views_count'
        print(parse_results)
        return parse_results

    parse_results['is_parsed'] = True
    print(parse_results)

    return parse_results

In [83]:
# Domains to scrape. The names must match both the `domain` field of the
# validation records and the keys of DOMAIN_PARSERS, i.e. the domain without a
# host prefix such as `www.` or `rt.`.
DOMAINS = [
    'habr.com',
    'pikabu.ru',
    'pornhub.com',
    'rutube.ru',
    'vimeo.com',
    'youtube.com'
]

In [96]:
def get_domain_counts(valid_results, domain):
    """Scrape the view counts for every valid URL of one domain.

    Args:
        valid_results: Validation records, as accepted by ``filter_urls``.
        domain: Domain whose URLs should be processed.

    Returns:
        list: One ``get_views_count`` result dict per URL, in input order.
    """
    return [get_views_count(link) for link in filter_urls(valid_results, domain)]

In [97]:
# One task per domain; each task filters the sample for its domain and scrapes it.
# Only the first ten validation records are used here.
args = [(valid_results[:10], domain) for domain in DOMAINS]
# Despite the name, this is the number of domains: one worker process per domain.
cpu_count = len(DOMAINS)

with Pool(cpu_count) as p:
    domain_results = p.starmap(get_domain_counts, args)
    
#domain_results

{'url': 'https://vimeo.com/91371852', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_parsed': True, 'raw_views_count': '532', 'views_count': 532}
{'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA', 'is_parsed': True, 'raw_views_count': '223440', 'views_count': 223440}
{'url': 'https://vimeo.com/4085044', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://habr.com/ru/post/193844/', 'is_parsed': False, 'error_code': '404_not_found'}
{'url': 'https://vimeo.com/33717000', 'is_parsed': False, 'error_code': '403_forbidden'}
{'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI', 'is_parsed': True, 'raw_views_count': '349529', 'views_count': 349529}
{'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0', 'is_parsed': True, 'raw_views_count': '367990', 'views_count': 367990}
{'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4', 'is_parsed': True, 'raw_views_count': '54241', 'views_count': 54241}
{'url': 'https

In [98]:
from itertools import chain

# Flatten the per-domain result lists into one list of result dicts.
flattened = chain.from_iterable(domain_results)
parse_results = list(flattened)
parse_results

[{'url': 'https://habr.com/ru/company/wrike/blog/506928/',
  'is_parsed': True,
  'raw_views_count': '532',
  'views_count': 532},
 {'url': 'https://habr.com/ru/post/193844/',
  'is_parsed': False,
  'error_code': '404_not_found'},
 {'url': 'https://vimeo.com/91371852',
  'is_parsed': False,
  'error_code': '403_forbidden'},
 {'url': 'https://vimeo.com/4085044',
  'is_parsed': False,
  'error_code': '403_forbidden'},
 {'url': 'https://vimeo.com/33717000',
  'is_parsed': False,
  'error_code': '403_forbidden'},
 {'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA',
  'is_parsed': True,
  'raw_views_count': '223440',
  'views_count': 223440},
 {'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI',
  'is_parsed': True,
  'raw_views_count': '349529',
  'views_count': 349529},
 {'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0',
  'is_parsed': True,
  'raw_views_count': '367990',
  'views_count': 367990},
 {'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4',
  'is_parsed': True,
  'ra