In [1]:
import json
from pathlib import Path
from urllib.parse import urlparse

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

import pandas as pd

In [3]:
pd.set_option('max_colwidth', 100) # 50 by default

### Get results files

In [5]:
VALIDATION_RESULTS_FILE = '/home/jupyter/data/url_validation_results.json'
valid_results = []

with open(VALIDATION_RESULTS_FILE) as f:
    for line in f:
        valid_results.append(json.loads(line))
    
print(valid_results[:10])

[{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_valid': True, 'domain': 'habr.com'}, {'url': 'https://vimeo.com/91371852', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://habr.com/ru/post/193844/', 'is_valid': True, 'domain': 'habr.com'}, {'url': 'https://vimeo.com/4085044', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://vimeo.com/33717000', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=wW9trOm303g', 'is_valid': True, 'domain': 'youtube.com'}]


In [10]:
PARSE_RESULTS_FILE = '/home/jupyter/data/url_parse_results.json'
parse_results = []

with open(PARSE_RESULTS_FILE) as f:
    for line in f:
        parse_results.append(json.loads(line))
    
print(parse_results[:10])

[{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_parsed': True, 'raw_views_count': '532', 'views_count': 532}, {'url': 'https://habr.com/ru/post/193844/', 'is_parsed': False, 'error_code': '404_not_found'}, {'url': 'https://habr.com/ru/post/496612/', 'is_parsed': True, 'raw_views_count': '89,7k', 'views_count': 89700}, {'url': 'https://habr.com/ru/post/481488/', 'is_parsed': True, 'raw_views_count': '47,4k', 'views_count': 47400}, {'url': 'https://habr.com/ru/post/491974/', 'is_parsed': True, 'raw_views_count': '5528k', 'views_count': 5528000}, {'url': 'https://habr.com/ru/company/toshibarus/blog/462185/', 'is_parsed': True, 'raw_views_count': '121k', 'views_count': 121000}, {'url': 'https://habr.com/ru/post/461037/', 'is_parsed': True, 'raw_views_count': '62k', 'views_count': 62000}, {'url': 'https://habr.com/ru/post/450807/', 'is_parsed': False, 'error_code': '404_not_found'}, {'url': 'https://habr.com/ru/company/tuturu/blog/474688/', 'is_parsed': True, 'raw_views_count

In [11]:
valid_df = pd.DataFrame(valid_results)
valid_df

Unnamed: 0,url,is_valid,domain,error_code
0,https://habr.com/ru/company/wrike/blog/506928/,True,habr.com,
1,https://vimeo.com/91371852,True,vimeo.com,
2,https://www.youtube.com/watch?v=vvdLLbhxwDA,True,youtube.com,
3,https://www.youtube.com/watch?v=l5aw6LHt9iI,True,youtube.com,
4,https://habr.com/ru/post/193844/,True,habr.com,
...,...,...,...,...
2415,https://rt.pornhub.com/view_video.php?viewkey=1266029882,True,pornhub.com,
2416,https://rutube.ru/video/30b4a32e310195d116f5dfc964c1a000/,True,rutube.ru,
2417,https://habr.com/ru/post/506464/,True,habr.com,
2418,https://pikabu.ru/story/pro_beshenuyu_ku_na_bmv_6219595,True,pikabu.ru,


In [12]:
parse_df = pd.DataFrame(parse_results)
parse_df

Unnamed: 0,url,is_parsed,raw_views_count,views_count,error_code
0,https://habr.com/ru/company/wrike/blog/506928/,True,532,532.0,
1,https://habr.com/ru/post/193844/,False,,,404_not_found
2,https://habr.com/ru/post/496612/,True,"89,7k",89700.0,
3,https://habr.com/ru/post/481488/,True,"47,4k",47400.0,
4,https://habr.com/ru/post/491974/,True,5528k,5528000.0,
...,...,...,...,...,...
2338,https://www.youtube.com/watch?v=1k5f5rgyzsE,True,320133,320133.0,
2339,https://www.youtube.com/watch?v=_yolSUYe6B0,True,49332,49332.0,
2340,https://www.youtube.com/watch?v=T7TcQLwCyWM,True,352750,352750.0,
2341,https://www.youtube.com/watch?v=KJEtl2dPgTM,True,261938,261938.0,


In [19]:
len(valid_df), valid_df['url'].nunique(), valid_df.query('is_valid == True')['url'].nunique()

(2420, 2262, 2187)

In [21]:
len(parse_df), parse_df['url'].nunique(), parse_df.query('is_parsed == True')['url'].nunique()

(2343, 2187, 1740)

In [23]:
2420 - 1740 - 280

400

### Prepare statistics

In [32]:
list(map(lambda x: tuple(x.items()), valid_results))

seen = set()
unique_valid_urls = []

for result in valid_results:
    

[(('url', 'https://habr.com/ru/company/wrike/blog/506928/'),
  ('is_valid', True),
  ('domain', 'habr.com')),
 (('url', 'https://vimeo.com/91371852'),
  ('is_valid', True),
  ('domain', 'vimeo.com')),
 (('url', 'https://www.youtube.com/watch?v=vvdLLbhxwDA'),
  ('is_valid', True),
  ('domain', 'youtube.com')),
 (('url', 'https://www.youtube.com/watch?v=l5aw6LHt9iI'),
  ('is_valid', True),
  ('domain', 'youtube.com')),
 (('url', 'https://habr.com/ru/post/193844/'),
  ('is_valid', True),
  ('domain', 'habr.com')),
 (('url', 'https://vimeo.com/4085044'),
  ('is_valid', True),
  ('domain', 'vimeo.com')),
 (('url', 'https://www.youtube.com/watch?v=U4kyT-wwEi0'),
  ('is_valid', True),
  ('domain', 'youtube.com')),
 (('url', 'https://www.youtube.com/watch?v=-UuXFqXp7P4'),
  ('is_valid', True),
  ('domain', 'youtube.com')),
 (('url', 'https://vimeo.com/33717000'),
  ('is_valid', True),
  ('domain', 'vimeo.com')),
 (('url', 'https://www.youtube.com/watch?v=wW9trOm303g'),
  ('is_valid', True),
  