In [23]:
import json
from collections import Counter
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [24]:
# Widen text columns so long URLs are not truncated in displayed frames (pandas default is 50).
# The fully-qualified option name is used because shorthand keys are pattern-matched
# and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 100)

### Get results files

In [25]:
VALIDATION_RESULTS_FILE = '/home/jupyter/data/url_validation_results.json'

# Each line of the file is parsed as its own JSON document.
# Whitespace-only lines (e.g. a trailing newline at end of file) are skipped so they
# do not crash json.loads, and the encoding is pinned so the read does not depend on
# the platform's default locale.
with open(VALIDATION_RESULTS_FILE, encoding='utf-8') as f:
    valid_results = [json.loads(line) for line in f if line.strip()]

print(valid_results[:10])

[{'url': 'https://habr.com/ru/company/wrike/blog/506928/', 'is_valid': True, 'domain': 'habr.com'}, {'url': 'https://vimeo.com/91371852', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://habr.com/ru/post/193844/', 'is_valid': True, 'domain': 'habr.com'}, {'url': 'https://vimeo.com/4085044', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=U4kyT-wwEi0', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://www.youtube.com/watch?v=-UuXFqXp7P4', 'is_valid': True, 'domain': 'youtube.com'}, {'url': 'https://vimeo.com/33717000', 'is_valid': True, 'domain': 'vimeo.com'}, {'url': 'https://www.youtube.com/watch?v=wW9trOm303g', 'is_valid': True, 'domain': 'youtube.com'}]


In [26]:
PARSE_RESULTS_FILE = '/home/jupyter/data/url_parse_results.json'

# Each line of the file is parsed as its own JSON document.
# Whitespace-only lines (e.g. a trailing newline at end of file) are skipped so they
# do not crash json.loads, and the encoding is pinned so the read does not depend on
# the platform's default locale.
with open(PARSE_RESULTS_FILE, encoding='utf-8') as f:
    parse_results = [json.loads(line) for line in f if line.strip()]

print(parse_results[:10])

[{'url': 'https://habr.com/ru/post/506756/', 'is_parsed': True, 'raw_views_count': '1,9k', 'views_count': 1900}, {'url': 'https://habr.com/ru/post/500754/', 'is_parsed': True, 'raw_views_count': '163k', 'views_count': 163000}, {'url': 'https://habr.com/ru/post/489254/', 'is_parsed': True, 'raw_views_count': '112k', 'views_count': 112000}, {'url': 'https://habr.com/ru/post/417827/', 'is_parsed': True, 'raw_views_count': '7k', 'views_count': 7000}, {'url': 'https://habr.com/ru/post/507236/', 'is_parsed': True, 'raw_views_count': '2,4k', 'views_count': 2400}, {'url': 'https://habr.com/ru/post/194766/', 'is_parsed': False, 'error_code': '403_forbidden'}, {'url': 'https://habr.com/ru/company/JetBrains/blog/478866/', 'is_parsed': True, 'raw_views_count': '65,3k', 'views_count': 65300}, {'url': 'https://habr.com/ru/company/mailru/blog/490790/', 'is_parsed': True, 'raw_views_count': '50,7k', 'views_count': 50700}, {'url': 'https://habr.com/ru/company/npoechelon/blog/506940/', 'is_parsed': True

In [27]:
# Tabular view of the validation records: one row per record, one column per key.
valid_df = pd.DataFrame(data=valid_results)
valid_df

Unnamed: 0,url,is_valid,domain,error_code
0,https://habr.com/ru/company/wrike/blog/506928/,True,habr.com,
1,https://vimeo.com/91371852,True,vimeo.com,
2,https://www.youtube.com/watch?v=vvdLLbhxwDA,True,youtube.com,
3,https://www.youtube.com/watch?v=l5aw6LHt9iI,True,youtube.com,
4,https://habr.com/ru/post/193844/,True,habr.com,
...,...,...,...,...
2415,https://rt.pornhub.com/view_video.php?viewkey=1266029882,True,pornhub.com,
2416,https://rutube.ru/video/30b4a32e310195d116f5dfc964c1a000/,True,rutube.ru,
2417,https://habr.com/ru/post/506464/,True,habr.com,
2418,https://pikabu.ru/story/pro_beshenuyu_ku_na_bmv_6219595,True,pikabu.ru,


In [28]:
# Tabular view of the parse records: one row per record, one column per key.
parse_df = pd.DataFrame(data=parse_results)
parse_df

Unnamed: 0,url,is_parsed,raw_views_count,views_count,error_code
0,https://habr.com/ru/post/506756/,True,"1,9k",1900.0,
1,https://habr.com/ru/post/500754/,True,163k,163000.0,
2,https://habr.com/ru/post/489254/,True,112k,112000.0,
3,https://habr.com/ru/post/417827/,True,7k,7000.0,
4,https://habr.com/ru/post/507236/,True,"2,4k",2400.0,
...,...,...,...,...,...
2182,https://www.youtube.com/watch?v=MMV9h3S4jKg&t=818s,True,300913,300913.0,
2183,https://www.youtube.com/watch?v=FExx1Oum_OA,True,288825,288825.0,
2184,https://www.youtube.com/watch?v=koUkTQge_KQ,True,330406,330406.0,
2185,https://www.youtube.com/watch?v=4YKLBdgY7nQ,True,340498,340498.0,


In [29]:
# (total rows, distinct URLs, distinct URLs flagged as valid)
(
    len(valid_df),
    valid_df['url'].nunique(),
    valid_df.loc[valid_df['is_valid'] == True, 'url'].nunique(),
)

(2420, 2262, 2187)

In [30]:
# (total rows, distinct URLs, distinct URLs that were parsed successfully)
(
    len(parse_df),
    parse_df['url'].nunique(),
    parse_df.loc[parse_df['is_parsed'] == True, 'url'].nunique(),
)

(2187, 2187, 1740)

In [31]:
# Scratch arithmetic on hard-coded counts: 2420 matches len(valid_df) and 1740 matches
# the number of distinct successfully parsed URLs reported by the two cells above.
# NOTE(review): the origin of 280 is not visible in this notebook — confirm what it
# represents, and consider deriving all three numbers from the data instead of literals.
2420 - 1740 - 280

400

### Prepare statistics

In [32]:
from collections import defaultdict
from operator import itemgetter
from itertools import groupby

In [33]:
# Index parse results by URL once, instead of scanning the whole list for every
# validated row. setdefault keeps the FIRST record seen for a URL, which is the same
# record the previous first-match scan would have picked.
parse_result_by_url = {}
for parse_result in parse_results:
    parse_result_by_url.setdefault(parse_result['url'], parse_result)

# One entry per validation record (duplicates included), in input order:
#   row_num   - position of the record in valid_results
#   url       - the URL of that record
#   is_parsed - False for invalid URLs, otherwise the parser's own flag
#   value     - views count on success, otherwise the validation/parse error code
report_stats = []

for row_num, valid_result in enumerate(valid_results):
    url = valid_result['url']

    if not valid_result['is_valid']:
        # Invalid URLs never reached the parser; report the validation error.
        is_parsed = False
        value = valid_result['error_code']
    else:
        parse_result = parse_result_by_url.get(url)
        if parse_result is None:
            # Fail loudly with context rather than with a bare "list index out of range".
            raise LookupError(
                f'No parse result found for valid URL at row {row_num}: {url}'
            )

        is_parsed = parse_result['is_parsed']
        value = parse_result['views_count'] if is_parsed else parse_result['error_code']

    report_stats.append({
        'row_num': row_num,
        'url': url,
        'is_parsed': is_parsed,
        'value': value,
    })

print(len(report_stats))
report_stats[:5]

2420


[{'row_num': 0,
  'url': 'https://habr.com/ru/company/wrike/blog/506928/',
  'is_parsed': True,
  'value': 533},
 {'row_num': 1,
  'url': 'https://vimeo.com/91371852',
  'is_parsed': False,
  'value': '403_forbidden'},
 {'row_num': 2,
  'url': 'https://www.youtube.com/watch?v=vvdLLbhxwDA',
  'is_parsed': True,
  'value': 223526},
 {'row_num': 3,
  'url': 'https://www.youtube.com/watch?v=l5aw6LHt9iI',
  'is_parsed': True,
  'value': 349529},
 {'row_num': 4,
  'url': 'https://habr.com/ru/post/193844/',
  'is_parsed': False,
  'value': '404_not_found'}]

In [73]:
get_value = itemgetter('value')

# Split the report into successfully parsed rows and failed rows.
# The identity checks against True/False are intentional: only exact booleans match.
parsed_urls = [stat for stat in report_stats if stat['is_parsed'] is True]
error_urls = [stat for stat in report_stats if stat['is_parsed'] is False]

# Sorted list of error codes, one per failed row.
errors = [get_value(stat) for stat in error_urls]
errors.sort()

In [35]:
# Number of failed rows per error code, with keys in sorted order.
# Counter tallies correctly whatever the order of `errors`; the previous groupby-based
# count silently produced partial counts if `errors` had not been sorted beforehand.
errors_stats = dict(sorted(Counter(errors).items()))
errors_stats

{'403_forbidden': 282,
 '404_not_found': 59,
 'element_not_found': 129,
 'incorrect_url': 28,
 'wrong_domain': 49}

In [82]:
from csv import DictWriter

# Export every failed row as CSV: its position in the input, the URL and the error code.
# newline='' is required by the csv module so the writer controls line endings itself
# (otherwise some platforms emit an extra carriage return per row); the encoding is
# pinned so the file does not depend on the platform default.
with open('/home/jupyter/data/urls_with_errors.csv', 'w', newline='', encoding='utf-8') as f:
    fieldnames = ['row_num', 'url', 'value']
    # extrasaction='ignore' drops the 'is_parsed' key that each stat carries
    # but which is not part of the exported columns.
    writer = DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(error_urls)

In [83]:
# Shell preview of the first lines of the exported CSV, to eyeball the header and row layout.
!head /home/jupyter/data/urls_with_errors.csv

row_num,url,value
1,https://vimeo.com/91371852,403_forbidden
4,https://habr.com/ru/post/193844/,404_not_found
5,https://vimeo.com/4085044,403_forbidden
8,https://vimeo.com/33717000,403_forbidden
12,https://vimeo.com/288392170,403_forbidden
14,https://vimeo.com/200162057,403_forbidden
16,https://vimeo.com/259970807,403_forbidden
20,https://vimeo.com/54521591,403_forbidden
21,https://vimeo.com/288392170,403_forbidden


In [63]:
# One "<error code>: <count>" line per error type, in errors_stats order.
errors_stats_message = '\n'.join(
    f'{error}: {count}' for error, count in errors_stats.items()
)

# Human-readable summary: overall totals followed by the per-error breakdown.
report_message = f'''**Processing results**
Processed URLs count: {len(report_stats)}
Parsed URLs count: {len(parsed_urls)}

Errors
{errors_stats_message}
'''

print(report_message)

**Processing results**
Processed URLs count: 2420
Parsed URLs count: 1873

Errors
403_forbidden: 282
404_not_found: 59
element_not_found: 129
incorrect_url: 28
wrong_domain: 49



In [64]:
get_ordered_values = itemgetter('row_num', 'value')

# (row_num, error_code) pairs sorted by error code. The sort is stable, so pairs
# sharing a code keep the relative order they had in error_urls.
ordered_errors = sorted(
    (get_ordered_values(stat) for stat in error_urls),
    key=itemgetter(1),
)

# Map each error code to the list of input row numbers that failed with it.
detailed_errors_info = {
    error_code: [row_num for row_num, _code in rows]
    for error_code, rows in groupby(ordered_errors, key=itemgetter(1))
}
print(detailed_errors_info)

{'403_forbidden': [1, 5, 8, 12, 14, 16, 20, 21, 22, 33, 35, 37, 42, 43, 48, 53, 56, 63, 64, 66, 70, 71, 74, 76, 82, 83, 85, 92, 93, 96, 97, 101, 103, 104, 105, 106, 107, 109, 110, 112, 115, 116, 117, 118, 119, 123, 125, 126, 127, 129, 131, 132, 134, 136, 138, 139, 140, 142, 146, 147, 151, 153, 155, 160, 167, 168, 173, 175, 183, 189, 190, 191, 192, 194, 200, 203, 206, 207, 211, 214, 215, 216, 220, 222, 223, 228, 229, 232, 235, 240, 241, 244, 245, 249, 250, 254, 255, 256, 257, 260, 262, 264, 265, 268, 269, 271, 272, 275, 277, 278, 283, 285, 286, 288, 289, 295, 296, 297, 301, 306, 311, 325, 327, 329, 337, 342, 346, 347, 348, 349, 353, 356, 363, 366, 373, 374, 379, 380, 381, 386, 388, 389, 396, 400, 406, 407, 409, 410, 412, 413, 414, 417, 420, 422, 425, 432, 439, 442, 444, 445, 446, 447, 448, 451, 453, 455, 458, 459, 463, 466, 468, 476, 481, 482, 485, 486, 489, 492, 494, 495, 497, 500, 502, 503, 509, 510, 516, 523, 527, 529, 533, 537, 540, 542, 544, 545, 551, 552, 555, 557, 560, 561, 570, 

In [68]:
PROCESSING_STATS_FILE = '/home/jupyter/data/url_processing_stats.json'

# Read the stored processing report back in and print its two sections.
# The file is a single JSON document, so json.load can parse the handle directly;
# the encoding is pinned so the read does not depend on the platform default.
with open(PROCESSING_STATS_FILE, encoding='utf-8') as f:
    report = json.load(f)

print(report['summary'])
print(report['detailed'])

**Processing results**
Processed URLs count: 2420
Parsed URLs count: 1873
Errors
403_forbidden: 282
404_not_found: 59
element_not_found: 129
incorrect_url: 28
wrong_domain: 49
{'403_forbidden': [1, 5, 8, 12, 14, 16, 20, 21, 22, 33, 35, 37, 42, 43, 48, 53, 56, 63, 64, 66, 70, 71, 74, 76, 82, 83, 85, 92, 93, 96, 97, 101, 103, 104, 105, 106, 107, 109, 110, 112, 115, 116, 117, 118, 119, 123, 125, 126, 127, 129, 131, 132, 134, 136, 138, 139, 140, 142, 146, 147, 151, 153, 155, 160, 167, 168, 173, 175, 183, 189, 190, 191, 192, 194, 200, 203, 206, 207, 211, 214, 215, 216, 220, 222, 223, 228, 229, 232, 235, 240, 241, 244, 245, 249, 250, 254, 255, 256, 257, 260, 262, 264, 265, 268, 269, 271, 272, 275, 277, 278, 283, 285, 286, 288, 289, 295, 296, 297, 301, 306, 311, 325, 327, 329, 337, 342, 346, 347, 348, 349, 353, 356, 363, 366, 373, 374, 379, 380, 381, 386, 388, 389, 396, 400, 406, 407, 409, 410, 412, 413, 414, 417, 420, 422, 425, 432, 439, 442, 444, 445, 446, 447, 448, 451, 453, 455, 458, 459,