In [3]:
from pathlib import Path

import gspread
from oauth2client.service_account import ServiceAccountCredentials

import pandas as pd

In [4]:
# Widen text columns so longer cell values (e.g. URLs) are truncated less in
# DataFrame output. The fully qualified option name is used because abbreviated
# keys are resolved by pattern matching and may become ambiguous.
pd.set_option('display.max_colwidth', 100)  # 50 by default

### Get access to Google Sheets

In [12]:
# Location of the service-account JSON key file (relative to the notebook).
KEY_PATH = "../keys/airflow_secret.json"

In [13]:
# OAuth scopes requested for the service account.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Load the service-account credentials from the key file and build an
# authorized gspread client from them.
creds = ServiceAccountCredentials.from_json_keyfile_name(KEY_PATH, scope)
client = gspread.authorize(creds)

In [14]:
# Spreadsheet used for the experiments in this notebook.
TEST_URL = (
    'https://docs.google.com/spreadsheets/d/'
    '1Qxnc9bp8-5M3lSxzVDD817To5OEB3JkMG0XpFznrtD8/edit#gid=0'
)
# URL = 'https://docs.google.com/spreadsheets/d/1GE-4bLOuBOhMCGF-vGs7j2BTSdsgRmvKvDQhBqMkNuk/edit#gid=0'

In [15]:
# Open the spreadsheet by URL and take its first worksheet.
spreadsheet = client.open_by_url(TEST_URL)
sheet = spreadsheet.sheet1
sheet

<Worksheet 'Sheet1' id:0>

In [16]:
# Read the whole worksheet as a list of dicts, one per row.
# head=2 tells gspread to take the dictionary keys from row 2 of the sheet.
sheet_data = sheet.get_all_records(head=2)
# Show only the first three records to keep the output short.
sheet_data[:3]

[{'ссылка': 'https://habr.com/ru/company/wrike/blog/506928/',
  'Зимка Борис & Смирнов Максим': '',
  'Лабазкин Дмитрий & Хачатрян Екатерина': 'Hello',
  'Максим Ястремский & Кузнецов Дмитрий': '',
  'Юлия Пак': ''},
 {'ссылка': 'https://vimeo.com/91371852',
  'Зимка Борис & Смирнов Максим': '',
  'Лабазкин Дмитрий & Хачатрян Екатерина': '',
  'Максим Ястремский & Кузнецов Дмитрий': '',
  'Юлия Пак': ''},
 {'ссылка': 'https://www.youtube.com/watch?v=vvdLLbhxwDA',
  'Зимка Борис & Смирнов Максим': '',
  'Лабазкин Дмитрий & Хачатрян Екатерина': '',
  'Максим Ястремский & Кузнецов Дмитрий': '',
  'Юлия Пак': ''}]

In [17]:
# Build a DataFrame from the sheet records and replace the original headers
# (a link column followed by four team-name columns) with short ASCII names.
# The renaming is positional, so it relies on the column order of the sheet.
column_names = ['url', 'team_1', 'team_2', 'team_3', 'team_4']
sheet_df = pd.DataFrame(sheet_data)
sheet_df.columns = column_names
print(sheet_df.shape[0])
sheet_df.head()

2420


Unnamed: 0,url,team_1,team_2,team_3,team_4
0,https://habr.com/ru/company/wrike/blog/506928/,,Hello,,
1,https://vimeo.com/91371852,,,,
2,https://www.youtube.com/watch?v=vvdLLbhxwDA,,,,
3,https://www.youtube.com/watch?v=l5aw6LHt9iI,,,,
4,https://habr.com/ru/post/193844/,,,,


In [18]:
# Write test: put the string 'Hello' into the single cell C3 of the worksheet.
# NOTE: this modifies the remote spreadsheet every time the cell is run.
sheet.update_acell('C3', 'Hello')

{'spreadsheetId': '1Qxnc9bp8-5M3lSxzVDD817To5OEB3JkMG0XpFznrtD8',
 'updatedRange': 'Sheet1!C3',
 'updatedRows': 1,
 'updatedColumns': 1,
 'updatedCells': 1}

In [19]:
# Write test: fill the range C10:C15 with six values, one inner list per row.
# NOTE: this modifies the remote spreadsheet every time the cell is run.
sheet.update('C10:C15', [[1], [2], [3], [4], [5], [5]])

{'spreadsheetId': '1Qxnc9bp8-5M3lSxzVDD817To5OEB3JkMG0XpFznrtD8',
 'updatedRange': 'Sheet1!C10:C15',
 'updatedRows': 6,
 'updatedColumns': 1,
 'updatedCells': 6}

### Explore URLs

In [20]:
from urllib.parse import urlparse

In [21]:
# Display the frame loaded from the sheet (pandas truncates the middle rows).
sheet_df

Unnamed: 0,url,team_1,team_2,team_3,team_4
0,https://habr.com/ru/company/wrike/blog/506928/,,Hello,,
1,https://vimeo.com/91371852,,,,
2,https://www.youtube.com/watch?v=vvdLLbhxwDA,,,,
3,https://www.youtube.com/watch?v=l5aw6LHt9iI,,,,
4,https://habr.com/ru/post/193844/,,,,
...,...,...,...,...,...
2415,https://rt.pornhub.com/view_video.php?viewkey=1266029882,,,,
2416,https://rutube.ru/video/30b4a32e310195d116f5dfc964c1a000/,,,,
2417,https://habr.com/ru/post/506464/,,,,
2418,https://pikabu.ru/story/pro_beshenuyu_ku_na_bmv_6219595,,,,


In [22]:
# Keep the URL column as a standalone Series for the filters below.
urls = sheet_df.loc[:, 'url']
urls

0                      https://habr.com/ru/company/wrike/blog/506928/
1                                          https://vimeo.com/91371852
2                         https://www.youtube.com/watch?v=vvdLLbhxwDA
3                         https://www.youtube.com/watch?v=l5aw6LHt9iI
4                                    https://habr.com/ru/post/193844/
                                    ...                              
2415         https://rt.pornhub.com/view_video.php?viewkey=1266029882
2416        https://rutube.ru/video/30b4a32e310195d116f5dfc964c1a000/
2417                                 https://habr.com/ru/post/506464/
2418          https://pikabu.ru/story/pro_beshenuyu_ku_na_bmv_6219595
2419    https://rt.pornhub.com/view_video.php?viewkey=ph5c5861e4ecae2
Name: url, Length: 2420, dtype: object

In [23]:
# Split every URL into its six urlparse() components, one column per component.
# Building the frame directly from the list of parse results avoids creating a
# pd.Series for every row (which is what .apply(pd.Series) does) and still
# produces the six named columns when there are no URLs at all.
url_parts = sheet_df['url'].map(urlparse).tolist()
parsed_urls_df = pd.DataFrame(
    url_parts,
    index=sheet_df.index,  # keep row alignment with sheet_df
    columns=['scheme', 'netloc', 'path', 'params', 'query', 'fragment'],
)
parsed_urls_df

Unnamed: 0,scheme,netloc,path,params,query,fragment
0,https,habr.com,/ru/company/wrike/blog/506928/,,,
1,https,vimeo.com,/91371852,,,
2,https,www.youtube.com,/watch,,v=vvdLLbhxwDA,
3,https,www.youtube.com,/watch,,v=l5aw6LHt9iI,
4,https,habr.com,/ru/post/193844/,,,
...,...,...,...,...,...,...
2415,https,rt.pornhub.com,/view_video.php,,viewkey=1266029882,
2416,https,rutube.ru,/video/30b4a32e310195d116f5dfc964c1a000/,,,
2417,https,habr.com,/ru/post/506464/,,,
2418,https,pikabu.ru,/story/pro_beshenuyu_ku_na_bmv_6219595,,,


In [24]:
# Frequency of each URL scheme; anything other than 'https' in the recorded
# output is a mistyped or empty value.
scheme_counts = parsed_urls_df.loc[:, 'scheme'].value_counts()
scheme_counts

https    2392
            4
hkkps       2
rttps       2
httls       2
hqqps       1
hggps       1
httas       1
mttps       1
hbbps       1
sttps       1
httpd       1
httpf       1
httrs       1
hwwps       1
httcs       1
huups       1
httfs       1
httpu       1
httph       1
yttps       1
httbs       1
hjjps       1
Name: scheme, dtype: int64

In [25]:
# Number of URLs per host (netloc), most frequent first.
site_counts = parsed_urls_df.loc[:, 'netloc'].value_counts()
# Show the 50 most frequent hosts.
site_counts.iloc[:50]

rt.pornhub.com     895
habr.com           431
rutube.ru          294
vimeo.com          280
www.youtube.com    270
pikabu.ru          182
                     4
rt.pvrnhub.cvm       2
mt.pomnhub.com       2
rt.pornhun.com       2
rftfbe.rf            2
rt.pornsub.com       1
rt.parnhub.cam       1
pnkabu.ru            1
rt.pornhub.con       1
rt.pornhub.hom       1
fikabu.ru            1
rk.pornhub.com       1
rltlbe.rl            1
habr.cwm             1
rt.porchub.com       1
likabu.ru            1
rt.pornhub.cop       1
lutube.lu            1
rt.pornhvb.com       1
rt.pornhuf.com       1
ft.pofnhub.com       1
rukube.ru            1
pivabu.ru            1
rt.pornhkb.com       1
rt.pornhub.cok       1
rg.pornhub.com       1
rt.pornhux.com       1
ru.pornhub.com       1
qt.poqnhub.com       1
rwtwbe.rw            1
rw.pornhub.com       1
rt.pornhub.oom       1
rb.pornhub.com       1
rt.pornhub.cor       1
rt.porkhub.com       1
rt.pornhub.aom       1
rt.pornmub.com       1
ut.pounhub.

In [26]:
# Treat hosts that occur more than 10 times as the correctly spelled domains;
# the rarely occurring hosts in the counts above are mistyped variants.
frequent_sites = site_counts.loc[site_counts.gt(10)]
correct_domains = frequent_sites.index.sort_values().tolist()
correct_domains

['habr.com',
 'pikabu.ru',
 'rt.pornhub.com',
 'rutube.ru',
 'vimeo.com',
 'www.youtube.com']

### Site Parsing

In [27]:
import requests
from bs4 import BeautifulSoup

**habr.com**

In [28]:
# All URLs containing the substring 'habr'. This is a plain substring match on
# the full URL, so it can also pick up rows whose host is misspelled elsewhere.
is_habr = urls.str.contains('habr')
urls.loc[is_habr]

0          https://habr.com/ru/company/wrike/blog/506928/
4                        https://habr.com/ru/post/193844/
11                       https://habr.com/ru/post/496612/
13                       https://habr.com/ru/post/481488/
18                       https://habr.com/ru/post/491974/
                              ...                        
2396                     https://habr.com/ru/post/506902/
2402                     https://habr.com/ru/post/507124/
2403       https://habr.com/ru/company/vrike/blog/506928/
2406    https://habr.com/ru/company/skilline/blog/506522/
2417                     https://habr.com/ru/post/506464/
Name: url, Length: 434, dtype: object

In [29]:
# Sample habr.com article used to prototype the view-count scraping.
url = "https://habr.com/ru/company/wrike/blog/506928/"

In [30]:
# Download the page. A timeout is set because requests waits indefinitely by
# default, and the status is checked so an error page is not parsed as if it
# were the article.
response = requests.get(url, timeout=10)
response.raise_for_status()
html_text = response.text
# Peek at the start of the document to confirm that HTML came back.
html_text[:15]

'<!DOCTYPE html>'

In [31]:
# Parse the downloaded HTML and read the view counter, which is held in a
# <span> with class "post-stats__views-count".
soup = BeautifulSoup(html_text, 'lxml')
views_tag = soup.find('span', class_="post-stats__views-count")
views_tag.text

'504'

**pikabu.ru**

In [32]:
# All URLs containing the substring 'pikabu' (plain substring match on the URL).
is_pikabu = urls.str.contains('pikabu')
urls.loc[is_pikabu]

807                                                    https://pikabu.ru/story/moderator_vs_zombies_2177316
814                 https://pikabu.ru/story/kak_ya_uznal_ob_izmene_ili_lyogkiy_sposob_brosit_shutit_7471791
816                                       https://pikabu.ru/story/budet_li_otvet_ot_kospleyshchikov_7366977
833     https://pikabu.ru/story/moskovskogo_gaishnika_zastavili_pisat_obyasnitelnuyu__on_ostanovil_zamgl...
836                                                         https://pikabu.ru/story/mashkina_radost_6977120
                                                       ...                                                 
2382                                                                  https://pikabu.ru/story/genyi_6306182
2401                  https://pikabu.ru/story/o_dvulichii_pikabu_na_primere_shashlyichnika_magomeda_5753834
2410    https://pikabu.ru/story/kak_obmanut_na_millionyi_rubley_popast_na_pervyiy_kanal_i_stat_uspeshnyi...
2412                        

In [33]:
# Sample pikabu.ru stories used to prototype the view-count scraping.
# url = 'https://pikabu.ru/story/moderator_vs_zombies_2177316' # without link
url = 'https://pikabu.ru/story/budet_li_otvet_ot_kospleyshchikov_7366977' # with link

# Send a browser-like User-Agent with the request.
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

# A timeout is set because requests waits indefinitely by default, and the
# status is checked so an error page is not parsed as if it were the story.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html_text = response.text
# Peek at the start of the document to confirm that HTML came back.
html_text[:15]

'<!doctype html>'

In [34]:
# Parse the static HTML and look for the view counter element.
# In the recorded run this cell produced no output, i.e. find() returned None:
# the counter is not in the downloaded HTML (presumably it is filled in by
# JavaScript - TODO confirm), which is why a real browser is tried next.
soup = BeautifulSoup(html_text, 'lxml')
views_tag = soup.find('div', class_="story__views-count")
views_tag

In [38]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Run Chrome without a visible window and with proxy handling disabled.
chrome_options = Options()
chrome_options.headless = True
chrome_options.add_argument('--no-proxy-server')
chrome_options.add_argument("--proxy-server='direct://'")
chrome_options.add_argument("--proxy-bypass-list=*")

# webdriver_manager downloads a matching chromedriver and returns its path.
browser = webdriver.Chrome(ChromeDriverManager().install(),options=chrome_options)

# Load the page in a real browser and read the view counter from the rendered
# DOM. The try/finally guarantees the Chrome process is shut down even if the
# page load or the element lookup raises.
try:
    browser.get(url)
    # find_element(By.CLASS_NAME, ...) replaces the deprecated
    # find_element_by_class_name() helper.
    print(browser.find_element(By.CLASS_NAME, 'story__views-count').text)
finally:
    browser.quit()