In [122]:
import datetime
import os
import re
import json
from collections import Counter
from pathlib import Path


import pandas as pd
import numpy as np


In [12]:
data_path = Path("../data/mar_12_2015-apr_4_2019")

channels = os.listdir(data_path)
channels_counts = Counter({
    c: len(os.listdir(data_path / c)) for c in channels
    if not c.endswith(".json")
})
channels_counts.most_common(10)

[('_random_talks', 1448),
 ('_meetings', 1350),
 ('theory_and_practice', 1322),
 ('deep_learning', 1320),
 ('lang_python', 1305),
 ('interesting_links', 1289),
 ('_jobs', 1273),
 ('edu_courses', 1177),
 ('kaggle_crackers', 1155),
 ('nlp', 1155)]

# Jobs analytics


In [24]:
jobs_data_path = data_path / "_top_jobs"

dates = [datetime.datetime.strptime(d.split('.')[0], "%Y-%m-%d") 
         for d in os.listdir(jobs_data_path)]

In [25]:
# Top years
Counter((d.year for d in dates))


Counter({2019: 8, 2018: 11})

In [26]:
# Top months
Counter((d.month for d in dates)).most_common(12)



[(3, 4), (12, 3), (2, 3), (8, 3), (11, 2), (1, 1), (9, 1), (4, 1), (7, 1)]

In [31]:
# select vacancies from 2019

files_to_use = [f"{d.strftime('%Y-%m-%d')}.json" 
                    for d in dates if d.year == 2019]

In [32]:
len(files_to_use)

8

In [109]:
def extract_salary_from_top_job(data: dict, city=":spb:"):
    rlist = []
    for line in data["text"].split("\n"):
        if city in line:
            if ":fork" in line:
                rlist.append((line,
                              re.findall("\*[A-Za-zа-яА-Я ]+\* by", line),
                              [*map(int, re.findall(r"[0-9]+", line.split(":fork:")[1]))]))
            
    return rlist



In [34]:
with open(jobs_data_path / files_to_use[0]) as f:
    job_dict = json.load(f)

In [41]:
job_dict[0]["text"].split("\n")

[':cryptoparrot: *ODS дайджест вакансий:* 1 января — 16 января 2019 :moneyparrot:',
 '',
 '*iPavlov*, Москва :default-city: : *Python программист* by <@U8EPE4WG6> :fork: *125-165k*',
 '*Задачи:* Создавать решения для конечных пользователей, адаптировать существующие алгоритмы и разрабатывать новые для решения прикладных задач',
 '*Требования:*  :python: :neuralnet:, плюсом :tensorflow: :nlp: контрибьюция в open-source проекты',
 '<https://opendatascience.slack.com/archives/C04DA5FUF/p1546213578476000>',
 '',
 '*FaceChain*, Санкт-Петербург :spb: : *Data Scientist* by <@UD8RBM06S> :fork: *80-200k*',
 '*Задачи:* Детекция и распознование лиц, антиспуффинг',
 '*Требования:*  :python: :pytorch: OpenCV',
 '<https://opendatascience.slack.com/archives/C04DA5FUF/p1547043026689000>',
 '',
 '*Институт развития интернета*, Москва :default-city:  : *Data Scientist* by <@U86E2ECGZ> :fork: *100-120k*',
 '*Задачи:* Проводить различные исследования открытых данных',
 '*Требования:*  Опыт в анализе данны

In [135]:
stats = []
for job_file in files_to_use:
    with open(jobs_data_path / job_file) as f:
        job_dict = json.load(f)
    for d in job_dict:
        stats.extend(extract_salary_from_top_job(d, ":default-city:"))
        stats.extend(extract_salary_from_top_job(d, ":spb:"))


In [136]:
stats

[('*iPavlov*, Москва :default-city: : *Python программист* by <@U8EPE4WG6> :fork: *125-165k*',
  ['*Python программист* by'],
  [125, 165]),
 ('*Институт развития интернета*, Москва :default-city:  : *Data Scientist* by <@U86E2ECGZ> :fork: *100-120k*',
  ['*Data Scientist* by'],
  [100, 120]),
 ('*<http://ivi.ru|ivi.ru>*, Москва :default-city:  : *Deep Learning Engineer* by <@U3W85TQBE> :fork: *140-200k*',
  ['*Deep Learning Engineer* by'],
  [140, 200]),
 ('*C7 Техлаб*, Москва :default-city:  : *MIddle/Senior Machine Learning Engineer* by <@U3TF60E65> :fork: *150-250k/250-350k*',
  [],
  [150, 250, 250, 350]),
 ('*QIWI*, Москва :default-city:  : *Lead Data Scientist* by <@U3JUB9PB9> :fork: *150-210k*',
  ['*Lead Data Scientist* by'],
  [150, 210]),
 ('*Jetlex*, Москва :default-city:: *NLP Data Scientist* by <@UBUCAUEFJ> :fork: *200-260k*',
  ['*NLP Data Scientist* by'],
  [200, 260]),
 ('*Сбербанк*, Москва :default-city:: *Data Scientist* by <@U5GB71SCX> :fork: *120-200k*',
  ['*Data 

In [137]:
include_job_titles = {"Data Scientist", "ML", "Machine Learning", "Middle"}
exclude_job_titles = {"Senior", "Junior", "Lead", "Video", "калибровок"}

filtered_jobs = []

for job in stats:
    exclude = False
    for ejt in exclude_job_titles:
        exclude = ejt in job[0]
        if exclude:
            break
    if exclude:
        continue
    
    filtered_jobs.append(job)

In [138]:
filtered_jobs

[('*iPavlov*, Москва :default-city: : *Python программист* by <@U8EPE4WG6> :fork: *125-165k*',
  ['*Python программист* by'],
  [125, 165]),
 ('*Институт развития интернета*, Москва :default-city:  : *Data Scientist* by <@U86E2ECGZ> :fork: *100-120k*',
  ['*Data Scientist* by'],
  [100, 120]),
 ('*<http://ivi.ru|ivi.ru>*, Москва :default-city:  : *Deep Learning Engineer* by <@U3W85TQBE> :fork: *140-200k*',
  ['*Deep Learning Engineer* by'],
  [140, 200]),
 ('*Jetlex*, Москва :default-city:: *NLP Data Scientist* by <@UBUCAUEFJ> :fork: *200-260k*',
  ['*NLP Data Scientist* by'],
  [200, 260]),
 ('*Сбербанк*, Москва :default-city:: *Data Scientist* by <@U5GB71SCX> :fork: *120-200k*',
  ['*Data Scientist* by'],
  [120, 200]),
 ('*FaceChain*, Санкт-Петербург :spb: : *Data Scientist* by <@UD8RBM06S> :fork: *80-200k*',
  ['*Data Scientist* by'],
  [80, 200]),
 ('*Skyeng* , Москва :default-city: / удалёнка :palm_tree:: *Специалист по анализу видео* by <@U5G25HMS9> :fork: *150-250k*',
  ['*Спец

In [139]:
min_forks = [fj[2][0] for fj in filtered_jobs]
max_forks = [fj[2][1] for fj in filtered_jobs]

In [128]:
# spb only
np.mean(min_forks), np.mean(max_forks)

(121.66666666666667, 231.66666666666666)

In [134]:
# msk only
np.mean(min_forks), np.mean(max_forks)

(135.0, 198.84615384615384)

In [140]:
# msk and spb
np.mean(min_forks), np.mean(max_forks)

(130.78947368421052, 209.21052631578948)