# Мерджер

In [1]:
from tqdm.notebook import tqdm
import glob
import json
import pandas as pd

In [2]:
def merge_jsons(sourcepath='../data/vk_19_04/*.json', savepath='../data/vk-api-saved-all-19-04.json'):
    """Merge per-page JSON dumps into one DataFrame and save it as JSON Lines.

    Every file matching ``sourcepath`` is loaded and normalised:

    * if the document has no ``'execute_errors'`` key, a placeholder list is
      inserted with one entry per API method and ``None`` for the error
      code and message;
    * the positional ``'response'`` list is turned into a dict with the keys
      ``user``, ``friends``, ``followers``, ``groups`` and ``wall`` (taken from
      positions 0-4). Boolean values in positions 1-4 are replaced by ``None``.

    Parameters
    ----------
    sourcepath : str
        Glob pattern selecting the input JSON files.
    savepath : str
        Destination file; written with one JSON record per line.

    Returns
    -------
    pandas.DataFrame
        One row per input file, in sorted file-path order.

    Raises
    ------
    KeyError
        If a document has no ``'response'`` key.
    IndexError
        If a ``'response'`` list has fewer than five entries.
    """
    # Order matches the positional layout of 'response' used below.
    method_names = ('users.get', 'friends.get', 'users.getFollowers', 'groups.get', 'wall.get')

    def fillFalse(x):
        # Presumably the API yields False for a sub-call that failed; any bool
        # is mapped to None so the field is either real data or missing.
        return None if isinstance(x, bool) else x

    # glob returns paths in arbitrary order; sort so the merged output is
    # reproducible across runs and machines.
    files = sorted(glob.glob(sourcepath))

    print('Number of crawled user pages: {}'.format(len(files)))

    raw = []

    for fpath in tqdm(files):
        # Read explicitly as UTF-8: the payload contains non-ASCII text and the
        # platform default encoding is not UTF-8 everywhere.
        with open(fpath, encoding='utf-8') as f:
            content = json.load(f)

        if 'execute_errors' not in content:
            content['execute_errors'] = [dict(method=name, error_code=None, error_msg=None)
                                         for name in method_names]

        response = content['response']
        content['response'] = dict(user=response[0],
                                   friends=fillFalse(response[1]),
                                   followers=fillFalse(response[2]),
                                   groups=fillFalse(response[3]),
                                   wall=fillFalse(response[4]))
        raw.append(content)

    df = pd.DataFrame(raw)
    df.to_json(savepath, force_ascii=False, orient='records', lines=True)
    return df

In [3]:
# Merge all crawled pages using merge_jsons' default source glob and output path.
data = merge_jsons()

Number of crawled user pages: 5374


HBox(children=(FloatProgress(value=0.0, max=5374.0), HTML(value='')))




__________________________

In [4]:
# Flatten the per-page error lists: explode to one entry per row, then build a
# frame from the resulting list of entries.
errors = pd.DataFrame(
    data['execute_errors'].explode().tolist()
)

In [9]:
# Count pages whose error list has no entry with error_code 29
# (NOTE(review): presumably the API's rate-limit code — confirm).
sum(
    ~data['execute_errors'].apply(
        lambda page_errors: any([entry['error_code'] == 29 for entry in page_errors])
    )
)

2599

In [6]:
# Count pages where every response field is present (not None) and non-empty.
sum(
    data['response'].apply(
        lambda fields: all([part is not None and len(part) > 0 for part in fields.values()])
    )
)

1903

In [7]:
# Display the normalised response column.
data['response']

0       {'user': [{'id': 71210327, 'first_name': 'Sani...
1       {'user': [{'id': 948623, 'first_name': 'Yana',...
2       {'user': [{'id': 1600615, 'first_name': 'Pavel...
3       {'user': [{'id': 224008785, 'first_name': 'Dmi...
4       {'user': [{'id': 93123, 'first_name': 'Maria',...
                              ...                        
5369    {'user': [{'id': 164668, 'first_name': 'Natali...
5370    {'user': [{'id': 809543, 'first_name': 'Yulia'...
5371    {'user': [{'id': 3826238, 'first_name': 'Alina...
5372    {'user': [{'id': 1138583, 'first_name': 'Stani...
5373    {'user': [{'id': 4532345, 'first_name': 'Elena...
Name: response, Length: 5374, dtype: object

In [8]:
# Inspect the full response dict of the first merged page.
data['response'].iloc[0]

{'user': [{'id': 71210327,
   'first_name': 'Saniok',
   'last_name': 'Protsenko',
   'is_closed': False,
   'can_access_closed': True,
   'sex': 2,
   'domain': 'he2inspire',
   'photo_50': 'https://sun9-12.userapi.com/c841439/v841439265/64249/GhEaMP0reKU.jpg?ava=1',
   'photo_200_orig': 'https://sun9-24.userapi.com/c841439/v841439265/64246/RP-PndPIczw.jpg?ava=1',
   'photo_id': '71210327_456239480',
   'has_photo': 1,
   'status': 'Твори рай',
   'last_seen': {'time': 1583598362, 'platform': 1},
   'verified': 0,
   'can_be_invited_group': False,
   'followers_count': 750,
   'common_count': 0}],
 'friends': None,
 'followers': {'count': 750,
  'items': [393823684,
   428597402,
   24976216,
   34414708,
   415771233,
   393944072,
   273059664,
   408032217,
   67687903,
   261212083,
   285077212,
   335377974,
   366746831,
   220990570,
   359520059,
   159288439,
   244593403,
   364454473,
   197190028,
   311209168,
   231451404,
   194621866,
   310152229,
   303055584,
   33