# Мерджер

In [2]:
from tqdm.notebook import tqdm
import glob
import json
import pandas as pd

In [3]:
def merge_jsons(sourcepath='../data/vk_*_0*/*.json', savepath='../data/vk-api-saved-all-30-04.json'):
    """Load every crawled JSON dump matching ``sourcepath`` into one DataFrame.

    Each file must contain a ``response`` list whose first five entries are,
    in order: user, friends, followers, groups, wall. That list is replaced by
    a dict with those five keys. Files without an ``execute_errors`` key get a
    placeholder list with one entry per API method and ``None`` error fields,
    so the column has a value in every row.

    Parameters
    ----------
    sourcepath : str
        Glob pattern selecting the per-user JSON files.
    savepath : str
        Currently unused: the export call at the end of this function is
        disabled. Kept so existing calls that pass it stay valid.

    Returns
    -------
    pandas.DataFrame
        One row per file, with the file's top-level keys as columns
        (including ``response`` and ``execute_errors``).

    Raises
    ------
    IndexError
        If a file's ``response`` has fewer than five entries.
    """
    # glob yields paths in arbitrary order; sort so the row order is reproducible.
    files = sorted(glob.glob(sourcepath))

    def fill_false(section):
        # A bare boolean in place of a section becomes None
        # (presumably the API's `false` for a failed sub-call; confirm).
        return None if isinstance(section, bool) else section

    print('Number of crawled user pages: {}'.format(len(files)))

    methods = ('users.get', 'friends.get', 'users.getFollowers', 'groups.get', 'wall.get')
    raw = []

    for fpath in tqdm(files):
        # Decode explicitly as UTF-8: the dumps contain non-ASCII text, and the
        # locale default encoding is not UTF-8 on every platform.
        with open(fpath, encoding='utf-8') as f:
            content = json.load(f)

        if 'execute_errors' not in content:
            content['execute_errors'] = [dict(method=method, error_code=None, error_msg=None)
                                         for method in methods]

        response = content['response']
        content['response'] = dict(user=response[0],
                                   friends=fill_false(response[1]),
                                   followers=fill_false(response[2]),
                                   groups=fill_false(response[3]),
                                   wall=fill_false(response[4]))
        raw.append(content)

    df = pd.DataFrame(raw)
#     df.to_json(savepath, force_ascii=False, orient='records', lines=True)
    return df

In [4]:
# Merge all crawled files (default glob pattern) into a single DataFrame, one row per file.
data = merge_jsons()

Number of crawled user pages: 72012


HBox(children=(FloatProgress(value=0.0, max=72012.0), HTML(value='')))




__________________________

In [None]:
# from pyspark.sql import SparkSession
# from pyspark.sql import *
# from pyspark.sql.functions import *

# spark = (SparkSession.builder
#          .config('spark.executor.memory', '4g')
#          .config('spark.driver.memory','4g')
#          .appName('local_spark')
#          .getOrCreate()         
#         )
# sc = spark.sparkContext

In [5]:
# Flatten the per-row error lists: one DataFrame row per individual error entry.
errors = pd.DataFrame(
    data['execute_errors']
    .explode()
    .tolist()
)

In [6]:
# Count rows whose error list contains no entry with error_code 29
# (presumably the API's rate-limit code; confirm).
sum(
    ~data['execute_errors'].apply(
        lambda errs: any([err['error_code'] == 29 for err in errs])
    )
)

7538

In [7]:
# Count rows in which every response section is present (not None) and non-empty.
sum(
    data['response'].apply(
        lambda resp: all([part is not None and len(part) > 0 for part in resp.values()])
    )
)

5338

In [8]:
# Peek at the reshaped `response` column (one dict per crawled file).
# NOTE(review): the rendered output shows real user names; consider clearing it before sharing.
data.response

0        {'user': [{'id': 4613957, 'first_name': 'Sofya...
1        {'user': [{'id': 217247517, 'first_name': 'Dar...
2        {'user': [{'id': 411430, 'first_name': 'Antoni...
3        {'user': [{'id': 51190869, 'first_name': 'Anya...
4        {'user': [{'id': 4525590, 'first_name': 'Anast...
                               ...                        
72007    {'user': [{'id': 3826238, 'first_name': 'Alina...
72008    {'user': [{'id': 1138583, 'first_name': 'Stani...
72009    {'user': [{'id': 52541497, 'first_name': 'Lena...
72010    {'user': [{'id': 323660, 'first_name': 'Anya',...
72011    {'user': [{'id': 4532345, 'first_name': 'Elena...
Name: response, Length: 72012, dtype: object

In [23]:
def friends_error(x):
    """Pick the ``friends.get`` entries out of an error list.

    ``x`` is either ``None`` or an iterable of dicts that each carry a
    ``'method'`` key. ``None`` yields an empty list; otherwise the matching
    entries are returned in their original order.
    """
    if x is None:
        return []
    return list(filter(lambda err: err['method'] == 'friends.get', x))

In [29]:
# Count rows to keep: the friends section is present, or the first friends.get
# error (if any) is not error_code 29.
sum(
    data['response'].apply(lambda resp: resp['friends'] is not None)
    | data['execute_errors']
    .apply(friends_error)
    .apply(lambda errs: len(errs) == 0 or errs[0]['error_code'] != 29)
)

7538

In [31]:
# Row mask: keep a row when its friends section is present, or when the first
# friends.get error (if any) is not error_code 29.
valid = (
    data['response'].apply(lambda resp: resp['friends'] is not None)
    | data['execute_errors']
    .apply(friends_error)
    .apply(lambda errs: len(errs) == 0 or errs[0]['error_code'] != 29)
)
savepath = '../data/vk-api-saved-all-30-04.json'
# Write the kept rows as JSON Lines (one record per line), leaving non-ASCII text unescaped.
data.loc[valid].to_json(savepath, orient='records', lines=True, force_ascii=False)