In [1]:
import os
import pandas as pd
import numpy as np
import ast

In [2]:
# Tweet-level fields kept from every record.
importantCols = ['date', 'content', 'id', 'likeCount']

# Fields extracted from each tweet's nested `user` object by getUserInfo.
columnsToRetrieve = [
    'username', 'id', 'description', 'followersCount', 'friendsCount',
    'statusesCount', 'favouritesCount', 'listedCount', 'mediaCount', 'location',
]

# getUserInfo returns the user's 'id' under the key 'user_id' so it does not
# collide with the tweet 'id'. The final column selection must therefore ask
# for 'user_id'; listing 'id' twice would select the tweet id twice and drop
# the user id from the exported CSVs.
totalCols = importantCols + ['user_id' if col == 'id' else col for col in columnsToRetrieve]

In [3]:
def getUserInfo(dictionaryString, columnsToRetrieve = columnsToRetrieve):

    '''
    Extract selected fields from one user record.

    Parameters
    ----------
    dictionaryString : dict or str
        The user record. A dict is used as-is; a string is parsed as a Python
        literal with ast.literal_eval; anything else is passed to dict().
    columnsToRetrieve : list of str
        Keys to read from the user record. The caller's list is not modified.

    Returns
    -------
    dict
        Maps each requested key to its value in the user record, except that
        the first 'id' entry is returned under the key 'user_id'.

    Raises
    ------
    Exception
        If the input cannot be turned into a dictionary (the underlying error
        is attached as __cause__).
    KeyError
        If a requested key is missing from the user record.
    '''

    if isinstance(dictionaryString, dict):
        # Already a mapping: no parsing needed.
        user_data = dictionaryString
    else:
        try:
            user_data = ast.literal_eval(dictionaryString)
        except Exception:
            # Not a literal string; fall back to dict() for other inputs that
            # can be converted (e.g. an iterable of key/value pairs).
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            try:
                user_data = dict(dictionaryString)
            except Exception as e:
                raise Exception('not possible to load the dictionary') from e

    values = [user_data[key] for key in columnsToRetrieve]

    # Work on a copy so the caller's (or the default) list is never mutated.
    output_keys = list(columnsToRetrieve)
    if 'id' in output_keys:
        output_keys[output_keys.index('id')] = 'user_id'

    return dict(zip(output_keys, values))

In [4]:
%%time

directory = 'Data/'
tweet_cols = ['date', 'content', 'id', 'lang', 'likeCount', 'user']

# Read each matching JSON-lines file into a list and concatenate once:
# calling pd.concat inside the loop re-copies every previously loaded row.
# sorted() fixes the file order, so the row kept by drop_duplicates below
# does not depend on the order os.listdir happens to return.
fetterman_parts = []
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.startswith('Fet'):
        fetterman_parts.append(pd.read_json(f, lines=True)[tweet_cols])

if not fetterman_parts:
    raise FileNotFoundError(f"no files starting with 'Fet' found in {directory!r}")

fetterman = pd.concat(fetterman_parts, axis=0)

# Keep one row per tweet id, English tweets only.
fetterman = fetterman.drop_duplicates(subset=['id'])
fetterman = fetterman[fetterman['lang'] == 'en']

# Drop tweets whose lower-cased content contains "oz". na=True treats missing
# content as a match, so those rows are dropped too (same as `== False`).
# NOTE(review): this is a substring match, so words such as "dozen" are also
# dropped - confirm this is intended.
mentions_oz = fetterman['content'].str.lower().str.contains('oz', na=True)
fetterman = fetterman[~mentions_oz].reset_index(drop=True)

# Expand the nested user record into columns. Building one DataFrame from the
# list of dicts avoids constructing a separate pd.Series for every row.
user_info = pd.DataFrame(
    [getUserInfo(user) for user in fetterman['user']],
    index=fetterman.index,
)
fetterman = pd.concat([fetterman, user_info], axis=1)

# Keep only the columns listed in totalCols, in that order.
fetterman = fetterman[totalCols]

fetterman.to_csv('fettermanRaw.csv', index = False)

Wall time: 1min 5s


In [5]:
%%time

directory = 'Data/'
tweet_cols = ['date', 'content', 'id', 'lang', 'likeCount', 'user']

# Read each matching JSON-lines file into a list and concatenate once:
# calling pd.concat inside the loop re-copies every previously loaded row.
# sorted() fixes the file order, so the row kept by drop_duplicates below
# does not depend on the order os.listdir happens to return.
oz_parts = []
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.startswith(('Mehmet', 'Dr')):
        oz_parts.append(pd.read_json(f, lines=True)[tweet_cols])

if not oz_parts:
    raise FileNotFoundError(f"no files starting with 'Mehmet' or 'Dr' found in {directory!r}")

oz = pd.concat(oz_parts, axis=0)

# Keep one row per tweet id, English tweets only.
oz = oz.drop_duplicates(subset=['id'])
oz = oz[oz['lang'] == 'en']

# Drop tweets whose lower-cased content contains "fetterman". na=True treats
# missing content as a match, so those rows are dropped too (same as `== False`).
mentions_fetterman = oz['content'].str.lower().str.contains('fetterman', na=True)
oz = oz[~mentions_fetterman].reset_index(drop=True)

# Expand the nested user record into columns. Building one DataFrame from the
# list of dicts avoids constructing a separate pd.Series for every row.
user_info = pd.DataFrame(
    [getUserInfo(user) for user in oz['user']],
    index=oz.index,
)
oz = pd.concat([oz, user_info], axis=1)

# Keep only the columns listed in totalCols, in that order.
oz = oz[totalCols]

oz.to_csv('OzRaw.csv', index = False)

Wall time: 4min 53s
