In [1]:
#Imports
import pandas as pd
import bz2
import json
import math
import datetime

In [2]:
# Locations of the input datasets, all resolved relative to DATA_FOLDER
DATA_FOLDER = '../data/'
SPEAKER_ATTRIBUTES_DATA = f"{DATA_FOLDER}speaker_attributes.parquet"
WIKIDATA_LABELS = f"{DATA_FOLDER}wikidata_labels_descriptions_quotebank.csv.bz2"

In [3]:
# Load the bz2-compressed Wikidata labels table, indexed by its QID column
wiki_labels = pd.read_csv(
    WIKIDATA_LABELS,
    index_col='QID',
    compression='bz2',
)

In [5]:
# Load the speaker attributes table from its parquet file
attributes = pd.read_parquet(path=SPEAKER_ATTRIBUTES_DATA)

In [6]:
# Keep only the attribute columns of interest, exposing the speaker id as
# 'qids' (the column name merge_chunk_attributes joins on).
# Renaming first makes the cell safe to re-run: once 'id' is already 'qids'
# the rename is a no-op. Strict [...] selection raises a KeyError naming any
# missing column, whereas DataFrame.get would silently return None.
attributes = attributes.rename(columns={'id': 'qids'})[
    [
        'gender',
        'date_of_birth',
        'occupation',
        'qids',
        'ethnic_group',
        'academic_degree',
        'religion',
    ]
]

## TODO: 
Documenter ces fonctions et les mettre dans un fichier .py séparé.

In [7]:
def merge_chunk_attributes(chunk):
    """Attach the speaker attributes to a chunk of quotes.

    Each row's ``qids`` list is reduced to its first element and the chunk is
    then inner-joined on ``qids`` with the module-level ``attributes`` table.

    Args:
        chunk: DataFrame with a ``qids`` column holding sequences of ids.
            It is left unmodified.

    Returns:
        A new DataFrame with the chunk's columns (``qids`` now a single id)
        followed by the columns of ``attributes``. Rows whose first qid is
        not present in ``attributes``, or whose ``qids`` sequence is empty,
        are dropped.
    """
    # Keep only the first qid; an empty sequence becomes None instead of
    # raising IndexError (None matches nothing unless `attributes` itself
    # has a missing qid).
    first_qid = chunk['qids'].apply(lambda ids: ids[0] if len(ids) > 0 else None)
    # assign() builds a new frame so the caller's chunk keeps its qid lists.
    # The join key is spelled out so that any other column name shared by
    # both frames cannot silently become part of the key.
    return chunk.assign(qids=first_qid).merge(attributes, on='qids', how='inner')
        
def filter_gender(chunk):
    """Reduce ``gender`` to its first value and drop rows without one.

    Args:
        chunk: DataFrame with a ``gender`` column holding sequences of
            values (or ``None`` when unknown). It is modified in place.

    Returns:
        The same ``chunk`` object, with ``gender`` holding a single value per
        row. Rows whose gender is missing, is an empty sequence, or whose
        first value is ``None`` are removed.
    """
    def _first_gender(values):
        # Missing cells are None/NaN, i.e. scalars without a length
        if values is None or not hasattr(values, '__len__'):
            return None
        # Already a single value (e.g. the function was applied before)
        if isinstance(values, str):
            return values
        return values[0] if len(values) > 0 else None

    # Extract first, then drop: this also removes rows whose first entry is
    # itself None, and avoids an IndexError on empty sequences.
    chunk['gender'] = chunk['gender'].apply(_first_gender)
    chunk.dropna(axis=0, subset=['gender'], inplace=True)
    return chunk
    
def match_code_label(code):
    """Return the label stored for a code in the ``wiki_labels`` table.

    Args:
        code: Value to look up in ``wiki_labels.index``, or ``None``.

    Returns:
        The ``Label`` entry of ``wiki_labels`` for ``code``, or ``None`` when
        ``code`` is ``None`` or is not present in the index.
    """
    # Identity test rather than `== None`, which depends on the operand's __eq__
    if code is None or code not in wiki_labels.index:
        return None
    # One (row, column) lookup instead of building the full row and indexing it
    return wiki_labels.loc[code, 'Label']

def render_info_readable(chunk):
    """Replace the codes of the attribute columns by their labels.

    For each of ``gender``, ``occupation``, ``ethnic_group``,
    ``academic_degree`` and ``religion``, every non-``None`` cell (an iterable
    of codes) becomes the list of ``match_code_label`` results for its
    elements; ``None`` cells stay ``None``.

    Args:
        chunk: DataFrame containing the five columns above. It is modified
            in place.

    Returns:
        The same ``chunk`` object with the columns translated.
    """
    def to_labels(codes):
        if codes is None:
            return None
        return [match_code_label(code) for code in codes]

    coded_columns = ('gender', 'occupation', 'ethnic_group', 'academic_degree', 'religion')
    for column in coded_columns:
        chunk[column] = chunk[column].apply(to_labels)
    return chunk
    
def get_age(chunk, actual_year):
    """Replace ``date_of_birth`` by an ``age`` column relative to a year.

    Only the first date of each row is used. The birth year is read as the
    leading signed number of the date string, up to the first ``-`` separator
    (for example ``'+1946-06-14T00:00:00Z'`` gives 1946 and
    ``'-0500-01-01T00:00:00Z'`` gives -500).

    Args:
        chunk: DataFrame with a ``date_of_birth`` column holding sequences of
            date strings (or ``None``). It is modified in place.
        actual_year: Year the age is computed for.

    Returns:
        The same ``chunk`` object, where ``date_of_birth`` has been replaced
        by an integer ``age`` column (``actual_year`` minus the birth year).
        Rows with a missing or empty ``date_of_birth`` are removed.

    Raises:
        ValueError: If a date string does not start with a parsable year.
    """
    def _first_date(dates):
        # Missing cells are None/NaN, i.e. scalars without a length
        if dates is None or not hasattr(dates, '__len__'):
            return None
        # Already a single date string
        if isinstance(dates, str):
            return dates
        return dates[0] if len(dates) > 0 else None

    def _age(date):
        # Keep the leading sign: slicing it off would turn BCE years into CE ones
        birth_year = int(date[0] + date[1:].split('-', 1)[0])
        return int(actual_year - birth_year)

    chunk['date_of_birth'] = chunk['date_of_birth'].apply(_first_date)
    # Drop before computing ages so the resulting column holds integers only
    chunk.dropna(axis=0, subset=['date_of_birth'], inplace=True)
    chunk['date_of_birth'] = chunk['date_of_birth'].apply(_age)
    chunk.rename(columns={'date_of_birth': 'age'}, inplace=True)
    return chunk

def create_sample_from_year(year, n_chunks=3, sample_size=100000, random_state=None):
    """Build a random sample from the preprocessed chunk files of a year.

    Reads ``chunk-1-<year>.json.bz2`` ... ``chunk-<n_chunks>-<year>.json.bz2``
    from ``DATA_FOLDER``, draws the same number of rows from each, and writes
    the concatenation to ``sample-<year>.json.bz2`` in ``DATA_FOLDER``.

    Args:
        year: Year used in the chunk and sample file names.
        n_chunks: Number of chunk files to read, starting at chunk 1.
        sample_size: Total number of rows aimed for; each chunk contributes
            ``sample_size // n_chunks`` rows.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the sample reproducible.

    Raises:
        ValueError: If ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError("n_chunks must be at least 1")

    rows_per_chunk = sample_size // n_chunks
    samples = []
    for chunk_number in range(1, n_chunks + 1):
        df_chunk = pd.read_json(DATA_FOLDER + f'chunk-{chunk_number}-{year}.json.bz2')
        # A chunk smaller than its quota contributes all of its rows rather
        # than making sample() raise ValueError.
        n_rows = min(rows_per_chunk, len(df_chunk))
        samples.append(df_chunk.sample(n=n_rows, random_state=random_state))

    df_sample = pd.concat(samples, ignore_index=True)
    df_sample.to_json(DATA_FOLDER + f'sample-{year}.json.bz2')
          
def preprocess_data(year, chunksize=1000000):
    """Preprocess the quotes file of a year chunk by chunk.

    Streams ``quotes-<year>-reduced.json.bz2`` from ``DATA_FOLDER`` and, for
    every chunk, merges the speaker attributes, translates the attribute
    codes to labels, filters on gender and computes the speaker age. Each
    processed chunk is written to ``chunk-<i>-<year>.json.bz2`` in
    ``DATA_FOLDER``, with ``i`` starting at 1.

    Args:
        year: Year of the quotes file; also the reference year for the age.
        chunksize: Number of lines read per chunk.
    """
    source_path = DATA_FOLDER + f'quotes-{year}-reduced.json.bz2'
    with pd.read_json(source_path, lines=True, compression='bz2', chunksize=chunksize) as df_reader:
        for chunk_number, chunk in enumerate(df_reader, start=1):
            chunk_merged = merge_chunk_attributes(chunk)
            chunk_readable = render_info_readable(chunk_merged)
            chunk_filtered = filter_gender(chunk_readable)
            chunk_aged = get_age(chunk_filtered, year)
            # Write the result of the last step explicitly instead of relying
            # on get_age having modified chunk_filtered in place.
            chunk_aged.to_json(DATA_FOLDER + f'chunk-{chunk_number}-{year}.json.bz2')

In [None]:
# Year whose quotes file is preprocessed and sampled by the following cells
year = 2015

In [None]:
# Read the year's quotes file in chunks and write one preprocessed file per chunk
preprocess_data(year)

In [None]:
# Draw a random sample from the preprocessed chunk files and save it to DATA_FOLDER
create_sample_from_year(year)

In [None]:
# Load the sample file written by create_sample_from_year
new_sample = pd.read_json(DATA_FOLDER + f'sample-{str(year)}.json.bz2')

In [None]:
# Preview the first rows of the sample
new_sample.head()