## Imports

In [1]:

import pandas as pd
import bz2
import json
import math
import datetime

## Datasets

In [2]:
# Folder that holds the input files and every file written by this notebook.
DATA_FOLDER = '../data/'
# Parquet file that is read into the `attributes` dataframe.
SPEAKER_ATTRIBUTES_DATA = DATA_FOLDER+"speaker_attributes.parquet"
# bz2-compressed CSV that is read into the `wiki_labels` dataframe (indexed by its 'QID' column).
WIKIDATA_LABELS = DATA_FOLDER + "wikidata_labels_descriptions_quotebank.csv.bz2"

In [3]:
# Read the Wikidata labels dataframe, indexed by QID; its 'Label' column is what match_code_label returns.
wiki_labels = pd.read_csv(WIKIDATA_LABELS, compression='bz2', index_col='QID')

In [4]:
# Read the speaker attributes dataframe from the parquet file.
attributes = pd.read_parquet(SPEAKER_ATTRIBUTES_DATA)

## Keep only the attributes of interest and rename the column for the future merge.

In [5]:
# Select the columns with plain indexing: a missing column then raises a KeyError naming it,
# whereas DataFrame.get would silently return None and only fail later on the rename.
# 'id' is renamed to 'qids' so that it matches the key column of the quotation chunks.
attributes = attributes[
    ['gender', 'date_of_birth', 'occupation', 'id', 'ethnic_group', 'academic_degree', 'religion']
].rename(columns={'id': 'qids'})

## Preprocessing to create a sample for the first descriptive tasks

In [6]:
def merge_chunk_attributes(chunk):
    """
    Merge a chunk of quotations with the wikidata attributes of their speaker.

    Only the quotations whose speaker is linked to exactly one qid are kept: with several
    qids we cannot know which of the homonyms is the real speaker, and with no qid there
    is nothing to merge on.

    Parameters:
    ----------
    chunk: dataframe with a 'qids' column holding, for each quotation, the sequence of
    candidate qids of the speaker. The dataframe is not modified.

    Returns:
    ----------
    chunk_merged: a new dataframe, inner join of the kept quotations with the module-level
    `attributes` dataframe on 'qids' (where 'qids' now holds a single qid per row).

    """
    # Keep exactly-one-qid rows; an empty sequence would otherwise fail on the [0] below
    has_single_qid = chunk['qids'].map(len) == 1
    # Work on a copy so that the caller's dataframe is left untouched
    single = chunk[has_single_qid].copy()
    # Unwrap the only qid of each remaining row
    single['qids'] = single['qids'].map(lambda qids: qids[0])
    # Merge with the attributes read from the parquet file
    return single.merge(attributes, on='qids', how='inner')
    
def match_code_label(code):
    """
    Match the wikidata code 'Q...' with its readable information.

    Parameters:
    ----------
    code: string, or None

    Returns:
    ----------
    The value of the 'Label' column of `wiki_labels` for this code, or None when the code
    is None or is not present in the index of `wiki_labels`.

    """
    # None is a singleton: test it by identity rather than with ==
    if code is None or code not in wiki_labels.index:
        return None
    return wiki_labels.loc[code, 'Label']

def render_info_readable(chunk):
    """
    Replace the wikidata codes 'Q...' of the speaker attributes by readable information.

    The columns 'gender', 'occupation', 'ethnic_group', 'academic_degree' and 'religion'
    are overwritten in place: each non-None cell becomes the list of the labels returned
    by match_code_label for its codes, and None cells stay None. No row is removed.

    Parameters:
    ----------
    chunk: dataframe

    Returns:
    ----------
    chunk: the same dataframe, with readable information

    """
    def to_labels(codes):
        # Missing attribute: nothing to translate
        if codes is None:
            return None
        return [match_code_label(code) for code in codes]

    coded_columns = ('gender', 'occupation', 'ethnic_group', 'academic_degree', 'religion')
    for name in coded_columns:
        chunk[name] = chunk[name].apply(to_labels)

    return chunk


def filter_gender(chunk):
    """
    Keep only the first gender in the list of genders, and drop the quotations with unknown gender.

    A gender is unknown when the cell is missing, when the list is empty, or when the first
    entry of the list is None (a code for which no label was found).

    Parameters:
    ----------
    chunk: dataframe with a 'gender' column holding a sequence of genders (or a missing value).
    It is modified in place.

    Returns:
    ----------
    chunk: the same dataframe with only one gender per row.

    """
    def first_gender(genders):
        try:
            return genders[0] if len(genders) > 0 else None
        except TypeError:
            # Missing cell (None or NaN): it has no length
            return None

    # Keep only the first gender in the list
    chunk['gender'] = chunk['gender'].apply(first_gender)
    # Drop afterwards, so that rows whose first gender turned out to be None are removed too
    chunk.dropna(axis=0, subset=['gender'], inplace=True)
    return chunk


    
def get_age(chunk, actual_year):
    """
    Replace the column 'date_of_birth' by a column 'age', with the age calculated at the time of the quotation.

    Rows with a missing 'date_of_birth' are dropped. Only the first date of each list is
    used, and its year is read from characters 1 to 4 of the date string (the character
    at position 0 is skipped).

    Parameters:
    ----------
    chunk: dataframe with a 'date_of_birth' column holding a sequence of date strings.
    It is modified in place.
    actual_year: year of the dataframe, either a number or a label starting with the four
    digits of the year such as '2017-before'.

    Returns:
    ----------
    chunk: the same dataframe with the age of speakers reported

    Raises:
    ----------
    ValueError: if the first four characters of actual_year are not a year.

    """
    # The year labels used in this notebook ('2017-before', ...) are strings:
    # only their leading four digits can be used in the subtraction below.
    year = int(str(actual_year)[:4])

    chunk.dropna(axis=0, subset=['date_of_birth'], inplace=True)
    chunk['date_of_birth'] = chunk['date_of_birth'].apply(
        lambda dates: int(year - float(dates[0][1:5]))
    )
    chunk.rename(columns={'date_of_birth': 'age'}, inplace=True)
    return chunk

def preprocess_data(year):
    """
    Preprocess the data by:
    -merging the data with the attributes of the speakers
    -rendering the information readable
    -removing unknown gender
    -calculating age and removing unknown ages

    The function stores the preprocessed data in files named "chunk-i-year.json.bz2", with i=1,...n for n
    the number of chunks.

    Parameters:
    ----------
    year: string year of the dataframe to process. Expected values: '2015', '2016', '2017-before',
    '2017-after', '2018', '2019' or '2020'. The first four characters must be the year.

    Returns:
    ----------
    int: the number of chunk files written (the value expected as nb_chunks by filter_all_attributes).

    """
    DATA_TO_PROCESS = DATA_FOLDER + f'quotes-{year}-reduced.json.bz2'
    # The age is computed by subtraction, so it needs the numeric year and not the label
    numeric_year = int(str(year)[:4])
    # Both halves of 2017 ('2017-before' and '2017-after') must take the 2017 branch below
    is_2017 = str(year).startswith('2017')

    nb_chunks = 0
    with pd.read_json(DATA_TO_PROCESS, lines=True, compression='bz2', chunksize=1000000) as df_reader:
        for nb_chunks, chunk in enumerate(df_reader, start=1):
            chunk_merged = merge_chunk_attributes(chunk)
            chunk_readable = render_info_readable(chunk_merged)
            chunk_filtered = filter_gender(chunk_readable)
            chunk_aged = get_age(chunk_filtered, numeric_year)
            if is_2017:
                # NOTE(review): presumably only the 2017 files carry a 'date' column - confirm.
                # errors='ignore' keeps this a no-op when the column is absent.
                chunk_aged.drop(columns=['date'], inplace=True, errors='ignore')
            chunk_aged.to_json(DATA_FOLDER + f'chunk-{nb_chunks}-{year}.json.bz2')
    return nb_chunks

def create_sample_from_year(year, nb_chunks=3, sample_size=100000, random_state=None):
    """
    Create a sample for the year by randomly sampling the same number of quotations in each of the
    first nb_chunks chunks created by the function preprocess_data. With the default values, 33333
    quotations are drawn from each of the first 3 chunks, which results in 99999 quotations for the
    given year. This function stores the created sample in a file named "sample-year.json.bz2".

    Parameters:
    ----------
    year: string. Year of the dataframe to process. Expected values: '2015', '2016', '2017-before',
    '2017-after', '2018', '2019' or '2020'.
    nb_chunks: number of chunk files to sample from, starting at chunk 1.
    sample_size: total number of quotations aimed at; each chunk contributes sample_size // nb_chunks
    quotations, or all of its rows if it holds fewer.
    random_state: forwarded to DataFrame.sample for each chunk; leave to None for a non-reproducible draw.

    """
    per_chunk = sample_size // nb_chunks
    parts = []
    for number in range(1, nb_chunks + 1):
        df_chunk = pd.read_json(DATA_FOLDER + f'chunk-{number}-{year}.json.bz2')
        # DataFrame.sample raises when asked for more rows than the chunk holds
        wanted = min(per_chunk, len(df_chunk))
        parts.append(df_chunk.sample(wanted, random_state=random_state))
    df_sample = pd.concat(parts, ignore_index=True)
    df_sample.to_json(DATA_FOLDER + f'sample-{year}.json.bz2')
    


## Example of how to create a preprocessed sample 

In [7]:
# Write the preprocessed chunk files for this year label first,
# then build the sample from the chunk files that were just written.
year = '2017-before'
preprocess_data(year)
create_sample_from_year(year)

In [8]:
# Reload the sample file written by create_sample_from_year to inspect it.
sample = pd.read_json(DATA_FOLDER + f'sample-{year}.json.bz2')

In [9]:
# Number of quotations in the sample.
sample.shape[0]

99999

In [10]:
# Preview the first rows of the sample.
sample.head()

Unnamed: 0,quotation,speaker,qids,numOccurrences,gender,age,occupation,ethnic_group,academic_degree,religion
0,You can have Meryl Streep acting till she's 90...,Robert Legato,Q8342,1,male,61,"[artist, cinematographer]",,,
1,Danny is like a double child,Danny Dyer,Q712803,2,male,40,"[actor, entrepreneur, sports journalist, stage...",,,
2,"If we ignore the timeline, the minister could ...",John Robinson,Q11310811,1,male,53,[disc jockey],,,
3,The City Council will need to decide if the AS...,Scott Whyte,Q7437503,1,male,39,"[actor, television actor, voice actor]",,,
4,one of the most influential art movements ever...,Jeffrey Deitch,Q11309296,1,male,67,"[curator, art dealer]",,,


## Filtering to keep only the quotations for which all of the speaker's attributes are known.

In [11]:
def filter_all_attributes(year, nb_chunks):
    """
    Filter all chunks of a given year by removing every quote with a missing value in one of the
    speaker attributes 'occupation', 'ethnic_group', 'academic_degree' or 'religion'.
    Concatenate the filtered chunks and store the result in a file "filtered-year.json.bz2".

    Parameters:
    ----------
    year: string. Year of the dataframe to process. Expected values: '2015', '2016', '2017-before',
    '2017-after', '2018', '2019' or '2020'.
    nb_chunks: number of chunk files created earlier with the preprocess_data function for the given year.

    """
    required = ['occupation', 'ethnic_group', 'academic_degree', 'religion']
    complete_chunks = []
    for number in range(1, nb_chunks + 1):
        df_chunk = pd.read_json(DATA_FOLDER + f'chunk-{str(number)}-{str(year)}.json.bz2')
        # A row is kept only if none of the required attributes is missing
        complete_chunks.append(df_chunk.dropna(axis=0, subset=required))
    df_year = pd.concat(complete_chunks, ignore_index=True)
    df_year.to_json(DATA_FOLDER + f'filtered-{year}.json.bz2')
        

## Example of how to filter an entire year

In [12]:
# Year label to filter, and the number of chunk files that preprocess_data created for it.
year = '2017-before'
nb_chunks = 9

In [13]:
# Writes the file "filtered-<year>.json.bz2" in DATA_FOLDER.
filter_all_attributes(year, nb_chunks)

In [14]:
# Reload the filtered file written by filter_all_attributes to inspect it.
filtered_year = pd.read_json(DATA_FOLDER + f'filtered-{(year)}.json.bz2')

In [15]:
# Preview the first rows of the filtered data.
filtered_year.head()

Unnamed: 0,quotation,speaker,qids,numOccurrences,gender,age,occupation,ethnic_group,academic_degree,religion
0,50 Years of Breaking Barriers and Building Fut...,Maxine Waters,Q461727,1,female,79,"[politician, teacher]",[African Americans],[Bachelor of Arts],[Christianity]
1,"We can do this, you all, but we've got to stan...",Maxine Waters,Q461727,1,female,79,"[politician, teacher]",[African Americans],[Bachelor of Arts],[Christianity]
2,we're in some abnormal times. What impact this...,Maxine Waters,Q461727,1,female,79,"[politician, teacher]",[African Americans],[Bachelor of Arts],[Christianity]
3,would have the wisdom to ask you to stay on,Maxine Waters,Q461727,9,female,79,"[politician, teacher]",[African Americans],[Bachelor of Arts],[Christianity]
4,"According to the Waters aide, the congresswoma...",Maxine Waters,Q461727,1,female,79,"[politician, teacher]",[African Americans],[Bachelor of Arts],[Christianity]
