In [1]:
import numpy as np
import pandas as pd
import bz2
import datetime
import json

## Choose the year of the dataset that you want to reduce

In [2]:
# Year of the Quotebank dump to reduce; the file names below are derived from it.
YEAR = 2020
# f-strings format the integer directly, so no explicit str() conversion is needed.
INITIAL_DATASET = f"quotes-{YEAR}.json.bz2"
REDUCED_DATASET = f"quotes-{YEAR}-reduced.json.bz2"

## For years 2015, 2016, 2018, 2019 and 2020

First, we remove all quotations whose speaker is unknown. Then, we keep only the quotations for which the probability of the attributed speaker is higher than 0.7. Indeed, we want a high probability of having the real speaker, since our analysis mainly focuses on the characteristics of the speakers.

We drop the columns that are not useful in our analysis: 'quoteID', 'date', 'probas', 'urls', 'phase'. For 2017 and 2018, the 'date' column is kept.

In [3]:
# Reduce a yearly dataset: keep only the quotations attributed to a known
# speaker whose first listed candidate probability is > MIN_SPEAKER_PROBA,
# and drop the columns that are not needed afterwards.
# Input is read line by line (one JSON object per line) and the kept records
# are re-serialised, one per line, into a new bz2 file.

path_to_file = '/content/drive/MyDrive/Quotebank/' + INITIAL_DATASET
path_to_out = '/content/drive/MyDrive/data/' + REDUCED_DATASET

# A quotation is kept only if its first candidate's probability exceeds this.
MIN_SPEAKER_PROBA = 0.7

# 'date' is kept for 2017 and 2018; the 2017 reduced file is split by month
# in a later cell, which needs that field.
if YEAR in [2017, 2018]:
    keys_to_remove = ['quoteID', 'probas', 'urls', 'phase']
else:
    keys_to_remove = ['quoteID', 'date', 'probas', 'urls', 'phase']

with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    for line in s_file:
        instance = json.loads(line)  # loading a sample

        # NOTE(review): Quotebank appears to mark unattributed quotations with
        # the literal string "None" rather than JSON null - confirm against the
        # dataset description. Both encodings are treated as "unknown" here.
        speaker = instance['speaker']
        has_known_speaker = speaker is not None and speaker != 'None'

        # probas[0][1] is the probability of the first listed candidate.
        # NOTE(review): it is presumably stored as a numeric string; float()
        # accepts both strings and numbers, so the comparison cannot raise a
        # TypeError either way.
        if has_known_speaker and float(instance['probas'][0][1]) > MIN_SPEAKER_PROBA:
            for k in keys_to_remove:
                del instance[k]
            d_file.write((json.dumps(instance) + '\n').encode('utf-8'))

## For 2017

For 2017, we decided to split the file into two separate files: one with the quotations before the #MeToo movement, and one with the quotations after the movement.

In [4]:
# 2017 split, "before" part: copy the quotations dated January-September
# (i.e. before October, taken here as the start of #MeToo) into their own file.
# NOTE(review): the output name starts with 'quoted-' while the input uses
# 'quotes-' - confirm this is the name downstream notebooks expect.
path_to_file = '/content/drive/MyDrive/data/quotes-2017-reduced.json.bz2'
path_to_out = '/content/drive/MyDrive/data/quoted-2017-reduced-before.json.bz2'

with bz2.open(path_to_file, 'rb') as src, bz2.open(path_to_out, 'wb') as dst:
    for raw_line in src:
        quote = json.loads(raw_line)  # one JSON record per line
        quote_month = datetime.datetime.strptime(quote['date'], "%Y-%m-%d %H:%M:%S").month
        if quote_month >= 10:
            # October or later: belongs to the "after" file, skip it here.
            continue
        dst.write((json.dumps(quote) + '\n').encode('utf-8'))
                
  

In [5]:
# 2017 split, "after" part: copy the quotations dated October-December
# (October being taken here as the start of #MeToo) into their own file.
# NOTE(review): the output name starts with 'quoted-' while the input uses
# 'quotes-' - confirm this is the name downstream notebooks expect.
path_to_file = '/content/drive/MyDrive/data/quotes-2017-reduced.json.bz2'
path_to_out = '/content/drive/MyDrive/data/quoted-2017-reduced-after.json.bz2'

with bz2.open(path_to_file, 'rb') as src, bz2.open(path_to_out, 'wb') as dst:
    for raw_line in src:
        quote = json.loads(raw_line)  # one JSON record per line
        quote_month = datetime.datetime.strptime(quote['date'], "%Y-%m-%d %H:%M:%S").month
        if quote_month < 10:
            # January-September: belongs to the "before" file, skip it here.
            continue
        dst.write((json.dumps(quote) + '\n').encode('utf-8'))

### Note that the data are now stored in files named "quotes-year-reduced.json.bz2". These files will be used for further preprocessing. See [here](Preprocessing.ipynb).