In [1]:
import pandas as pd
import bz2
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from utils import quotes_by_gender, unique_speaker_per_gender, get_champlex
from collections import Counter
import math

In [2]:
DATA_FOLDER = 'data/'


DATA_2015 = DATA_FOLDER+"filtered-2015.json.bz2"
DATA_2016 = DATA_FOLDER+"filtered-2016.json.bz2"
DATA_2017_BEFORE = DATA_FOLDER + "filtered-2017-before.json.bz2"
DATA_2017_AFTER = DATA_FOLDER + "filtered-2017-after.json.bz2"
DATA_2018 = DATA_FOLDER+"filtered-2018.json.bz2"
DATA_2019 = DATA_FOLDER+"filtered-2019.json.bz2"
DATA_2020 = DATA_FOLDER+"filtered-2020.json.bz2"


In [3]:
def read_all_chunks(year, nb_chunk):
    chunks=[]
    for i in range(nb_chunk):
        chunks.append(pd.read_json(DATA_FOLDER+f'chunk-{str(i+1)}-{year}.json.bz2'))
    return pd.concat(chunks)
        
        

In [76]:
quotes_2020 = read_all_chunks('2020', 3)

In [77]:
quotes_2020.drop(['religion', 'ethnic_group'], axis = 1, inplace = True)
#print(quotes_2020.dropna().shape[0])

In [78]:
print((quotes_2020.dropna()).shape[0])

111794


In [20]:
print('initial number of quotes in 2020:', quotes_2020.shape[0])

initial number of quotes in 2020: 1571363


In [31]:
print('If we drop all NaN, we are left with', quotes_2020.dropna().shape[0], 'quotes.')
print('It means that we loose', quotes_2020.shape[0] - quotes_2020.dropna().shape[0], 'quotes.')

If we drop all NaN, we are left with 33622 quotes.
It means that we loose 1537741 quotes.


In [35]:
print('Number of quotes if we removed unknown ethnic_groups', quotes_2020.dropna(subset = ['ethnic_group']).shape[0])
print('We lost',  quotes_2020.shape[0]-quotes_2020.dropna(subset = ['ethnic_group']).shape[0], 'quotes')

Number of quotes if we removed unknown ethnic_groups 180957
We lost 1390406 quotes


In [24]:
without_ethnic_group = quotes_2020.drop('ethnic_group', axis = 1)

In [27]:
print('If we remove the column ethnic group and drop all NaN, we are left with',
      without_ethnic_group.dropna().shape[0], 'quotes: we loose', quotes_2020.shape[0] - without_ethnic_group.dropna().shape[0], 'quotes')

If we remove the column ethnic group and drop all NaN, we are left with 88966 quotes: we loose 1482397 quotes


In [30]:
without_religion_and_ethnic_group = without_ethnic_group.drop('religion', axis = 1)

In [34]:
print('If we do not take religion neither ethnic group, we have', without_religion_and_ethnic_group.dropna().shape[0], 'quotes without None.')

If we do not take religion neither ethnic group, we have 111794 quotes without None.


I think the best compromise is to: drop `ethnic_group` column because too much None. Then we need to drop one between `religion` and `academic_degree`. I think that in our analysis, what is more relevant is to keep the `academic_degree`.

In [4]:
def create_observational_data(year, nb_chunk):
    chunks=[]
    for i in range(nb_chunk):
        chunk = pd.read_json(DATA_FOLDER+f'chunk-{str(i+1)}-{year}.json.bz2')
        chunk.drop(['religion', 'ethnic_group'], axis = 1, inplace = True)
        chunk.dropna(inplace = True)
        chunks.append(chunk)
        df_obs = pd.concat(chunks)
        df_obs.reset_index(drop=True, inplace=True)
        df_obs.to_json(DATA_FOLDER + f'observational-{year}.json.bz2')

In [5]:
create_observational_data('2018', 13)

In [6]:
create_observational_data('2017-after', 4)

In [7]:
create_observational_data('2017-before', 9)

In [5]:
quotes_2018 = pd.read_json(DATA_FOLDER+f'observational-2018.json.bz2')
quotes_2020 = pd.read_json(DATA_FOLDER+f'observational-2020.json.bz2')

In [7]:
quotes_2020 = pd.read_json(DATA_FOLDER+f'observational-2020.json.bz2')

In [10]:
quotes_2020

Unnamed: 0,quotation,speaker,qids,numOccurrences,gender,age,occupation,academic_degree
11,[ These ] actions will allow households who ha...,Ben Carson,Q816459,1,male,69,"[psychologist, neurosurgeon, politician, autho...",[Doctor of Medicine]
12,be pivotal in addressing financial frustrations,Ben Carson,Q816459,1,male,69,"[psychologist, neurosurgeon, politician, autho...",[Doctor of Medicine]
13,We're talking about `Do we want to continue th...,Ben Carson,Q816459,1,male,69,"[psychologist, neurosurgeon, politician, autho...",[Doctor of Medicine]
14,President Trump's State of the Union address p...,Ben Carson,Q816459,1,male,69,"[psychologist, neurosurgeon, politician, autho...",[Doctor of Medicine]
15,reallocating funding and [ prioritizing ] shel...,Ben Carson,Q816459,1,male,69,"[psychologist, neurosurgeon, politician, autho...",[Doctor of Medicine]
...,...,...,...,...,...,...,...,...
716281,My 3 year old is having a rough time,Moshe Marcus,Q15454525,1,male,83,[mathematician],[doctorate]
716452,I think most businesses and certainly the behe...,Edmund Phelps,Q192566,16,male,87,"[economist, university teacher]",[doctorate]
716553,"Every place must identify its strongest, most ...",Robert Merton Solow,Q157268,1,male,96,"[economist, university teacher, professor]",[Doctor of Philosophy]
716813,The Federal Government is of the opinion that ...,Margarete Bause,Q1894938,1,female,61,[politician],[Diplom]


In [11]:
quotes_2015 = pd.read_json(DATA_FOLDER+f'observational-2015.json.bz2')
quotes_2016 = pd.read_json(DATA_FOLDER+f'observational-2016.json.bz2')
quotes_2017_before = pd.read_json(DATA_FOLDER+f'observational-2017-before.json.bz2')
quotes_2017_after = pd.read_json(DATA_FOLDER+f'observational-2017-after.json.bz2')
quotes_2018 = pd.read_json(DATA_FOLDER+f'observational-2018.json.bz2')
quotes_2019 = pd.read_json(DATA_FOLDER+f'observational-2019.json.bz2')
quotes_2020 = pd.read_json(DATA_FOLDER+f'observational-2018.json.bz2')

In [12]:
quotes_2015['label'] = 0
quotes_2016['label'] = 0
quotes_2017_before['label'] = 0
quotes_2017_after['label'] = 1
quotes_2018['label'] = 1
quotes_2019['label'] = 1
quotes_2020['label'] = 1

In [13]:
quotes_2015['year'] = 2015
quotes_2016['year'] = 2016
quotes_2017_before['year'] = 2017
quotes_2017_after['year'] = 2017
quotes_2018['year'] = 2018
quotes_2019['year'] = 2019
quotes_2020['year'] = 2020

In [14]:
quotes = pd.concat([quotes_2015, quotes_2016, quotes_2017_before, 
                    quotes_2017_after, quotes_2018, quotes_2019, quotes_2020])

In [15]:
quotes

Unnamed: 0,quotation,speaker,qids,numOccurrences,gender,age,occupation,academic_degree,label,year
0,I am convinced that this conflict won't be sol...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
1,agreed that the E.U. should take further measu...,Angela Merkel,Q567,6,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
2,Germany will continue to do everything to supp...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
3,There has already been voluntary debt forgiven...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
4,They can be lifted if the reasons why they wer...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
...,...,...,...,...,...,...,...,...,...,...
329687,Our contribution to WFP will directly address ...,Cyrill Nunn,Q1148757,1,male,60,[diplomat],[doctorate],1,2020
329688,The oligarchs are competing with each another ...,Konstantin Remchukov,Q4393205,1,male,64,"[politician, journalist]",[Candidate of Economic Sciences],1,2020
329689,uncover and prevent many sabotage plots of the...,Nguyen Phu Trong,Q318458,5,male,74,[politician],"[Candidate of Historical Sciences, Doctor of P...",1,2020
329690,We really should look at our planet like a Ros...,Lisa Kaltenegger,Q4511414,5,female,41,"[astronomer, physicist, university teacher]",[doctorate],1,2020


In [16]:
quotes.reset_index(drop=True, inplace = True)

In [17]:
quotes

Unnamed: 0,quotation,speaker,qids,numOccurrences,gender,age,occupation,academic_degree,label,year
0,I am convinced that this conflict won't be sol...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
1,agreed that the E.U. should take further measu...,Angela Merkel,Q567,6,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
2,Germany will continue to do everything to supp...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
3,There has already been voluntary debt forgiven...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
4,They can be lifted if the reasons why they wer...,Angela Merkel,Q567,1,female,61,"[politician, physicist, statesperson, chemist]",[doctorate],0,2015
...,...,...,...,...,...,...,...,...,...,...
1768453,Our contribution to WFP will directly address ...,Cyrill Nunn,Q1148757,1,male,60,[diplomat],[doctorate],1,2020
1768454,The oligarchs are competing with each another ...,Konstantin Remchukov,Q4393205,1,male,64,"[politician, journalist]",[Candidate of Economic Sciences],1,2020
1768455,uncover and prevent many sabotage plots of the...,Nguyen Phu Trong,Q318458,5,male,74,[politician],"[Candidate of Historical Sciences, Doctor of P...",1,2020
1768456,We really should look at our planet like a Ros...,Lisa Kaltenegger,Q4511414,5,female,41,"[astronomer, physicist, university teacher]",[doctorate],1,2020


In [18]:
quotes.to_json(DATA_FOLDER + 'data_observational.json.bz2')