In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import plotly.express as px
import pycountry_convert as pc

In [3]:
metadata = pd.read_csv("metadata.csv")

# Using huggingface (easiest)
dataset = load_dataset("ClimatePolicyRadar/global-stocktake-documents")

# Using pandas
link = r'/Users/jiaminlim/Documents/Research/climate_nlp/global-stocktake-documents/full_text.parquet'
df = pd.read_parquet(link)

## Initial cleaning (full)

In [None]:
# TODO 
# Check for duplicates?
# XAB - India and Bhutan
# http://example.com

In [4]:
# Select relevant columns that you want
df = df[['geography_iso','author','document_name','date','type','type_confidence','text','text_block_id','types','document_source_url','translated']]

In [4]:
df['text'] = df['text'].str.replace('\n',' ')
df['text']  = df['text'].astype('str')

In [5]:
df["document_name"] =  df["document_name"].str.replace('\r','')

In [6]:
df['types0'] = df.types.apply(lambda x: x[0])
df['n_types'] = df.types.apply(lambda x: len(x)) # maximum two 'types'
df['types1'] = df.types.apply(lambda x: x[1] if len(x)>1 else 'n/a')

In [7]:
df['types0'] = df['types0'].replace('Global Stocktake Synthesis Reports', 'Global Stocktake Synthesis Report', regex=True)
df['types0'] = df['types0'].replace('National Communcation', 'National Communication', regex=True)

In [8]:
df = df.drop('types', axis=1)

In [9]:
df['n_author'] = df.author.apply(lambda x: len(x))
df['first_author'] = df.author.apply(lambda x: x[0])
df['is_author_country'] = np.where(df['geography_iso'] != 'nan', True, False)

In [10]:
# Date - add year and m-y column
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['month_year'] = df['date'].dt.to_period('M')
df["date_document_name"] = df["date"].astype(str) + " - " + df["document_name"]

In [11]:
df['n_words'] =  df['text'].str.count(' ') + 1
df['bool_ipcc'] =  df['text'].str.contains("IPCC")

In [12]:
df['text_startsLC'] = df['text'].astype(str).str[0].str.islower()

In [13]:
df['text_startsLC'].value_counts()

text_startsLC
False    1110110
True      131762
Name: count, dtype: int64

In [14]:
# Add country names and continents
def country_to_continent(country_alpha3):
    try:
        # country_name = pc.country_alpha3_to_country_name(country_alpha3)
        country_alpha2 = pc.country_alpha3_to_country_alpha2(country_alpha3)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    except:
            if country_alpha3 =='TLS':
                country_continent_name = "Asia"
            elif country_alpha3 =='UAE':
                country_continent_name = "Asia"
            elif country_alpha3 =='VAT':
                country_continent_name = "Europe"
            elif country_alpha3 =='EUR':
                country_continent_name = "Europe"
            else:
                country_continent_name = "null"
    return country_continent_name

In [15]:
def countrycode_to_name(country_alpha3):
    try:
        country_alpha2 = pc.country_alpha3_to_country_alpha2(country_alpha3)
        #print(country_alpha2)
        country_name = pc.country_alpha2_to_country_name(country_alpha2)
    except:
        country_name = "null"
    return country_name

In [16]:
df['Continent'] = df['geography_iso'].map(country_to_continent)

In [17]:
df['geography_iso_name'] = df['geography_iso'].map(countrycode_to_name)

## Initial checks (full)

In [None]:
df.isna().sum()

In [None]:
df[df['geography_iso']== 'XAB']['first_author'].value_counts()

### Word count

In [None]:
df['n_words'].hist(bins=100)
max(df['n_words'])

In [None]:
df_long = pd.DataFrame(df[df['n_words']>1000])
# pd.set_option('display.max_colwidth', None)
# df_long

In [None]:
# Words per document
wordcount = pd.DataFrame(df.groupby('date_document_name')['n_words'].sum())
#wordcount['n_words'].hist(bins=100)

### Text count

In [None]:
textcount = pd.DataFrame(df.groupby('geography_iso')['text'].nunique())
textcount = textcount.rename(columns={'text': 'n_text'})
textcount = textcount.reset_index()
i = textcount[((textcount.geography_iso == 'nan') &( textcount.n_text == 110271))].index
textcount = textcount.drop(i)
textcount.sort_values(['n_text'],ascending=False).head(20)

In [None]:
fig = px.choropleth(textcount, locations="geography_iso",
                    color="n_text", 
                    #hover_name="geography_iso", 
                    title="Number of Text Rows by Country",
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

In [None]:
textcount_time = df.groupby('year')['text'].nunique()
textcount_time.plot.bar()

### Document count

In [None]:
# Unique documents per geography and report type
print(df['document_name'].nunique())
pivot_doccount= pd.pivot_table(data=df,index='geography_iso',columns='types0',values='document_name',aggfunc=lambda x: len(x.unique()),margins=True)
pivot_doccount

In [None]:
doccount = pd.DataFrame(df.groupby('geography_iso')['document_name'].nunique())
doccount = doccount.rename(columns={'document_name': 'n_docs'})
doccount = doccount.reset_index()
i = doccount[((doccount.geography_iso == 'nan') &( doccount.n_docs == 512))].index
doccount = doccount.drop(i)
#docname.sort_values(['n_docs'],ascending=False).head(20)

In [None]:
fig = px.choropleth(doccount, locations="geography_iso",
                    color="n_docs", 
                    #hover_name="geography_iso", 
                    title="Number of Documents by Country",
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

In [None]:
doccount_type = pd.DataFrame(df.groupby('types0')['document_name'].nunique())
doccount_type.sort_values('document_name').plot.barh(legend=None)

In [None]:
doccount_time = df.groupby('year')['document_name'].nunique()
doccount_time.plot.bar()

### Type

In [None]:
# Type seems to correspond to type (e.g. text, figure)
df['type_confidence'].hist(bins=80)

### Translated

In [None]:
df['translated'].value_counts()

In [None]:
len(df)

## Data sub-sets

In [18]:
filter_ipccmentions = (df['bool_ipcc']==True)
filter_ipccreport = (df['types0']=='IPCC Report')

In [19]:
# Filters: References to IPCC reports
ref0 = (df['text'].str.startswith("IPCC."))
ref1 = (df['text'].str.startswith("IPCC ("))
ref2 = (df['text'].str.startswith("IPCC, "))
ref3 = (df['text'].str.startswith("Intergovernmental Panel on Climate Change (IPCC). "))
ref4 = (df['text'].str.contains("Contribution of Working Group"))
ref5 = (df['text'].str.contains("IPCC Special Report on the impacts of global warming of 1.5Â°C above pre"))

refs = (ref0 | ref1 | ref2 | ref3 | ref4 | ref5)
not_refs = (~ref0 & ~ref1 & ~ref2 & ~ref3 & ~ref4 & ~ref5)

In [20]:
# Filters: Body text
body0 = (~df['text'].str.startswith("Source:"))
body1 = (~df['text'].str.startswith("Legend: "))
body2 = (~df['text'].str.startswith("*"))
body3 = (df['type']=='Text')
body4 = (df['n_words']>10)
bodys = body0 & body1 & body1 & body2 & body3 & body4

In [21]:
isauthorcountry = (df['is_author_country'] == True)

## Dataset subset: Country Submissions

In [22]:
df_countrysubmissions = df[isauthorcountry]
df_countrysubmissions.to_csv('df_countrysubmissions.csv')
len(df_countrysubmissions)

1102177

## Dataset subset: IPCC mentions in Country Submissions

In [23]:
df_ipccmentions = df[(filter_ipccmentions)& isauthorcountry]
len(df_ipccmentions)

25948

In [24]:
df_ipccmentions['document_name'].nunique()

794

### Body

In [25]:
df_ipccmentions_body = df[(filter_ipccmentions)
                    & isauthorcountry
                    & not_refs
                    & bodys]
len(df_ipccmentions_body)

18848

In [26]:
df_ipccmentions_body['document_name'].nunique()

750

In [27]:
df_ipccmentions_body.to_csv("df_ipccmentions_body.csv")
df_ipccmentions_body_readin = pd.read_csv("df_ipccmentions_body.csv")
len(df_ipccmentions_body_readin)

18848

In [28]:
df_ipccmentions_body['text_startsLC'].value_counts()

text_startsLC
False    17688
True      1160
Name: count, dtype: int64

### References to IPCC report

In [29]:
df_ipccmentions_ipccrefs = df[(filter_ipccmentions)
                    & isauthorcountry
                    & refs]
len(df_ipccmentions_ipccrefs)

817

In [30]:
df_ipccmentions_ipccrefs.to_csv('df_ipccmentions_ipccrefs.csv')
df_ipccmentions_ipccrefs_readin = pd.read_csv("df_ipccmentions_ipccrefs.csv")
len(df_ipccmentions_ipccrefs_readin)

817

In [31]:
df_ipccmentions_ipccrefs['document_name'].nunique()

243

### Other (to check)

## Dataset subset: IPCC report

In [32]:
df_ipccreport = df[(filter_ipccreport)]
len(df_ipccreport)

37745

In [33]:
df_ipccreport['document_name'].nunique()

52

### Body

In [34]:
r0 = df_ipccreport['text'].str.contains("Association of Polar Early Career Scientist \(APECS\)")
r1 = df_ipccreport['text'].str.contains("YESS \(Young Earth System Scientists community\)")
r2 = df_ipccreport['text'].str.contains("United Kingdom \(of Great Britain and Northern Ireland\)")
r3 = df_ipccreport['text'].str.contains("Permafrost Young Research Network \(PYRN\)")
r4 = df_ipccreport['text'].str.contains("\(USA\)")
r5 = df_ipccreport['text'].str.contains("\(United States of America\)")
r6 = df_ipccreport['text'].str.contains("\(The United States of America\)")
r7 = df_ipccreport['text'].str.contains("\(the United States of America\)")
r8 = df_ipccreport['text'].str.contains("\(United Kingdom\)")
r9 = df_ipccreport['text'].str.contains("\(France\)")
r10 = df_ipccreport['text'].str.contains("\(Germany\)")
remove_authors = r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7 | r8 | r9 | r10

In [35]:
df_ipccreport_body = df[(filter_ipccreport)
                    & not_refs
                    & bodys
                    & ~remove_authors]
len(df_ipccreport_body)

16666

In [36]:
df_ipccreport_body.to_csv('df_ipccreport_body.csv')
df_ipccreport_body_readin = pd.read_csv("df_ipccreport_body.csv")
len(df_ipccreport_body_readin)

16666

In [37]:
df_ipccreport_body['document_name'].nunique()

49

In [38]:
df_ipccreport_body['text_startsLC'].value_counts()

text_startsLC
False    14389
True      2277
Name: count, dtype: int64

### References to IPCC report

In [39]:
df_ipccreport_ipccrefs = df[(filter_ipccreport)
                    & refs]
len(df_ipccreport_ipccrefs)

210

In [40]:
df_ipccreport_ipccrefs['document_name'].nunique()

50

### Other (to check)

## Summary tables and figures

In [None]:
# Summary stats
# words per row
df["n_words"].describe()

In [None]:
# row per document
df_rowperdoc = df.groupby('document_name')['text'].count()
df_rowperdoc.describe()
#df_rowperdoc.sort_values(ascending=False).head(5)


In [None]:
# words per country
df_rowperdoc = df[df['is_author_country'] == True].groupby('geography_iso')['text'].count()
df_rowperdoc.describe()

In [None]:
# documents per country
df_rowperdoc = df[df['is_author_country'] == True].groupby('geography_iso')['document_name'].nunique()
df_rowperdoc.describe()