In [1]:
import pandas as pd
import numpy as np
import pycountry_convert as pc
from countrygroups import G20
import plotly.express as px


In [2]:
# Load the three CSV inputs; the relative paths resolve against the kernel's working directory.
# Later cells group this frame by 'Topic'/'Name' and count its 'text' column.
mentions_topics_final = pd.read_csv("mentions_topics_final_fixed.csv")
# Transposed in a later cell so that each topic becomes one row.
mentions_topics_final_reps = pd.read_csv("mentions_topics_final_reps_fixed.csv")
# Its 'Unnamed: 0', 'count_x' and 'count_y' columns are renamed to 'Topic', 'n_before' and 'n_after' later.
mentions_topics_dist = pd.read_csv("topic_dist.csv")


### Creating topic name dataframe

In [3]:
# Turn the representative-documents table on its side, discard the first two
# rows of the transposed frame, and expose the former column labels as 'Topic'.
# NOTE(review): the cell overwrites its own input, so re-running it without
# reloading the CSV transposes the already-reshaped frame again.
mentions_topics_final_reps = (
    mentions_topics_final_reps
    .T
    .iloc[2:, :]
    .reset_index()
    .rename(columns={'index': 'Topic'})
)

In [4]:
# Count non-null 'text' rows per (Topic, Name) pair and express each count as
# a percentage of the total across all pairs.
by_topic_count = (
    mentions_topics_final
    .groupby(['Topic', 'Name'])
    .agg(topic_count=('text', 'count'))
    .assign(**{'%_total': lambda counts: counts['topic_count'] / counts['topic_count'].sum() * 100})
    .reset_index()
)

In [5]:
# Give the join key the same integer dtype on both sides before merging.
mentions_topics_final_reps['Topic'] = mentions_topics_final_reps['Topic'].astype(int)
by_topic_count['Topic'] = by_topic_count['Topic'].astype(int)

# Attach the per-topic counts to the representative documents; the inner join
# keeps only topics present in both frames.
# NOTE(review): the document columns appear to carry integer labels (a later
# cell selects column 0), so the string keys '0'/'1'/'2' below probably match
# nothing and only 'Name' is actually renamed — confirm before relying on
# 'Document1'..'Document3'.
df_topics = (
    mentions_topics_final_reps
    .merge(by_topic_count, on='Topic', how='inner')
    .rename(columns={'0': 'Document1', '1': 'Document2', '2': 'Document3', 'Name': 'Top_words'})
)

In [6]:
# Topic id -> short phrase (the "gpt" label set), stored alongside the manual
# labels for comparison. Topic ids without an entry map to NaN.
equiv_gpt = {
    0: "Climate Change Impacts",
    1: "Carbon Accounting in Forest Management",
    2: "Livestock Manure Management",
    3: "Waste Management and Treatment",
    4: "Uncertainty Assessment in Emissions Inventory",
    5: "Citation of IPCC Guidelines Equations",
    6: "Energy Fuel Consumption",
    7: "Global Warming Potential (GWP) Reporting",
    8: "Default Emission Factor Calculation (2006 IPCC Guidelines)",
    9: "National Greenhouse Gas Inventories Guidelines",
    10: "National GHG Inventory Compilation",
    11: "National GHG Inventory Development",
    12: "Default CH4 Emission Factors",
    13: "Greenhouse Gas Emissions by Sector (2017)",
    14: "Tiered Method for Emission Factor Calculation",
    15: "QA/QC Procedures in Inventory Compilation",
    16: "Greenhouse Gas Inventory Composition",
    17: "Refrigeration and Air Conditioning HFC Usage",
    18: "Key Categories Analysis",
    19: "Agriculture, Forestry, and Other Land Use (AFOLU) Sector Analysis",
    20: "Industrial Carbonate Calcination Processes",
    21: "Cement Clinker Production Emissions",
    22: "Quality Control for Activity Data and Emission Factors",
    23: "Estimating GHG Emissions using IPCC Tier 1 Approach",
    24: "GHG Emissions from Iron and Steel Production",
    25: "GHG Emissions Estimation: Reference vs Sectoral Approach",
    26: "GHG Key Category Analysis: IPCC Tier 2 Level",
    27: "Inventory Completeness Assessment and Software Utilization",
    28: "Wetlands Emission Estimation and IPCC Supplement Utilization",
}
df_topics["topic_phrase_gpt"] = df_topics["Topic"].map(equiv_gpt)

In [7]:
# Topic id -> manually written phrase, one distinct label per topic (the three
# national-inventory topics are numbered 1-3). Topic ids without an entry map
# to NaN.
equiv_manual = {
    0: "Climate Change Scenarios and Impacts",
    1: "Emissions from Forest Management",
    2: "Emissions from Livestock Manure Management",
    3: "Emissions from Solid Waste Disposal",
    4: "Uncertainties in Inventory Estimates",
    5: "Citations of 2006 IPCC guidelines",
    6: "Emissions from Fuels & Energy",
    7: "Global Warming Potential Values",
    8: "Emission Factors from 2006 IPCC Guidelines",
    9: "National Greenhouse Gas Inventories 1",
    10: "National Greenhouse Gas Inventories 2",
    11: "National Greenhouse Gas Inventories 3",
    12: "Default values for CO2 and CH4 Emission Factors",
    13: "2017 Greenhouse Gas Emissions",
    14: "Tier methods for Estimating Inventory",
    15: "QA/QC Procedures in Inventory Compilation",
    16: "Greenhouse Gas Inventory Composition",
    # NOTE(review): "Refridgeration"/"Hydroflourocarbons" are misspelled; left
    # as-is because the label is a runtime value other code may match on.
    17: "Emissions from Refridgeration and Hydroflourocarbons (HFCs)",
    18: "Key Categories Analysis",
    19: "Emissions from Agriculture, Forestry, and Other Land Use (AFOLU)",
    20: "Emissions from Industrial Carbonate Calcination Processes",
    21: "Emissions from Cement Clinker Production",
    22: "Activity Data and Emission Factors",
    23: "Tier approaches for Estimating Inventory",
    24: "Emissions from Iron and Steel Production",
    25: "GHG Emissions Estimation: Reference vs Sectoral Approach",
    26: "Key Category Level Analysis",
    27: "Inventory Completeness Assessment",
    28: "Emissions from Wetlands Supplement",
}
df_topics["topic_phrase_manual"] = df_topics["Topic"].map(equiv_manual)

In [8]:
# Topic id -> final manual phrase. Several topic ids deliberately share one
# phrase here (9/10/11, 14/23, 18/26), so mapping through this table merges
# those topics under a single label. Topic ids without an entry map to NaN.
equiv_manual_final = {
    0: "Climate Change Scenarios and Impacts",
    1: "Emissions from Forest Management",
    2: "Emissions from Livestock Manure Management",
    3: "Emissions from Solid Waste Disposal",
    4: "Uncertainties in Inventory Estimates",
    5: "Citations of 2006 IPCC guidelines",
    6: "Emissions from Fuels & Energy",
    7: "Global Warming Potential Values",
    8: "Emission Factors from 2006 IPCC Guidelines",
    9: "National Greenhouse Gas Inventories",
    10: "National Greenhouse Gas Inventories",
    11: "National Greenhouse Gas Inventories",
    12: "Default values for CO2 and CH4 Emission Factors",
    13: "2017 Greenhouse Gas Emissions",
    14: "Tier methods/approaches for Estimating Inventory",
    15: "QA/QC Procedures in Inventory Compilation",
    16: "Greenhouse Gas Inventory Composition",
    # NOTE(review): "Refridgeration"/"Hydroflourocarbons" are misspelled; left
    # as-is because the label is a runtime value other code may match on.
    17: "Emissions from Refridgeration and Hydroflourocarbons (HFCs)",
    18: "Key Categories Analysis",
    19: "Emissions from Agriculture, Forestry, and Other Land Use (AFOLU)",
    20: "Emissions from Industrial Carbonate Calcination Processes",
    21: "Emissions from Cement Clinker Production",
    22: "Activity Data and Emission Factors",
    23: "Tier methods/approaches for Estimating Inventory",
    24: "Emissions from Iron and Steel Production",
    25: "GHG Emissions Estimation: Reference vs Sectoral Approach",
    26: "Key Categories Analysis",
    27: "Inventory Completeness Assessment",
    28: "Emissions from Wetlands Supplement",
}
df_topics["topic_phrase_manual_final"] = df_topics["Topic"].map(equiv_manual_final)

In [9]:
# Group label -> topic ids belonging to it. Written group-first so the
# membership of each group can be read at a glance.
# NOTE(review): "Emisions" is misspelled in the general-methods label; it is
# kept because the treemap's color_discrete_map keys on the identical string.
_topic_ids_by_group = {
    "Climate Change Scenarios and Impacts": [0],
    "GHG Emissions - Agriculture & LULUCF": [1, 2, 19, 28],
    "GHG Emissions - Waste": [3],
    "GHG Emissions - Energy": [6],
    "GHG Emissions - IPPU": [17, 20, 21, 24],
    "GHG Emisions Measurement & Methods (General)": [
        4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 22, 23, 25, 26, 27,
    ],
}

# Invert to the topic id -> group label mapping, ordered by topic id (0..28).
equiv_manual_final_groups = dict(sorted(
    (topic_id, group_label)
    for group_label, topic_ids in _topic_ids_by_group.items()
    for topic_id in topic_ids
))
df_topics["topic_phrase_manual_final_group"] = df_topics["Topic"].map(equiv_manual_final_groups)

In [10]:
# Convert the topic label in 'Top_words' into a comma-separated word list.
# The label presumably has the form "<topic id>_<word>_<word>_..." (e.g.
# "0_global_scenarios_report_temperature") — confirm against the 'Name' column.
# The leading id is stripped with an anchored pattern instead of splitting on
# the first comma, which (a) avoids the stray leading space the split left in
# every value and (b) makes the cell safe to re-run: a value that has already
# been cleaned has no "<id>_" prefix and no underscores, so it is left as-is
# rather than losing another word.
df_topics['Top_words'] = (
    df_topics['Top_words']
    .str.replace(r'^-?\d+_', '', regex=True)
    .str.replace('_', ', ', regex=False)
)

In [11]:
# Name the distribution table's key and count columns, then right-join it onto
# the topic table so every topic listed in the distribution file is kept;
# topics with no match in df_topics come through with NaN in the other columns.
mentions_topics_dist = mentions_topics_dist.rename(
    columns={'Unnamed: 0': 'Topic', 'count_x': 'n_before', 'count_y': 'n_after'}
)

df_topics = pd.merge(df_topics, mentions_topics_dist, on='Topic', how='right')

In [13]:
# Change in per-topic count between the two columns of the distribution table.
df_topics['n_diff'] = df_topics['n_after'].sub(df_topics['n_before'])

# Label-comparison table: the three label variants, the group, the top words
# and column 0 (presumably the first representative document — confirm).
# Alternative selections kept for reference:
# table = df_topics[['topic_phrase_manual_final','topic_phrase_manual','topic_phrase_gpt','topic_phrase_manual_final_group','Top_words',0,1,2]]
# table = df_topics[['topic_phrase_manual_final','n_before','n_after','n_diff']]
table = df_topics.loc[:, [
    'topic_phrase_gpt',
    'topic_phrase_manual',
    'topic_phrase_manual_final',
    'topic_phrase_manual_final_group',
    'Top_words',
    0,
]]

table.to_latex()

'\\begin{tabular}{lllllll}\n\\toprule\n & topic_phrase_gpt & topic_phrase_manual & topic_phrase_manual_final & topic_phrase_manual_final_group & Top_words & 0 \\\\\n\\midrule\n0 & NaN & NaN & NaN & NaN & NaN & NaN \\\\\n1 & Climate Change Impacts & Climate Change Scenarios and Impacts & Climate Change Scenarios and Impacts & Climate Change Scenarios and Impacts &  global, scenarios, report, temperature & Under its sixth assessment cycle, the IPCC released a series of special reports on global warming and its impacts in 2018 and 2019. According to these reports, human activities have been responsible for approximately 1.0°C of global warming since pre-industrial times, and temperature rise is likely to breach 1.5°C between 2030 and 2052 at current rates of warming. More intense and frequent climate and weather extremes have been observed. Warming from historical anthropogenic emissions since the pre-industrial period will continue to drive long-term shifts in the climate system such as 

In [14]:
# Display the label-comparison table as the cell's rich output.
table

Unnamed: 0,topic_phrase_gpt,topic_phrase_manual,topic_phrase_manual_final,topic_phrase_manual_final_group,Top_words,0
0,,,,,,
1,Climate Change Impacts,Climate Change Scenarios and Impacts,Climate Change Scenarios and Impacts,Climate Change Scenarios and Impacts,"global, scenarios, report, temperature","Under its sixth assessment cycle, the IPCC rel..."
2,Carbon Accounting in Forest Management,Emissions from Forest Management,Emissions from Forest Management,GHG Emissions - Agriculture & LULUCF,"land, forest, biomass, carbon",Methodologies applied in this national invento...
3,Livestock Manure Management,Emissions from Livestock Manure Management,Emissions from Livestock Manure Management,GHG Emissions - Agriculture & LULUCF,"manure, management, livestock, cattle",The methodology used to estimate the emission ...
4,Waste Management and Treatment,Emissions from Solid Waste Disposal,Emissions from Solid Waste Disposal,GHG Emissions - Waste,"waste, wastewater, solid, treatment",Based on the country context and the data avai...
5,Uncertainty Assessment in Emissions Inventory,Uncertainties in Inventory Estimates,Uncertainties in Inventory Estimates,GHG Emisions Measurement & Methods (General),"uncertainty, uncertainties, data, activity",The uncertainty of the LULUCF sector activity ...
6,Citation of IPCC Guidelines Equations,Citations of 2006 IPCC guidelines,Citations of 2006 IPCC guidelines,GHG Emisions Measurement & Methods (General),"2006, chapter, volume, page",Equation 3.2 (chapter 3 of the 2006 IPCC guide...
7,Energy Fuel Consumption,Emissions from Fuels & Energy,Emissions from Fuels & Energy,GHG Emissions - Energy,"fuels, energy, fuel, consumption",According to the guidelines of the 2006 IPCC G...
8,Global Warming Potential (GWP) Reporting,Global Warming Potential Values,Global Warming Potential Values,GHG Emisions Measurement & Methods (General),"gwp, warming, report, global","The estimated CH4, N₂O, HFCs and SF emissions ..."
9,Default Emission Factor Calculation (2006 IPCC...,Emission Factors from 2006 IPCC Guidelines,Emission Factors from 2006 IPCC Guidelines,GHG Emisions Measurement & Methods (General),"factors, default, 2006, calculation",Emission factors are used by default from the ...


### Map topic names to main dataframe

In [None]:
# Map topic phrases back to main frame: left-join the final phrase and its
# group onto every row by 'Topic', then give both columns display names.
mentions_topics_final = (
    mentions_topics_final
    .merge(
        df_topics[['Topic', 'topic_phrase_manual_final', 'topic_phrase_manual_final_group']],
        on='Topic',
        how='left',
    )
    .rename(columns={
        'topic_phrase_manual_final': 'Topic Phrase',
        'topic_phrase_manual_final_group': 'Topic Phrase Group',
    })
)

#mentions_topics_final
len(mentions_topics_final)

In [None]:
# Number of distinct topic phrases in the main frame (nunique ignores missing values by default).
mentions_topics_final['Topic Phrase'].nunique()

### Analysis - By Topic

In [None]:
# Share of rows per topic phrase, largest first, as a percentage of ALL rows
# in the main frame (rows without a phrase still count in the denominator),
# rounded to one decimal place.
by_topic = (
    mentions_topics_final
    .groupby('Topic Phrase')['text']
    .size()
    .sort_values(ascending=False)
    .mul(100.0)
    .div(len(mentions_topics_final))
    .round(1)
)
print(by_topic.to_latex())

In [None]:
# Take the ten largest topic phrases. by_topic is already sorted in
# descending order, so the first ten entries are the top ten.
# .head(10) replaces `by_topic[0:10,]`, whose trailing comma made the key a
# 1-tuple and relied on pandas unpacking it into a plain positional slice.
top10topics_df = by_topic.head(10).reset_index()
# First column after reset_index() is the former index, i.e. the phrase names.
top10topics = top10topics_df.iloc[:, 0].tolist()

In [None]:
# Keep only rows whose phrase is one of the top-10 topics, then report what
# percentage of all rows that subset covers.
mentions_topics_final_top10 = mentions_topics_final.loc[
    mentions_topics_final['Topic Phrase'].isin(top10topics)
]
len(mentions_topics_final_top10) / len(mentions_topics_final) * 100.0

In [None]:
# Restrict the top-10 subset to years after 2006.
mentions_topics_final_top10 = mentions_topics_final_top10.loc[
    mentions_topics_final_top10['year'] > 2006
]

# Denominator: rows per year across ALL topics in the main frame.
total_counts_per_year = mentions_topics_final.groupby('year')['text'].size()

# Rows per (year, phrase) as a percentage of that year's total; the division
# aligns the two indexes on their shared 'year' level.
share_of_total = (
    100 * mentions_topics_final_top10.groupby(['year', 'Topic Phrase'])['text'].size()
    / total_counts_per_year
)

# One line per topic phrase, years along the x axis.
share_of_total.unstack().plot.line(
    figsize=(15, 6),
    xlabel='Year',
    ylabel='Share of Rows in each Year(%)',
)

#### Top 10 topics by Continent

In [None]:
# Exclude rows whose Continent is the literal string "null" from both frames.
mentions_topics_final = mentions_topics_final.loc[mentions_topics_final['Continent'] != "null"]
mentions_topics_final_top10 = mentions_topics_final_top10.loc[
    mentions_topics_final_top10['Continent'] != "null"
]

# Denominator: rows per continent (the main frame was filtered just above).
total_counts_per_cont = mentions_topics_final.groupby('Continent')['text'].size()

# Rows per (phrase, continent) as a percentage of that continent's total.
share_of_total = (
    100 * mentions_topics_final_top10.groupby(['Topic Phrase', 'Continent'])['text'].size()
    / total_counts_per_cont
)

# NOTE(review): unstack() moves the last index level to the columns, so the
# x axis likely shows 'Topic Phrase' with one bar per continent, while the
# label says 'Continent' — confirm which layout is intended.
share_of_total.unstack().plot.bar(
    figsize=(15, 6),
    stacked=False,
    legend='reverse',
    xlabel='Continent',
    ylabel='Share of Total Rows from given Continent (%)',
)

### Analysis - By Topic Group


In [None]:
# Share of all rows per (group, phrase) pair, in percent, one decimal place.
by_topic_groups2 = (
    mentions_topics_final
    .groupby(['Topic Phrase Group', 'Topic Phrase'])['text']
    .size()
    .mul(100.0)
    .div(len(mentions_topics_final))
    .round(1)
)
by_topic_groups2.to_latex()

In [None]:
# Flatten the (group, phrase) shares into a plain three-column frame with the
# value column renamed to 'topic share'.
df_topicsresults = (
    by_topic_groups2
    .ffill(limit=None)
    .to_frame()
    .reset_index()
    .rename(columns={'text': 'topic share'})
)
df_topicsresults
# TODO: Need to add a total line??

In [None]:
# Treemap of topic shares: groups form the outer tiles, phrases the inner
# tiles, and tile area follows 'topic share'.
# Colors are assigned per group. The keys must match the group labels exactly,
# including the "Emisions" spelling used when the group labels were defined.
fig = px.treemap(df_topicsresults, path=['Topic Phrase Group', 'Topic Phrase'], values='topic share',
                 color='Topic Phrase Group',
                 color_discrete_map={'Climate Change Scenarios and Impacts':'#1F77B4',
                                      'GHG Emisions Measurement & Methods (General)':'#FF7F0E',
                                     'GHG Emissions - Agriculture & LULUCF':'#2CA02C',
                                     'GHG Emissions - Energy':'#D62728',
                                     'GHG Emissions - IPPU':'#9467BD',
                                      'GHG Emissions - Waste':'#8C564B'})
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))

# Manually adjust figure labels: any label of four or more words gets an HTML
# line break inserted after its third word.
for i in range(0,len(fig.data[0]['labels'])):
    label = fig.data[0]['labels'][i]
    if len(label.split())-1 >= 3:
        new_label = ' '.join(label.split()[:3]) + " <br>" + ' '.join(label.split()[3:len(label.split())])
        fig.data[0]['labels'][i] = new_label
# Restore two labels to their unbroken text by position.
# NOTE(review): indices 11 and 12 are hard-coded and presumably point at these
# two labels in the current data — re-check them if the topics or their order change.
fig.data[0]['labels'][11] = 'Climate Change Scenarios and Impacts'
fig.data[0]['labels'][12] = 'Citations of 2006 IPCC guidelines'

# Enlarge the tile text, then render the figure.
fig.data[0]['textfont']['size'] = 20
fig.show()
#fig.write_image("topic_treemap.png") 


In [None]:
# Share of all rows per topic group, largest first, in percent with one
# decimal place.
by_topic_groups = (
    mentions_topics_final
    .groupby('Topic Phrase Group')['text']
    .size()
    .sort_values(ascending=False)
    .mul(100.0)
    .div(len(mentions_topics_final))
    .round(1)
)
print(by_topic_groups.to_latex())

In [None]:
# Rows per year, and rows per (year, group).
total_counts = mentions_topics_final.groupby('year')['text'].size()
topics_per_year = mentions_topics_final.groupby(['year', 'Topic Phrase Group'])['text'].size()
# Percentage share is computed for reference; the chart below plots raw counts.
share_of_total = 100.0 * topics_per_year / total_counts

# Stacked bars: one bar per year, one segment per topic group.
fig = topics_per_year.unstack().plot.bar(
    figsize=(15, 6),
    stacked=True,
    xlabel='Year',
    ylabel='Number of Rows',
)
#fig.invert_yaxis()
# Place the legend outside the plot area, to the right.
fig.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
# Rows per document type ('types0'), and rows per (document type, group).
total_counts = mentions_topics_final.groupby('types0')['text'].size()
topics_per_type = mentions_topics_final.groupby(['types0', 'Topic Phrase Group'])['text'].size()
# Percentage share is computed for reference; the chart below plots raw counts.
share_of_total = 100.0 * topics_per_type / total_counts

# Horizontal stacked bars: one bar per document type, one segment per group.
fig = topics_per_type.unstack().plot.barh(
    figsize=(15, 6),
    stacked=True,
    xlabel='Number of Rows',
    ylabel='Document Type',
)
# Flip the y axis so the first document type is drawn at the top.
fig.invert_yaxis()
# Place the legend outside the plot area, to the right.
fig.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
#fig.savefig('topic_doc2.png', dpi=300)


In [None]:
# Percentage of all rows contributed by each document type.
total_counts = mentions_topics_final.groupby('types0')['text'].size()
# Despite its name, this holds the total number of rows in the main frame.
topics_per_type = len(mentions_topics_final)
share_of_total = total_counts.mul(100.0).div(topics_per_type)
share_of_total

In [None]:
g20_list = G20

# G20 countries - National Comm: rows whose ISO code is in G20 and whose
# document type is 'National Communication'.
subset = mentions_topics_final.loc[
    mentions_topics_final['geography_iso'].isin(G20)
    & (mentions_topics_final['types0'] == 'National Communication')
]

# Per-country share of each topic group, in percent of that country's rows.
total_counts = subset.groupby('geography_iso')['text'].size()
topics_per_country = subset.groupby(['geography_iso', 'Topic Phrase Group'])['text'].size()
share_of_total = 100.0 * topics_per_country / total_counts

# Stacked bars: one bar per country, one segment per topic group.
fig = share_of_total.unstack().plot.bar(
    figsize=(15, 6),
    stacked=True,
    xlabel='G20 country',
    ylabel='Share of Rows (%)',
)
# NOTE(review): inverting the y axis on a vertical bar chart puts 0 at the
# top — confirm this is intended.
fig.invert_yaxis()
# Place the legend outside the plot area, to the right.
fig.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
# G20 countries - National Inventory Report: rows whose ISO code is in G20 and
# whose document type is 'National Inventory Report'.
subset = mentions_topics_final[(mentions_topics_final['geography_iso'].isin(G20)) & (mentions_topics_final['types0']=='National Inventory Report')]

# Per-country share of each topic group, in percent of that country's rows.
total_counts = subset.groupby('geography_iso')['text'].size()
topics_per_country = subset.groupby(['geography_iso','Topic Phrase Group'])['text'].size()
share_of_total = 100.0*topics_per_country / total_counts

# Stacked vertical bars: countries run along the x axis and the percentage
# share along the y axis. The axis labels now say so; they previously read
# 'Share of Rows (%)' / 'Document Type', carried over from the horizontal
# document-type chart.
fig = share_of_total.unstack().plot(figsize=(15,6),
                              kind='bar',
                              stacked =True,
                              xlabel='G20 country',
                              ylabel='Share of Rows (%)')
# NOTE(review): inverting the y axis on a vertical bar chart puts 0 at the
# top — confirm this is intended.
fig.invert_yaxis()
# Place the legend outside the plot area, to the right.
fig.legend(loc='center left',bbox_to_anchor=(1.0, 0.5))

#### Visualise first topic

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Build one long string from the 'text' of every row in the chosen topic.
# Alternative selections (swap in to visualise a different group or phrase):
#text = " ".join(txt for txt in mentions_topics_final[mentions_topics_final['Topic Phrase Group']=='GHG Emisions Measurement & Methods (General)']['text'])
#text = " ".join(txt for txt in mentions_topics_final[mentions_topics_final['Topic Phrase Group']=='GHG Emissions - Agriculture & LULUCF']['text'])
#text = " ".join(txt for txt in mentions_topics_final[mentions_topics_final['Topic Phrase Group']=='Climate Change Scenarios and Impacts']['text'])

#text = " ".join(txt for txt in mentions_topics_final[mentions_topics_final['Topic Phrase']=='Emissions from Forest Management']['text'])
text = " ".join(txt for txt in mentions_topics_final[mentions_topics_final['Topic Phrase']=='National Greenhouse Gas Inventories']['text'])

# Create stopword list:
stopwords = set(STOPWORDS)
# Same stopword list that is used for topic modelling
stopwords.update(['et','al','institute','university','climate','change','box','figure','table','ipcc','emission','emissions','guidelines','C'])


# Circular mask on a 300x300 grid: True outside a circle of radius 130 centred
# at (150, 150). The definition was commented out while the next line still
# used `mask`, which raised NameError on a fresh kernel.
x, y = np.ogrid[:300, :300]

mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
# Scale to 0/255; WordCloud treats the 255 (white) cells as masked out, so the
# words are drawn inside the circle.
mask = 255 * mask.astype(int)

# lower max_font_size, change the maximum number of word and lighten the background:
wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=100, background_color="white",mask=mask).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
import re

def countOccurrences(string, w):
    """Count case-insensitive whole-word occurrences of ``w`` in ``string``.

    Parameters
    ----------
    string : object
        Text to search. It is converted with ``str()`` first, so non-string
        values (numbers, NaN, None) are searched by their string form.
    w : str
        Word or phrase to look for. It is matched literally: regex
        metacharacters such as ``.``, ``+`` or ``(`` have no special meaning.

    Returns
    -------
    int
        Number of non-overlapping matches that are not directly preceded or
        followed by a word character. An empty ``w`` yields 0.
    """
    if not w:
        # An empty search term has no meaningful occurrences.
        return 0

    # Escape the term so it cannot be misread as a pattern (e.g. "c++" used to
    # raise re.error and "a.b" matched "axb"). Lookarounds are used instead of
    # \b so terms that start or end with punctuation still match; for plain
    # alphanumeric words they behave exactly like \b.
    regexPattern = r"(?<!\w)" + re.escape(w) + r"(?!\w)"

    return sum(1 for _ in re.finditer(regexPattern, str(string), re.IGNORECASE))