## Converting Quotes into Labeled Data for Binary Classification Finetuning (climate/non-climate)

In [1]:
import pandas as pd
from datasets import load_dataset

### Loading the Data

Loading and converting the Gigawords and converting the parliament data into Pandas..

In [2]:
dataset = load_dataset("DDSC/partial-danish-gigaword-no-twitter", split='train[0%:30%]')
ds_ft = dataset.filter(lambda row: row['source']==('ft'))
df_ft = ds_ft.to_pandas()

Using custom data configuration DDSC--partial-danish-gigaword-no-twitter-c4bf5dfabf58d25e
Reusing dataset parquet (/home/gugy/.cache/huggingface/datasets/parquet/DDSC--partial-danish-gigaword-no-twitter-c4bf5dfabf58d25e/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/202 [00:00<?, ?ba/s]

Some Regular Expressions to find the climate related material

In [3]:
import re

#Some Danish climate-related regex:
#klima = '(K|k)limaet)'
klima_forandringer = '(K|k)lima(forandringer)(ne)?|global(e)? opvarmning|(K|k)limakrise(n)|klimabelastende?'
co2 = 'CO 2|CO2|CO_2|Co_2|CO2|Kuldioxid|drivhusgas(udledning(en)?)?|kulstofsudledningen'
målsætning = 'grøn(ne)? omstilling|CO 2-reduction|CO 2-neutral'
politik = 'Paris-aftalen|COP\d{1,2}|CO 2-afgift|CO 2-skat'
løsninger = 'klimaløsning|CO 2-nedsættende'

complete_klima_regex = re.compile("(%s|%s|%s|%s)" % (klima_forandringer, co2, målsætning, politik))

complete_klima_regex

re.compile(r'((K|k)lima(forandringer)(ne)?|global(e)? opvarmning|(K|k)limakrise(n)|klimabelastende?|CO 2|CO2|CO_2|Co_2|CO2|Kuldioxid|drivhusgas(udledning(en)?)?|kulstofsudledningen|grøn(ne)? omstilling|CO 2-reduction|CO 2-neutral|Paris-aftalen|COP\d{1,2}|CO 2-afgift|CO 2-skat)',
re.UNICODE)

In [4]:
test = "Vi har fundet flere CO 2-kvoter her end vi havde regnet med. Det her er et klart tegn på Klimaforandringernes virkning.\
        Klimaforandringerne. forandringerne har været slemme. Men det grønne omstilling kan hjælpe os på vej. grøn omstilling \
        er vejen frem from at få nedsat CO  2-udledningen. CO 2-afgiften er slem"
        
found = re.findall(complete_klima_regex, test)

In [5]:
climate_docs = []

climate_quotes = {'text': [],
      'speaker_id': [],
      'doc_id': [],
      'year': [],
       }

In [6]:
for index, row in df_ft.iterrows():

        if len(re.findall(complete_klima_regex, row.text)) > 50:
            doc_id = row.doc_id
            climate_docs.append(doc_id)
            
            quotes = row.text.split("\n")    
            
            for quote in quotes:
                
                if re.search(complete_klima_regex, quote):
                    climate_quotes['text'].append(quote.split(":")[1].strip())
                    climate_quotes['speaker_id'].append(quote.split(":")[0].strip())
                    climate_quotes['doc_id'].append(doc_id)
                    climate_quotes['year'].append(re.search(r'\d{4}', doc_id)[0])
            
            print(doc_id)

ft_20171M38
ft_20181M65
ft_20161M3
ft_20101M59
ft_20191M32
ft_20201M3
ft_20181M92
ft_20151M53
ft_20181M3
ft_20091M61
ft_20191M50
ft_20191M3
ft_20131M97
ft_20171M104
ft_20191M146
ft_20161M101
ft_20191M8
ft_20201M66
ft_20091M49
ft_20161M14
ft_20191M110
ft_20161M48
ft_20131M69
ft_20181M39
ft_20191M64
ft_20171M3
ft_20201M78
ft_20161M6
ft_20091M93
ft_20161M81
ft_20201M52
ft_20161M24
ft_20101M45
ft_20131M95
ft_20191M134
ft_20161M95
ft_20191M39
ft_20151M26
ft_20101M94
ft_20161M112
ft_20121M97
ft_20201M21
ft_20191M128
ft_20151M33
ft_20171M48
ft_20201M41
ft_20191M44
ft_20201M42
ft_20201M79
ft_20171M108
ft_20181M53
ft_20181M81
ft_20091M43
ft_20151M3
ft_20201M22
ft_20151M93
ft_20161M105
ft_20131M41
ft_20091M68
ft_20141M47
ft_20141M62
ft_20141M16
ft_20091M13
ft_20191M57


In [7]:
full_quotes_climate_sample = pd.DataFrame(climate_quotes)

len(full_quotes_climate_sample)

full_quotes_climate_sample


Unnamed: 0,text,speaker_id,doc_id,year
0,"Jeg er i hvert fald utrolig glad for, at vi me...",TALER 96,ft_20171M38,2017
1,Vi vil jo især gerne – ud over selvfølgelig he...,TALER 96,ft_20171M38,2017
2,"Det kunne også være, at det er, fordi man tænk...",TALER 23,ft_20171M38,2017
3,"Vi tager det sådan set meget alvorligt, at vi ...",TALER 23,ft_20171M38,2017
4,"Man kunne jo godt ønske sig, at man havde mere...",TALER 137,ft_20171M38,2017
...,...,...,...,...
2492,"Det anerkender jeg simpelt hen ikke er måden, ...",TALER 515,ft_20191M57,2019
2493,"Jeg ser mig lige omkring for at se, om der er ...",TALER 482,ft_20191M57,2019
2494,"Jeg tror, vi skal være meget forsigtige som po...",TALER 482,ft_20191M57,2019
2495,Tak for det – ligefrem højtærede. Jeg er i øvr...,TALER 493,ft_20191M57,2019


In [None]:
import random

#Using seed to ensure full reproducibilty of the data (42 for original run)
random.seed(20)

quotes_random = {'text': [],
      'speaker_id': [],
      'doc_id': [],
      'year': [],
       }

min_characters = 100

while len(quotes_random['text']) < 5000:
    random_meeting = df_ft.iloc[random.randint(0,len(df_ft))]
    text = random_meeting['text'].split("\n")
    facilitator = re.search("TALER \d+:", text[0])
    #print(facilitator[0])


    quotes_per_doc = 30
    c = 0
    while c < quotes_per_doc:
        quote = text[random.randint(0,len(text)-1)]
        if (not facilitator[0] in quote) & (len(quote) > min_characters):
            speaker_id, quote = quote.split(": ", 1)[0:2]
            doc_id = random_meeting['doc_id']
            
            quotes_random['text'].append(quote)
            quotes_random['speaker_id'].append(speaker_id)
            quotes_random['doc_id'].append(doc_id)
            quotes_random['year'].append(re.search(r'\d{4}', doc_id)[0])
            
            c += 1
            # print(doc_id), print(speaker_id), print(text)
            
    print(len(quotes_random['speaker_id']))


In [None]:
random_quotes_sample = pd.DataFrame(quotes_random)

random_quotes_sample

In [54]:
from nltk import sent_tokenize 

def split_by_sentence(df, num_sents):

    sentences = {'text': [],
      'speaker_id': [],
      'doc_id': [],
      'year': [],
       }

    for index, row in df.iterrows():
        sent_per_quote = sent_tokenize(row['text'], "danish")
        
        for i in range(len(sent_per_quote)):
          quote = ""
          while i % num_sents != 0:
            quote =+ sent_per_quote[i]+" "
          
          sentences['text'].append(quote)
          sentences['speaker_id'].append(row.speaker_id)
          sentences['doc_id'].append(row.doc_id)
          sentences['year'].append(row.year)
          
        # for sent in sent_per_quote:
        #   sentences['text'].append(sent)
        #   sentences['speaker_id'].append(row.speaker_id)
        #   sentences['doc_id'].append(row.doc_id)
        #   sentences['year'].append(row.year)
    
    return sentences

In [55]:
test = split_by_sentence(full_quotes_climate_sample, 3)

test

TypeError: bad operand type for unary +: 'str'

In [16]:
from nltk import sent_tokenize

sent_breakdown = {'text': [],
      'speaker_id': [],
      'doc_id': [],
      'year': [],
       }

#Change this to get a different quote sample (random vs. cliamte regexed)
full_quotes = full_quotes_climate_sample

for index, row in full_quotes.iterrows():
    
    full_stops = re.compile("\.|\?|\!")
    sents = re.split(full_stops, row.text)
    
    number_of_sentences = 3
    
    previous_index = 0
    for i in range(0,len(sents), number_of_sentences):
        sent_breakdown['text'].append(".".join(sents[previous_index:i]).strip()+".")
        sent_breakdown['speaker_id'].append(row.speaker_id)
        sent_breakdown['doc_id'].append(row.doc_id)
        sent_breakdown['year'].append(row.year)
        
        previous_index = i
        if i + 2 >= len(sents):
            sent_breakdown['text'][-1] += ".".join(sents[i:-1])
        elif i + 3 >= len(sents):
            sent_breakdown['text'].append(".".join(sents[i:-1]).strip()+".")
            sent_breakdown['speaker_id'].append(row.speaker_id)
            sent_breakdown['doc_id'].append(row.doc_id)
            sent_breakdown['year'].append(row.year)
    

In [None]:
from nltk import sent_tokenize

sent_breakdown = {'text': [],
      'speaker_id': [],
      'doc_id': [],
      'year': [],
       }

#Change this to get a different quote sample (random vs. cliamte regexed)
full_quotes = full_quotes_climate_sample

for index, row in full_quotes.iterrows():
   
    sents =
     
    number_of_sentences = 3
    
    previous_index = 0
    for i in range(0,len(sents), number_of_sentences):
        sent_breakdown['text'].append(".".join(sents[previous_index:i]).strip()+".")
        sent_breakdown['speaker_id'].append(row.speaker_id)
        sent_breakdown['doc_id'].append(row.doc_id)
        sent_breakdown['year'].append(row.year)
        
        previous_index = i
        if i + 2 >= len(sents):
            sent_breakdown['text'][-1] += ".".join(sents[i:-1])
        elif i + 3 >= len(sents):
            sent_breakdown['text'].append(".".join(sents[i:-1]).strip()+".")
            sent_breakdown['speaker_id'].append(row.speaker_id)
            sent_breakdown['doc_id'].append(row.doc_id)
            sent_breakdown['year'].append(row.year)
    

In [None]:
len(sent_breakdown['text'])

len(sent_breakdown['text'])

In [44]:
three_sentence = pd.DataFrame(sent_breakdown)

three_sentence = three_sentence[three_sentence['text'] != '.']
three_sentence.reset_index().drop(columns='index')

#added climate sample
climate_sample2 = three_sentence.sample(n=8000, random_state=42)

climate_sample2.reset_index(drop=True, inplace=True)

climate_sample2

Unnamed: 0,text,speaker_id,doc_id,year
0,"Nej, det mener vi ikke. Men man bruger i dag, ...",TALER 179,ft_20191M3,2019
1,"Målet for vedvarende energi er et mål, som oft...",TALER 75,ft_20101M45,2010
2,"000 statslige arbejdspladser, som regeringen a...",TALER 153,ft_20161M112,2016
3,"Andre lande kigger til os og ser på, hvordan v...",TALER 73,ft_20121M97,2012
4,Regering planlægger derfor i 2015 at fortsætte...,TALER 540,ft_20141M47,2014
...,...,...,...,...
7995,Det er ikke noget problem at lave en meget amb...,TALER 73,ft_20091M43,2009
7996,"Ser ordføreren ikke, at der er et problem i de...",TALER 483,ft_20191M32,2019
7997,"Der er heller ikke i sig selv tiltag, som kan ...",TALER 317,ft_20191M3,2019
7998,I de tilfælde har man jo brugt pisken og færre...,TALER 437,ft_20151M26,2015


In [64]:
climate_sample2.to_pickle("climate_sample_v2_270322.pkl")

#pd.to_pickle(three_sentence, "3sentence_climate_regex_quotes")

#pd.to_pickle(three_sentence, "3sentence_random_quotes")

### Merging the two dataframes

Then we merge the two dfs to create the final dataset that we intend to label and use for fine-tune our binary classifier algorithm.


In [None]:
random = pd.read_pickle("3sentence_random_quotes")
climate = pd.read_pickle("3sentence_climate_regex_quotes")

In [None]:
frames = [climate[0:500], random[0:300]]

climate_and_random = pd.concat(frames)

In [None]:
climate_and_random = climate_and_random.reset_index(drop=True)

In [None]:
climate_and_random.sample(frac=1).reset_index(drop=True).to_csv("800_3sent_unlabeled.csv", encoding='UTF-8')

### Adding more data --> Expanding the dataset for the training data of KlimaBERT

Below some more positives are labeled. From our first approach we would get a larger sample of negatives. This adds 180 positives (climate related quotes to our sample)

In [160]:

for i in range(2190,2200):
    print(i)
    print(climate_sample2['text'][i])
    print("")


2190
Så hvis det er sådan, at De Konservative har lidt højere ambitioner end regeringen, så er vi i Enhedslisten meget åbne over for en god dialog om, hvordan det kan udmønte sig i konkret handling. Og hvis vi skulle komme frem til noget om, at vi skal yde en mere aktiv indsats i EU for at få flere lande til at gå foran, så synes jeg, det kunne være spændende. Ordføreren fremhævede jo selv, at godt nok var der lavet nogle aftaler om mål på 1,5 graders global opvarmning, men at det, der er meldt ind, jo er noget, der ender med, at det nok bliver sådan 4-5 grader.

2191
Og ved at Alternativet går med, kan der blive et flertal for det. Jeg vil da godt høre, om det ikke ville være mere rimeligt, at Dansk Folkeparti stemte for det her forslag, når nu ordføreren siger, at man sådan set er kommet frem til, at vi når målsætningen om 40 procent CO 2 -reduktion. Så er det vel ikke så farligt, selv om De Konservative er med.

2192
Til allersidst mener vi også, der er behov for, at man kigger på e

In [161]:
index_lst = [1, 3, 4, 5, 6, 15, 18, 20, 21, 23, 25, 28, 30, 33, 35, 37, 39, 41, 46, 47, 49, 50, 52, 53, 56, 61, 64, 71, 72, 73, 74, 75, 82, 84, 86, 87, 91, 92, 109, 110, 112, 113, 116, 132, 134, 135, 137, 138, 139,
             140, 143, 144, 145, 151, 156, 159, 162, 171, 172, 185, 186, 203, 209, 201, 217, 219, 220, 227, 235, 236, 241, 264, 266, 285, 286, 287, 289, 303, 304, 306, 313, 314, 315, 316, 319, 320, 323, 325, 339, 351,
             357, 360, 362, 363, 369, 371, 399, 403, 406, 410, 411, 413, 425, 429, 432, 434, 437, 439, 441, 443, 445, 446, 450, 451, 453, 459, 467, 467, 472, 473, 475, 481, 489, 494, 500, 503, 506, 507, 509, 522, 527,
             532, 544, 551, 552, 555, 563, 580, 581, 582, 597, 606, 609, 610, 617, 621, 2040, 2047, 2049, 2052, 2056, 2059, 2064, 2065, 2066, 2067, 2074, 2077, 2079, 2083, 2085, 2089, 2093, 2094, 2095, 2096, 2097, 2100,
             2102, 2103, 2122, 2123, 2123, 2126, 2133, 2135, 2137, 2139, 2151, 2172, 2189, 2196]

len(index_lst)

182

In [70]:
climate_sample2['text'][119]

'Vi kan skabe vækst i den velfærd, der kommer os alle sammen til gode, i stedet for at sænke skatten for dem, der har mest i forvejen. Vi har alle muligheder for at skabe et samfund, der i højere grad er rigt på den måde, vi behandler hinanden på, rigt på de muligheder, vi giver til hinanden og til vores børn. Vi skal fordele ressourcerne mere retfærdigt og smartere, ikke mindst.'

In [162]:
for i in index_lst:
    climate_sample2.at[i, 'label'] = 1

climate_sample_positives = climate_sample2[climate_sample2.label == 1.0]
climate_sample_positives['label'] =climate_sample_positives['label'].apply(lambda x: int(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [165]:
climate_sample_positives[['text', 'label']].to_pickle("180climate_related_270322.pkl")