In [1]:
'''basics'''
import pandas as pd
import transformers
print('transformers: {}'.format(transformers.__version__))

'''visualisations'''
import plotly.express as px

'''set path'''
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('..', 'src')))

'''import helper functions'''
import clean as clean



  from .autonotebook import tqdm as notebook_tqdm


transformers: 4.24.0


In [10]:
df= pd.read_csv('../data/raw/osdg-community-data-v2023-10-01.csv', sep='\\t', engine='python')
#df= pd.read_csv('../data/raw/osdg-community-data-v2023-07-01.csv', sep='\\t', engine='python')
#df= pd.read_csv('../data/raw/osdg-community-data-v2023-01-01.csv', sep='\\t', engine='python')
#df= pd.read_csv('../data/raw/osdg-community-data-v2022-10-01.csv', sep='\\t', engine='python')
#df= pd.read_csv('../data/raw/osdg-community-data-v2022-07-01.csv',sep='\t')
#df= pd.read_csv('../data/raw/osdg-community-dataset-v2022-04-01.csv',sep='\t')

"""
only uncomment if using 2023-01-01 data to take care of typos in column names:
"""
# df.columns = df.columns.str.strip('\"')
# df["doi"] = df["doi"].str.strip('\"')
# df["agreement"] = df["agreement"].str.strip('\"')
# df["agreement"] = df["agreement"].astype(float)


print(df.shape)
df.head(5)

(42355, 7)


Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement
0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"""From a gender perspective, Paulgaard points o...",5,1,8,0.777778
1,10.18356/eca72908-en,00028349a7f9b2485ff344ae44ccfd6b,Labour legislation regulates maximum working h...,11,2,1,0.333333
2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,8,0.777778
3,10.1787/3726edff-en,0005d3e8b213d9e2cb967666e1aca2e9,Applied research is directed “primarily toward...,9,3,6,0.333333
4,10.1787/5k9b7bn5qzvd-en,0006a887475ccfa5a7f5f51d4ac83d02,The extent to which they are akin to corruptio...,3,1,2,0.333333


In [5]:
print('average text length: ', round(df.text.str.split().str.len().mean()))
print('stdev text length: ', round(df.text.str.split().str.len().std()))
print('max text length: ', round(df.text.str.split().str.len().max()))

if df.text.str.split().str.len().mean() < 300:
    print("suitable for standard transformer models!")

average text length:  94
stdev text length:  31
max text length:  226
suitable for standard transformer models!


In [6]:
'''plot sdg distribution'''
fig = px.histogram(df, x="sdg",  nbins=35, title="SDG Distribution")
fig.update_layout(
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 1,
        dtick = 1
    )
)
fig.show()

print("not evenly distributed - check when training multilabel/mutliclass classifier")

print("We see that only 16 SDGs are included. SDG 17 seem to be missing. It is an overarching SDG - so it is understandable that annotating for it is difficult")

not evenly distributed - check when training multilabel/mutliclass classifier
We see that only 16 SDGs are included. SDG 17 seem to be missing. It is an overarching SDG - so it is understandable that annotating for it is difficult


In [7]:
'''plot share of positive/negative labels per sdgs to visualise agreement of annotators'''
count_sdg = df[['sdg', 'labels_negative', 'labels_positive']].groupby('sdg', as_index=False).sum()
fig = px.bar(count_sdg, x="sdg", y=["labels_negative", "labels_positive"], title="Annotation Distribution")
fig.update_layout(
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 1,
        dtick = 1
    )
)
fig.show()

print("agreement looks are okayish...")


agreement looks are okayish...


In [17]:
# keeping only the texts whose suggested sdg labels is accepted and the agreement score is at least .75
# We want to have pretty high confidence in our labels and thus drop almost 50% of the training data. Quality > Quantity
print('Shape before:', df.shape)
df_cut = df.query('agreement >= .7 and labels_positive > labels_negative').copy()
print('Shape after :', df_cut.shape)
display(df_cut.head())

Shape before: (42355, 7)
Shape after : (20545, 7)


Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement
0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"""From a gender perspective, Paulgaard points o...",5,1,8,0.777778
2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,8,0.777778
8,10.1787/9789264117563-8-en,000bfb17e9f3a00d4515ab59c5c487e7,The Israel Oceanographic and Limnological Rese...,6,0,3,1.0
9,10.18356/805b1ae4-en,001180f5dd9a821e651ed51e30d0cf8c,Previous chapters have discussed ways to make ...,2,0,3,1.0
11,10.1017/S0008423907070424,00162fc8346ca9cd525d8f87ac2b5352,The “War on Terror” and the Framework of Inter...,16,0,7,1.0


In [18]:
#aggregate 
df_lambda = df_cut.groupby('sdg', as_index = False).agg(count = ('text_id', 'count'))
df_lambda['share'] = df_lambda['count'].divide(df_lambda['count'].sum()).multiply(100)
print('Shape:', df_lambda.shape)
display(df_lambda)

Shape: (16, 3)


Unnamed: 0,sdg,count,share
0,1,1131,5.504989
1,2,760,3.699197
2,3,1788,8.702847
3,4,2261,11.005111
4,5,2337,11.37503
5,6,1286,6.259431
6,7,1775,8.639572
7,8,827,4.02531
8,9,1251,6.089073
9,10,783,3.811146


In [19]:
fig = px.bar(
    data_frame = df_lambda,
    x = 'sdg',
    y = 'count',
    custom_data = ['share'],
    labels = {
        'sdg': 'SDG',
        'count': 'Count'
    },
    color_discrete_sequence = ['#1f77b4'],
    title = 'Figure 2. Distribution of Texts (Agreement > 70%) over SDGs'
)

fig.update_traces(hovertemplate = 'SDG %{x}<br>Count: %{y}<br>Share: %{customdata:.2f}%')
fig.update_layout(xaxis = {'type': 'category'})
fig.show()

In [None]:
'''process and clean text'''
columns_to_clean = ['text']

for column in columns_to_clean:
    df[column] = df[column].astype(str)
    
    #basic cleaning
    new = column + "_clean"
    print(new)
    df[new] = df[column].apply(clean.basic)
    
    #lemmatise and stemming + basic cleaning
    # new_spacy = column + "_clean_spacy"
    # print(new_spacy)
    # df[new_spacy] = df[column].apply(clean.spacy_clean)

In [21]:
'''store processed data'''
df_cut.to_csv(os.path.abspath(os.path.join('..', 'data/processed'))+'/data_processed.csv', index=False)

In [22]:
print(df_cut.text.iloc[0])
print('_____________')
print(df_cut.text_clean.iloc[0])
print('_____________')
# print(df.text_clean_spacy.iloc[0])

"From a gender perspective, Paulgaard points out that the labour markets of the fishing villages have been highly gender-segregated in terms of the existence of ""male jobs"" and ""female jobs"", however, the new business opportunities have led to the male population of the peripheral areas now working in the service industry in former ""female jobs"": ""That boys and girls are doing the same jobs indicates change, because traditional boundaries between women and men's work are being crossed. But the fact that young people are still working represents continuity with the past"" (Paulgaard 2002: 102). When Paulgaard refers to continuity with traditions, she refers to the expectations of young adults to participate in adult culture, thus these fishing villages traditionally have no actual youth culture. As described earlier, Paulgaard (2015) concludes that in some of Norway's peripheral areas school is still 'foreign', a time waster stealing time from young adults who should instead spen

AttributeError: 'DataFrame' object has no attribute 'text_clean'

In [23]:
'''check if there are any duplicates'''
df_cut[df_cut.duplicated(['text'], keep=False)].sort_values(by='text')

Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement


In [24]:
'''transform for hugginface trainer'''
df_alpha = df_cut[["text_clean", "sdg"]]
df_alpha = df_alpha.rename(columns={'sdg': 'target'})

df_alpha.to_csv(os.path.abspath(os.path.join('..', 'data/processed'))+'/data_transformer.csv', index=False)

KeyError: "['text_clean'] not in index"

In [25]:
'''transform for Pytorch training'''

df_alpha = df_cut[["text_clean", "sdg"]]
df_alpha = df_alpha.rename(columns={'sdg': 'label'})
df_alpha = df_alpha.rename(columns={'text_clean': 'sentence1'})

# split train valid:
from sklearn.model_selection import train_test_split
'''split test train'''
train, valid = train_test_split(df_alpha, test_size=0.2, stratify=df['sdg'].values, random_state = 1, shuffle = True)

print('train size:', len(train))
print('test size:', len(valid))

#assert len(set(y_train)) == len(set(y_test)), "Unequal splits with some labels missing"

train.to_csv(os.path.abspath(os.path.join('..', 'data/processed'))+'/train.csv', index=False)
valid.to_csv(os.path.abspath(os.path.join('..', 'data/processed'))+'/valid.csv', index=False)


KeyError: "['text_clean'] not in index"