In [1]:
# standard library
from typing import List

#data wrangling
import pandas as pd
import numpy as np

# nlp
import spacy

# data modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# utils
from tqdm import tqdm

# visualisation
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the OSDG community dataset CSV directly from Zenodo
# (the file name indicates the v21-09-30 release).
osdg_csv_url = 'https://zenodo.org/record/5550238/files/osdg-community-dataset-v21-09-30.csv?download=1'
df = pd.read_csv(osdg_csv_url)
print('Shape:', df.shape)
display(df.head())

In [3]:
# Peek at five random texts labelled SDG 13 (ids are still 1-based at this point).
df.loc[df['sdg'] == 13, ['text', 'sdg']].sample(5)

In [4]:
# Unique SDG class ids present in the raw data, in ascending order
sorted(df["sdg"].unique())

In [5]:
# Shift the SDG ids down by one so that the class labels start at 0.
# Guarded on the current minimum so that re-running this cell does not shift
# the labels a second time (the unguarded assignment was not idempotent).
if df["sdg"].min() >= 1:
    df["sdg"] = df["sdg"] - 1

In [6]:
# Raw texts and their (already shifted) SDG labels as plain arrays.
# Neither name is referenced again in the visible part of the notebook.
sentences, labels = df['text'].values, df['sdg'].values

In [7]:
# Cumulative share of texts at or below each distinct agreement score.
df_lambda = (
    df['agreement']
    .value_counts(normalize = True)
    .sort_index()
    .cumsum()
    .to_frame(name = 'p_sum')
    .reset_index()
    # older pandas names the reset column 'index'; newer versions already call it 'agreement'
    .rename(columns = {'index': 'agreement'})
)

print('Shape:', df_lambda.shape)
display(df_lambda.head())

In [8]:
import plotly.express as px
import plotly.io as pio

# Figure 1: line chart of the cumulative shares held in df_lambda
# (x = agreement score, y = cumulative probability).
fig = px.line(
    data_frame = df_lambda,
    x = 'agreement',
    y = 'p_sum',
    markers = True,
    labels = {
        'agreement': 'Agreement Score',
        'p_sum': 'Cumulative Probability'  # axis label typo fixed (was 'Probrability')
    },
    color_discrete_sequence = ['#1f77b4'],
    title = 'Figure 1. Cumulative Distribution Function of the Agreement Score'
)

# Two-decimal hover text and fixed tick spacing on both axes.
fig.update_traces(hovertemplate = 'Agreement score: %{x:.2f}<br>Cumulative probability: %{y:.2f}')
fig.update_layout(
    xaxis = {'dtick': 0.1},
    yaxis = {'dtick': 0.25}
)
fig.show()

In [9]:
# Keep a text only if its agreement score is at least .6 and more annotators
# accepted the suggested SDG label than rejected it.
print('Shape before:', df.shape)
df_osdg = df[
    (df['agreement'] >= .6) & (df['labels_positive'] > df['labels_negative'])
].copy()
print('Shape after :', df_osdg.shape)
display(df_osdg.head())

In [10]:
# Number of retained texts per SDG label and each label's percentage share.
# Note: this rebinds df_lambda, which previously held the agreement CDF.
df_lambda = (
    df_osdg
    .groupby('sdg', as_index = False)
    .agg(count = ('text_id', 'count'))
)
df_lambda['share'] = df_lambda['count'] / df_lambda['count'].sum() * 100
print('Shape:', df_lambda.shape)
display(df_lambda)

In [11]:
# Figure 2: number of retained texts per SDG.
# The 'sdg' column is 0-based here (the labels were reduced by 1 earlier in the
# notebook), so 1 is added back for display; otherwise the axis labelled "SDG"
# would show goal numbers that are off by one.
fig = px.bar(
    data_frame = df_lambda.assign(sdg = df_lambda['sdg'] + 1),
    x = 'sdg',
    y = 'count',
    custom_data = ['share'],
    labels = {
        'sdg': 'SDG',
        'count': 'Count'
    },
    color_discrete_sequence = ['#1f77b4'],
    # the filter applied to build df_osdg is `agreement >= .6`, not a strict `>`
    title = 'Figure 2. Distribution of Texts (Agreement >= .6) over SDGs'
)

fig.update_traces(hovertemplate = 'SDG %{x}<br>Count: %{y}<br>Share: %{customdata:.2f}%')
# Treat the SDG ids as categories so that every bar gets its own tick.
fig.update_layout(xaxis = {'type': 'category'})
fig.show()

In [12]:
def plot_confusion_matrix(y_true: np.ndarray, y_hat: np.ndarray, figsize = (16, 9)):
    """
    Convenience function to display a confusion matrix in a graph.

    Parameters
    ----------
    y_true: np.ndarray
        ground-truth class labels.
    y_hat: np.ndarray
        predicted class labels, aligned with ``y_true``.
    figsize: tuple
        width and height of the matplotlib figure.

    Returns
    -------
    None
        the heatmap is drawn on a newly created matplotlib figure.
    """
    # Use every class that occurs in either array and hand the same list to
    # confusion_matrix. Deriving the labels from y_true alone made the
    # DataFrame construction fail with a shape mismatch whenever the model
    # predicted a class that is absent from y_true.
    labels = sorted(set(y_true) | set(y_hat))
    df_lambda = pd.DataFrame(
        confusion_matrix(y_true, y_hat, labels = labels),
        index = labels,
        columns = labels
    )
    acc = accuracy_score(y_true, y_hat)
    f1s = f1_score(y_true, y_hat, average = 'weighted')

    fig, ax = plt.subplots(figsize = figsize)
    sns.heatmap(
        df_lambda, annot = True, square = True, cbar = False,
        fmt = 'd', linewidths = .5, cmap = 'YlGnBu',
        ax = ax
    )
    ax.set(
        title = f'Accuracy: {acc:.2f}, F1 (weighted): {f1s:.2f}',
        xlabel = 'Predicted',
        ylabel = 'Actual'
    )
    fig.suptitle('Confusion Matrix')
    plt.tight_layout()

In [13]:
# other settings
# Default plotly template.
# NOTE(review): this is set after Figures 1 and 2 have been built, so it
# presumably only affects figures created later - confirm.
pio.templates.default = 'plotly_white'

# Ask spaCy to use a GPU when one is available; kept before spacy.load on purpose.
spacy.prefer_gpu()
# English pipeline with the NER component disabled; preprocess_spacy only reads
# token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable = ['ner'])
print('Disabled spaCy components:', nlp.disabled)
print('SpaCy version:', spacy.__version__)

In [14]:
def preprocess_spacy(alpha: List[str]) -> List[str]:
    """
    Preprocess text input using spaCy.

    Each text is run through the module-level ``nlp`` pipeline and reduced to
    the lemmas of its nouns, verbs and adjectives, joined by single spaces.

    Parameters
    ----------
    alpha: List[str]
        a text corpus.

    Returns
    -------
    docs: List[str]
        a cleaned version of the original text corpus, one string per input
        text and in the same order.
    """
    kept_pos = {'NOUN', 'VERB', 'ADJ'}

    # nlp.pipe yields documents lazily, so tqdm cannot infer how many there are.
    # Pass the corpus size explicitly (when it has one) to get a real progress bar.
    total = len(alpha) if hasattr(alpha, '__len__') else None

    docs = list()
    for doc in tqdm(nlp.pipe(alpha, batch_size = 128), total = total):
        docs.append(' '.join(token.lemma_ for token in doc if token.pos_ in kept_pos))

    return docs

In [15]:
# Add the cleaned text (lemmas of nouns, verbs and adjectives) as a new 'docs' column.
df_osdg['docs'] = preprocess_spacy(df_osdg['text'].values)
print('Shape:', df_osdg.shape)
display(df_osdg.head())

In [16]:
# Hold out 30% of the cleaned documents for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_osdg['docs'].values,
    df_osdg['sdg'].values,
    random_state = 42,
    test_size = .3
)

print('Shape train:', X_train.shape)
print('Shape test:', X_test.shape)

In [17]:
# Model: uni-/bi-gram TF-IDF -> 5,000 best features by f_classif ->
# class-weighted multinomial logistic regression.
pipe = Pipeline(steps = [
    (
        'vectoriser',
        TfidfVectorizer(ngram_range = (1, 2), min_df = 2, max_df = 0.75, max_features = 100_000)
    ),
    ('selector', SelectKBest(score_func = f_classif, k = 5_000)),
    (
        'clf',
        LogisticRegression(
            solver = 'newton-cg',
            multi_class = 'multinomial',
            penalty = 'l2',
            C = .9,
            class_weight = 'balanced',
            max_iter = 100,
            random_state = 42
        )
    )
])

# Fit on the training split and predict the held-out split in one go.
y_hat = pipe.fit(X_train, y_train).predict(X_test)
plot_confusion_matrix(y_test, y_hat)

print(classification_report(y_test, y_hat, zero_division = 0))

In [18]:
from sklearn.naive_bayes import MultinomialNB

# Model: same TF-IDF features and feature selection as before, with a
# multinomial Naive Bayes classifier on top.
pipe = Pipeline(steps = [
    (
        'vectoriser',
        TfidfVectorizer(ngram_range = (1, 2), min_df = 2, max_df = 0.75, max_features = 100_000)
    ),
    ('selector', SelectKBest(score_func = f_classif, k = 5_000)),
    ('clf', MultinomialNB())
])

# Fit on the training split and predict the held-out split in one go.
y_hat = pipe.fit(X_train, y_train).predict(X_test)
plot_confusion_matrix(y_test, y_hat)

print(classification_report(y_test, y_hat, zero_division = 0))

In [19]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

# Model: raw counts -> TF-IDF weighting -> 5,000 best features by f_classif ->
# linear classifier trained with SGD on the hinge loss.
pipe = Pipeline(steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('selector', SelectKBest(score_func = f_classif, k = 5_000)),
    (
        'clf',
        SGDClassifier(
            loss = 'hinge',
            penalty = 'l2',
            alpha = 1e-3,
            max_iter = 5,
            tol = None,
            random_state = 42
        )
    )
])

# Fit on the training split and predict the held-out split in one go.
y_hat = pipe.fit(X_train, y_train).predict(X_test)
plot_confusion_matrix(y_test, y_hat)

print(classification_report(y_test, y_hat, zero_division = 0))

In [20]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

# Model: raw counts -> TF-IDF weighting -> 5,000 best features by f_classif ->
# multinomial Naive Bayes.
pipe = Pipeline(steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('selector', SelectKBest(score_func = f_classif, k = 5_000)),
    ('clf', MultinomialNB())
])

# Fit on the training split and predict the held-out split in one go.
y_hat = pipe.fit(X_train, y_train).predict(X_test)
plot_confusion_matrix(y_test, y_hat)

print(classification_report(y_test, y_hat, zero_division = 0))

In [21]:
# Model: raw counts -> TF-IDF weighting -> 5,000 best features by f_classif ->
# class-weighted multinomial logistic regression.
# This is the last pipeline bound to `pipe`, i.e. the one used by the cells below.
pipe = Pipeline(steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('selector', SelectKBest(score_func = f_classif, k = 5_000)),
    (
        'clf',
        LogisticRegression(
            solver = 'newton-cg',
            multi_class = 'multinomial',
            penalty = 'l2',
            C = .9,
            class_weight = 'balanced',
            max_iter = 100,
            random_state = 42
        )
    )
])

# Fit on the training split and predict the held-out split in one go.
y_hat = pipe.fit(X_train, y_train).predict(X_test)
plot_confusion_matrix(y_test, y_hat)

print(classification_report(y_test, y_hat, zero_division = 0))

In [22]:
# A single unseen paragraph used to try the model by hand.
s = "The average figure also masks large differences across regions in Kazakhstan. The number of annual contacts ranges from 2.0 in Astana to 9.7 in Mangystau, and some parts of the population are likely to have very limited access to primary care. In addition, poor coverage of outpatient prescription medicines limits both the effectiveness (and appeal) of care at PHC level."

# Build the one-row frame in a single constructor call instead of enlarging an
# empty frame through .loc.
new_df = pd.DataFrame({'text': [s]})

# Apply the same spaCy cleaning that was used for the training documents.
new_df['docs'] = preprocess_spacy(new_df['text'].values)

In [23]:
# Show the one-row frame: the raw text next to its preprocessed 'docs' version
new_df

In [24]:
# The pipeline was fitted on the spaCy-cleaned 'docs' column, so predict on the
# same representation rather than on the raw 'text'.
pipe.predict(new_df['docs'])

In [25]:
# Raw text of the row labelled 2 in the full dataset
df['text'][2]

In [26]:
# SDG label stored for the same row; it is 0-based here because df['sdg'] was
# reduced by 1 earlier in the notebook
df['sdg'][2]

In [27]:
import pickle

# save the model to disk
# (`pipe` is whichever pipeline was fitted last; the context manager makes sure
# the file is flushed and closed, which the bare open() call never did)
filename = 'final_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [28]:
# IPython automagic (not plain Python): shows the current working directory,
# which is where the relative path 'final_model.sav' resolves
pwd

In [35]:
# Per-class probabilities for every test document, one row per document.
# NOTE(review): column order presumably follows pipe.classes_ - confirm.
probs = pipe.predict_proba(X_test)
print(probs)

In [30]:
# Column index of the most probable class for test document 25.
# The previous np.where(max(...)) received a single non-zero scalar and
# therefore always reported index 0, whatever the probabilities were.
np.argmax(probs[25])

In [32]:
def fix_sdg_name(sdg):
    """
    Translate an SDG number into its display name.

    Parameters
    ----------
    sdg:
        the goal number, 1 to 17. It is passed through ``int()`` first, so
        numeric strings and floats are accepted as well. The notebook shifts
        ``df['sdg']`` to start at 0, so such labels need 1 added back before
        being passed here.

    Returns
    -------
    name: str
        a string of the form ``'GOAL <number>: <title>'``.

    Raises
    ------
    KeyError
        if the number is outside 1..17.
    """
    # Titles in goal order; position i (counting from 1) belongs to goal i.
    goal_titles = (
        'No Poverty',
        'Zero Hunger',
        'Good Health and Well-being',
        'Quality Education',
        'Gender Equality',
        'Clean Water and Sanitation',
        'Affordable and Clean Energy',
        'Decent Work and Economic Growth',
        'Industry, Innovation and Infrastructure',
        'Reduced Inequality',
        'Sustainable Cities and Communities',
        'Responsible Consumption and Production',
        'Climate Action',
        'Life Below Water',
        'Life on Land',
        'Peace and Justice Strong Institutions',
        'Partnerships to achieve the Goal',
    )
    sdg_id2name = {
        number: f'GOAL {number}: {title}'
        for number, title in enumerate(goal_titles, start = 1)
    }

    return sdg_id2name[int(sdg)]

In [33]:
# Smoke check of the lookup: id 1 maps to 'GOAL 1: No Poverty' in sdg_id2name
print(fix_sdg_name(1))

In [None]:
import pdfplumber

def extract_data(feed):
    """
    Extract the text of every page of a PDF and join it into one string.

    Parameters
    ----------
    feed:
        the PDF to read, passed straight to ``pdfplumber.open`` (a path or an
        open binary file object).

    Returns
    -------
    text: str
        the page texts in page order, joined by single spaces.
    """
    # pdfplumber.load has been deprecated and removed in favour of
    # pdfplumber.open, which accepts both paths and file objects.
    with pdfplumber.open(feed) as pdf:
        # extract_text() can yield None for a page without extractable text;
        # substitute '' so that the join below does not raise a TypeError.
        data = [p.extract_text() or '' for p in pdf.pages]
    text = ' '.join(data)
    return text # build more code to return a dataframe

