# Basic imports

In [None]:
import re
import pdfplumber
from spacy.lang.nb import Norwegian
import spacy

# Helper functions

In [None]:
def pdfToText(fileName):
    """
    Extract the text of every page of a PDF as one whitespace-normalised string.

    Parameters
    ----------
    fileName : path of the PDF file; handed to pdfplumber.open.

    Returns
    -------
    str : the page texts concatenated in page order (no separator is inserted
    between pages), with every whitespace character replaced by one space.
    """
    pageTexts = []
    with pdfplumber.open(fileName) as pdf:
        for page in pdf.pages:
            # extract_text() may return None for a page without extractable
            # text (depends on the pdfplumber version); treat that as an empty
            # page instead of failing on str + None.
            pageTexts.append(page.extract_text() or "")
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # Each whitespace character maps to one space; runs are not collapsed.
    return re.sub(r'\s', ' ', "".join(pageTexts))

# Load the planning documents analysed below; each PDF becomes one string.
regPlan, kirkepol, smabathavn, kultur, havbruk = (
    pdfToText(pdfName)
    for pdfName in (
        'regional-planstrategi-2016-2010.pdf',
        '050200-sak-kommunens-kirkepolitikk.pdf',
        'kommunedelplan-for-smabathavner-2007-2017.pdf',
        'strategiplan-kultur-web.pdf',
        'havbruk.pdf',
    )
)

In [184]:
def txtToStr(filename, encoding=None):
    """
    Read a text file into one whitespace-normalised string.

    Parameters
    ----------
    filename : path of the text file to read.
    encoding : optional text encoding passed to open(). The default None keeps
        the platform default, which is what the function used before.

    Returns
    -------
    str : the file content with every whitespace character (line breaks
    included) replaced by one space.
    """
    # The context manager closes the handle even if reading fails; the handle
    # was previously never closed.
    with open(filename, 'r', encoding=encoding) as textFile:
        text = textFile.read()
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s', ' ', text)

# Plain-text sources, each read into a single string.
brautTxt, byplanWiki = txtToStr("braut.txt"), txtToStr("byplanleggingWiki.txt")

# Scraping

No need to run this section if the "sdg#.txt" files are already present in the "sdgs" folder.

In [None]:
# Scraping imports and inits
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Start a Chrome session using the driver path returned by
# ChromeDriverManager().install().
# NOTE(review): passing the driver path positionally is version-dependent in
# Selenium; confirm against the installed release.
driver = webdriver.Chrome(ChromeDriverManager().install())

In [None]:
# Scraping sdgs from FN.
# Step 1: collect the link of every goal from the overview page.
sdgs = []
driver.get("https://www.fn.no/om-fn/fns-baerekraftsmaal")
soup = BS(driver.page_source, features="html.parser")
title_cards = soup.find_all(class_="header_gols_content_item")
for card in title_cards:
    a = card.find("a", href=True)
    # find() returns None when a card holds no link; skip such a card instead
    # of failing on a["href"].
    if a is not None:
        sdgs.append(a["href"])
print(sdgs)

# Step 2: write the paragraphs of the i-th goal page to sdgs/sdg<i>.txt,
# one paragraph per line.
for i, sdg in enumerate(sdgs, start=1):
    driver.get(f"https://www.fn.no/{sdg}")
    soup = BS(driver.page_source, features="html.parser")
    paragraphs = soup.find_all("p")
    # `with` closes the file even when a later step raises. The platform
    # default encoding is kept on purpose: the notebook reads these files back
    # without specifying an encoding.
    with open(f'sdgs/sdg{i}.txt', 'w') as f:
        for par in paragraphs:
            line = par.text.strip()
            line = line.replace('...', '')
            line = line.replace('&aelig;', 'æ')
            if len(line) > 0:
                # Make sure every stored paragraph ends with a closing mark.
                if line[-1] not in ".?!:)":
                    line += '.'
                f.write(f'{line}\n')

# Predicting

In [None]:
def _getNlp(modelName="nb_core_news_lg"):
    """Return the spaCy pipeline for modelName, loading it only on first use."""
    loaded = _getNlp.__dict__.setdefault("loaded", {})
    if modelName not in loaded:
        loaded[modelName] = spacy.load(modelName)
    return loaded[modelName]


def _lemmaDoc(nlp, text):
    """Return a Doc built from the lemmas of text, minus stop words, punctuation and whitespace tokens."""
    lemmas = [
        str(token.lemma_)
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return nlp(' '.join(lemmas))


def similarityText(mainStr, searchStr):
    """
    Return the spaCy similarity of two texts, based on the vectors in nb_core_news_lg.

    Both texts are first reduced to their lemmas (stop words, punctuation and
    whitespace tokens removed); the similarity is computed between the two
    reduced documents.

    Parameters
    ----------
    mainStr : first text.
    searchStr : second text.

    Returns
    -------
    The value of Doc.similarity for the two reduced documents.
    """
    # The model is loaded once and reused; loading it inside every call made
    # each comparison pay the full model start-up cost.
    nlp = _getNlp()
    return _lemmaDoc(nlp, mainStr).similarity(_lemmaDoc(nlp, searchStr))

In [None]:
def sdgSimilatiry(string):
    """
    Score `string` against each of the 17 SDG texts stored in sdgs/sdg<N>.txt.

    Every score is printed as soon as it is computed; the scores are returned
    as a list ordered from SDG 1 to SDG 17.
    """
    scores = []
    for number in range(1, 18):
        score = similarityText(string, txtToStr(f"sdgs/sdg{number}.txt"))
        print(f"SDG #{number} has this similarity to your string: {score}")
        scores.append(score)
    return scores

In [None]:
sdgVector = dict()  # Similarity results, filled by addSdgVector (name -> list of scores)

In [None]:
def addSdgVector(str, name):
    """
    Compute the SDG similarity scores of the text `str` and store them in the
    module-level sdgVector dict under the key `name`.
    """
    scores = sdgSimilatiry(str)
    sdgVector[name] = scores

In [None]:
# Score three of the loaded plans against all SDGs and keep the results.
for planText, planTitle in (
    (regPlan, "Regional planstrategi"),
    (smabathavn, "Kommunedelplan for småbåthavner"),
    (kultur, "Strategiplan kultur"),
):
    addSdgVector(planText, planTitle)

In [None]:
# for documentation see: https://www.kaggle.com/satishgunjal/tutorial-text-classification-using-spacy
import string

# Filters used by spacy_tokenizer: the ASCII punctuation characters and the
# Norwegian (nb) stop-word set from spaCy.
punctuations = string.punctuation
stop_words = spacy.lang.nb.stop_words.STOP_WORDS

# Norwegian language object (not referenced again in the visible cells).
parser = Norwegian()
# Pipeline that spacy_tokenizer uses to split and lemmatise text.
nlp = spacy.load("nb_core_news_lg")
def spacy_tokenizer(sentence):
    """Turn a sentence into a list of cleaned tokens.

    The sentence is run through the module-level `nlp` pipeline; each token is
    replaced by its lowercased, stripped lemma, and tokens found in
    `stop_words` or in `punctuations` are left out.
    """
    kept = []
    for word in nlp(sentence):
        # '-PRON-' is a placeholder lemma for personal pronouns (used by older
        # spaCy releases); in that case fall back to the lowercased word.
        if word.lemma_ != "-PRON-":
            token = word.lemma_.lower().strip()
        else:
            token = word.lower_
        # Drop stop words and punctuation.
        if token in stop_words or token in punctuations:
            continue
        kept.append(token)
    return kept

In [None]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
class predictors(TransformerMixin):
    """Stateless pipeline step that normalises raw texts with clean_text."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with clean_text applied to every text in X."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report no parameters (always an empty dict)."""
        return {}

# Basic function to clean the text
def clean_text(text):
    """Strip leading/trailing whitespace from text and lowercase the result."""
    stripped = text.strip()
    return stripped.lower()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser that tokenises with spacy_tokenizer (passed as `tokenizer`).
tfidfVector = TfidfVectorizer(tokenizer=spacy_tokenizer)

In [169]:
# Creating trainingdata for classifying SDGs
# One sample per line of sdgs/sdg<N>.txt, labelled with the goal number N.
X_train = []
y_train = []
for i in range(17):
    # `with` closes each file after reading; the handles were previously left
    # open.
    with open(f'sdgs/sdg{i+1}.txt') as f:
        for line in f:
            # Raw string avoids the invalid '\s' escape. Every whitespace
            # character, the trailing newline included, becomes a space.
            line = re.sub(r'\s', ' ', line)
            X_train.append(line)
            y_train.append(i+1)

In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression using the liblinear solver.
classifier = LogisticRegression(multi_class='ovr', solver='liblinear')

# Create pipeline using Tf-idf: clean text -> TF-IDF features -> classifier.
pipe = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", tfidfVector),
        ("classifier", classifier),
    ]
)
                 

In [170]:
# Fit the whole pipeline on the SDG text lines and their goal numbers.
pipe.fit(X_train, y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7fb992a34690>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7fb98da884d0>)),
                ('classifier',
                 LogisticRegression(multi_class='ovr', solver='liblinear'))])

In [None]:
# Documents to classify: three of the PDF texts loaded with pdfToText.
X_test = [kirkepol, regPlan, kultur]

In [174]:
def tfidfModel(testData):
    """
    Run the module-level Pipeline (pipe) on testData and return the result of
    its predict_proba call.
    """
    return pipe.predict_proba(testData)

# One row per document in X_test; the recorded output has 17 columns per row.
tfidfModel(X_test)

[[0.05007421 0.05547551 0.05961166 0.04732276 0.06221764 0.05157153
  0.05782691 0.05784505 0.05378199 0.07203965 0.07491142 0.05713694
  0.04850368 0.0548395  0.06136086 0.06523781 0.07024289]
 [0.04705317 0.05632377 0.0566746  0.05294855 0.05177795 0.0450207
  0.05609791 0.05307658 0.07936843 0.05567265 0.06739587 0.06689712
  0.05181272 0.05858311 0.05592472 0.05140032 0.09397184]
 [0.04928288 0.05188751 0.05563631 0.04871648 0.04493027 0.04819156
  0.04458906 0.06354457 0.08569743 0.058139   0.06599206 0.06008801
  0.04411382 0.05527874 0.0655141  0.06703287 0.09136534]]


array([[0.05007421, 0.05547551, 0.05961166, 0.04732276, 0.06221764,
        0.05157153, 0.05782691, 0.05784505, 0.05378199, 0.07203965,
        0.07491142, 0.05713694, 0.04850368, 0.0548395 , 0.06136086,
        0.06523781, 0.07024289],
       [0.04705317, 0.05632377, 0.0566746 , 0.05294855, 0.05177795,
        0.0450207 , 0.05609791, 0.05307658, 0.07936843, 0.05567265,
        0.06739587, 0.06689712, 0.05181272, 0.05858311, 0.05592472,
        0.05140032, 0.09397184],
       [0.04928288, 0.05188751, 0.05563631, 0.04871648, 0.04493027,
        0.04819156, 0.04458906, 0.06354457, 0.08569743, 0.058139  ,
        0.06599206, 0.06008801, 0.04411382, 0.05527874, 0.0655141 ,
        0.06703287, 0.09136534]])

In [185]:
# Same prediction for the single plain-text document byplanWiki.
tfidfModel([byplanWiki])

[[0.03982087 0.05026226 0.06192469 0.05234734 0.05295878 0.05419719
  0.07524212 0.03971253 0.05953944 0.06374593 0.06831475 0.04775197
  0.05092737 0.06188385 0.06360098 0.07821822 0.07955172]]


array([[0.03982087, 0.05026226, 0.06192469, 0.05234734, 0.05295878,
        0.05419719, 0.07524212, 0.03971253, 0.05953944, 0.06374593,
        0.06831475, 0.04775197, 0.05092737, 0.06188385, 0.06360098,
        0.07821822, 0.07955172]])