In [591]:
from dataclasses import dataclass, asdict
from tqdm import tqdm
import os
import time
import re
import pandas as pd
import numpy as np
from uuid import uuid4
import tiktoken
from more_itertools import chunked

from openai.embeddings_utils import get_embedding, cosine_similarity
import openai

In [576]:
def find_text_files(directory):
    """
    This function recursively finds all text files in a directory and its subdirectories.
    """
    text_files = []
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.isfile(item_path) and item_path.endswith('.txt'):
            text_files.append(item_path)
        elif os.path.isdir(item_path):
            text_files += find_text_files(item_path)
    return text_files



In [577]:
# Call the function with the root directory
root_directory = 'data/website/'
all_text_files = find_text_files(root_directory)

# Print the list of all text files found
print(len(all_text_files))
print(all_text_files[:4])

5706
['data/website/www.stavanger.kommune.no/etablering-av-dalaneveien-miljogate/index.html.txt', 'data/website/www.stavanger.kommune.no/fyrverkeri-i-stavanger/index.html.txt', 'data/website/www.stavanger.kommune.no/klimastavanger/nytt-om-klima-og-miljo/5-millioner-i-eu-midler/index.html.txt', 'data/website/www.stavanger.kommune.no/klimastavanger/nytt-om-klima-og-miljo/pant-eller-gebyr-for-piggdekkeiere/index.html.txt']


In [578]:
index = 10
with open(all_text_files[index], 'rb') as f:
    #article = f.read().decode("iso-8859-1")
    article = f.read().decode("utf-8")

print(all_text_files[index])


data/website/www.stavanger.kommune.no/nn/klimastavanger/nytt-om-klima-og-miljo/pant-eller-gebyr-for-piggdekkeiere/index.html.txt


In [579]:
# data structure
# 1. document
# 2. header
# 3. paragraph
# 4. author
# 5. date
# 6. text


@dataclass
class Entry:
    def __init__(self, id, address, header, paragraph_number, star_count):
        self.id = id
        self.address = address.replace('data/website/', '').replace('.txt', '')
        self.parent = None
        self.children = []
        self.header = self.remove_backspaces(header)
        self.paragraph_number = paragraph_number
        self.text = self.remove_backspaces(header) + ' - '
        self.author = None
        self.date = None
        self.star_count = star_count
        self.char_count = 0
        self.text_hash = None
        self.embedding = None

    def add_line_to_text(self, line):
        self.text += line + '\n'
    
    def count_chars(self):
        self.char_count = len(self.text)
    
    def add_child(self, child):
        self.children.append(child)
    
    def set_parent(self, parent):
        self.parent = parent
    
    def set_author(self, author):
        self.author = author
    
    def set_date(self, date):
        self.date = date

    def remove_images(self):
        pattern = r'\[data:image/[^]]*?\]'
        self.text = re.sub(pattern, '', self.text)

    def remove_backspaces(self, text):
        return re.sub('.?\x08', '', text)

    def set_embedding(self, embedding):
        self.embedding = embedding
    
    def cleanup(self):
        self.remove_images()
        self.text = self.remove_backspaces(self.text)
        self.count_chars()
        self.text_hash = hash(self.text)

    @property
    def image_count(self):
        return self.text.count('[data:image/')
    
    @property
    def token_count(self):
        """https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them"""
        return int(self.char_count / 4)
    
    @property
    def token_count_2(self):
        """Returns the number of tokens in a text string."""        
        encoding_name = "cl100k_base"
        encoding = tiktoken.get_encoding(encoding_name)
        num_tokens = len(encoding.encode(self.text))
        return num_tokens




In [580]:
def parse(address, article):
    entry = []
    paragraphs_number = 0
    author = ''
    date = ''
    for i, line in enumerate(article.split('\n')):
        if len(line) > 5:
            comp_line = re.sub('.?\x08', '', line)
            if '**' in line:
                start_count = line.count('*')
                paragraphs_number += 1
                entry.append(Entry(uuid4().hex, address, line.replace('*',''), paragraphs_number, start_count))
            elif 'Av :' in comp_line:
                author = comp_line.split(':')[1].strip()
            elif 'Publisert :' in comp_line or 'Oppdatert:' in comp_line:
                date = comp_line.split(':')[1].strip()
            else:
                if entry:
                    entry[-1].add_line_to_text(line)
    
    # summarize article
    last_star_count = 0
    parent_id = None
    for e in entry:
        # if entry start count is higher than last entry, set entry as parent
        if e.star_count > last_star_count:
            parent_id = e.id
        
        if parent_id:
            e.set_parent(parent_id)
        last_star_count = e.star_count

        e.set_author(author)
        e.set_date(date)

        e.cleanup()
    
    return entry

In [581]:

res = []
for path in tqdm(all_text_files):
    with open(path, 'rb') as f:
        #article = f.read().decode("iso-8859-1")
        article = f.read().decode("utf-8")
    res += parse(path, article)

print('number of imgaes in text', sum([e.image_count for e in res]))

100%|██████████| 5706/5706 [00:12<00:00, 460.34it/s] 


number of imgaes in text 0


## Check for duplicates

In [582]:
# check duplicates
print(len(res))
print(len(set([e.text_hash for e in res])))

# count entries with same hash
from collections import Counter
print(Counter([e.text_hash for e in res]).most_common(10))

# check duplicates
#_ = [print(e.text) for e in res if e.text_hash == -3497365673975875986]

# remove duplicates
res_cleared = []
unique_hashes = set()
for e in res:
    if e.text_hash not in unique_hashes:
        res_cleared.append(e)
        unique_hashes.add(e.text_hash)


227048
17823
[(9008245812682478361, 5433), (-3648746030429287278, 5433), (-108987978997381784, 5433), (-4874404133725328492, 5433), (-394359902893909758, 5433), (-6247031336668194210, 5433), (-6933633828786028366, 5433), (-1307531640894142734, 5433), (4701816960105232996, 5433), (-2671904024314125064, 5433)]


## Check costs

In [583]:
print(len(res_cleared), len(res))
# approx
number_tokens = sum([e.token_count for e  in res_cleared if e.token_count < 80000 and e.token_count > 200])/1000
print('sum of k tokens ', number_tokens)
print(f'approximate cost {number_tokens * 0.0004:.2f}$')
# exact
number_tokens = sum([e.token_count_2 for e  in res_cleared if e.token_count_2 < 80000 and e.token_count_2 > 200])/1000
print('sum of k tokens ', number_tokens)
print(f'exact cost {number_tokens * 0.0004:.2f}$')

17823 227048
sum of k tokens  1694.038
approximate cost 0.68$
sum of k tokens  2655.122
exact cost 1.06$


In [584]:
res_cleared.sort(key=lambda x: x.token_count_2, reverse=True)
print(res_cleared[-10].text)

 Contact us - 


In [585]:
print(res_cleared[500].text.replace('\n', ' ').replace('\r', ' ').replace('==','').replace('\t', ' '))

 Utested har fått åpne igjen - Utestedet Hygge i Steinkargata 4 i Stavanger sentrum måtte onsdag 14.10 stenge med umiddelbar virkning. Oppdatert 15. oktober: Hygge har nå fått åpne igjen. Publisert: 14.10.2020 Av: Kommunikasjonsavdelingen  Skriv ut  Del på Facebook  Del på Twitter  Del på LinkedIn  Denne nyheitssaka er meir enn ein månad gammal. Innhaldet kan derfor vera forelda eller ikkje oppdatert. Gå til stavanger.kommune.no/nyheter for siste nytt. – Kontrollørene våre har etter flere tilsyn den siste uken gitt tilbakemeldinger om at smittevernet på Hygge har vært for dårlig. Folk har sittet for tett inne, gjester er observert vandrende i lokalet og det har ikke vært kontroll på køen utenfor, sier smittevernoverlege Runar Johannessen. Smittevernoverlegen i kommunen har derfor vedtatt å stenge utestedet til og med søndag 18. oktober. Smittevernfaglig forsvarlig drift innebærer blant annet at gjester og personell skal kunne holde minst én meters avstand til personer fra andre hushold

## Embedd text and sorte as dataframe

In [586]:

# Set up your API key
openai.api_key = os.environ["OPENAI_API_KEY"]


In [588]:
[(e.header, e.date, e.address) for e in res_cleared if 'Per Erling Ramslands legat' in e.text]

[(' Kultur',
  '',
  'www.stavanger.kommune.no/kultur-og-fritid/kultur/index.html'),
 (' Kultur',
  '',
  'www.stavanger.kommune.no/nn/kultur-og-fritid/kultur/index.html'),
 (' Legater', '', 'www.stavanger.kommune.no/nn/legater/index.html'),
 (' Om Per Erling Ramslands legat',
  '06.10.2022',
  'www.stavanger.kommune.no/nn/legater/per-erling-ramslands-legat/index.html'),
 (' P', '', 'www.stavanger.kommune.no/nn/skjema-a-aa/index.html'),
 (' Samfunnsutvikling',
  '06.10.2022',
  'www.stavanger.kommune.no/nn/legater/per-erling-ramslands-legat/index.html'),
 (' Per Erling Ramslands legat',
  '06.10.2022',
  'www.stavanger.kommune.no/nn/legater/per-erling-ramslands-legat/index.html'),
 (' Søknadsfrist',
  '06.10.2022',
  'www.stavanger.kommune.no/nn/legater/per-erling-ramslands-legat/index.html'),
 (' Søknadsskjema',
  '06.10.2022',
  'www.stavanger.kommune.no/nn/legater/per-erling-ramslands-legat/index.html')]

In [596]:
indexed_and_cleand_data = [(i,e.text.replace('\n', ' ').replace('\r', ' ').replace('==','').replace('\t', ' ')) for i,e in enumerate(res_cleared) if e.token_count_2 < 8190 and e.token_count_2 > 30 and e.embedding is None]
print('number of entries to process', len(indexed_and_cleand_data))
processed = 0
for batch in tqdm(chunked(indexed_and_cleand_data, 20), total=len(indexed_and_cleand_data)//20):   
    model = "text-embedding-ada-002"
    embeddings = openai.Embedding.create(input = [text for _, text in batch], model=model)
    for i, embedding in enumerate(embeddings['data']):
        res_cleared[batch[i][0]].set_embedding(embedding['embedding'])
    processed += len(batch)
    time.sleep(1)
print('number of processed entries', processed)

number of entries to process 12719


636it [18:57,  1.79s/it]                         

number of processed entries 12719





In [597]:
# number of entries to process 12719
# number of entries to process 15879
df = pd.DataFrame(e.__dict__ for e in res_cleared)
df.head(20)

Unnamed: 0,id,address,parent,children,header,paragraph_number,text,author,date,star_count,char_count,text_hash,embedding
0,3da26c24565d41d98be7203b5e3d04b5,www.stavanger.kommune.no/nn/nettstedskart/inde...,3da26c24565d41d98be7203b5e3d04b5,[],Nettstedskart,15,Nettstedskart - + Åpne alle\n * [Unknown I...,,,24,222220,3727822541725835575,
1,8376763e4fb945489500b025ec8c6d59,www.stavanger.kommune.no/nn/politikk/finn-poli...,e486fd853a294f348ddebef81cb242b5,[],H,26,"H - Hagen, Chanita Bjørnholdt (Rødt)\nHagen, ...",,,20,31681,-7911132056985941786,
2,87930f91a80d40b7ac515359b5427b15,www.stavanger.kommune.no/nn/politikk/finn-poli...,e486fd853a294f348ddebef81cb242b5,[],S,37,"S - Sakariassen, Eirik Faret (SV)\nSakariasse...",,,20,28899,8509952105144895095,
3,70776f73d0694211900b7fb488bbf7c2,www.stavanger.kommune.no/nn/politikk/finn-poli...,e486fd853a294f348ddebef81cb242b5,[],B,20,"B - Bakka, Bård Urbanski (SV)\nBakka, Bård Ur...",,,20,24410,4456622933094449403,
4,8b853fb08f5e49e694d546728ea58ea8,www.stavanger.kommune.no/om-stavanger-kommune/...,5477ddfaf234486a9c42c8ab546608d0,[],Seriøsitetsbestemmelser Stavanger kommune,18,Seriøsitetsbestemmelser Stavanger kommune - ...,,03.06.2022,20,23596,-7424391898506248631,"[-0.013609503395855427, -0.0005882888217456639..."
5,44b23d5a563c49b8937fdadedde4a872,www.stavanger.kommune.no/naring-og-arbeidsliv/...,79c8f7299b09416db82a92cba5e71783,[],6.1 Brytningstid,66,6.1 Brytningstid - Når denne planen skrives s...,,24.01.2023 12.29.21,20,19717,-4944443677513589319,"[-0.00238802726380527, -0.030221812427043915, ..."
6,825e2f27b63147cd85f15e47391bb7c8,www.stavanger.kommune.no/politikk/finn-politik...,18448b3cd3d74b10a54e270206e15d71,[],Kort fortalt,16,Kort fortalt - Her finner du oversikt over al...,,,20,20948,8428085813350174606,"[-0.00371743505820632, -0.011319935321807861, ..."
7,10b992d24d014f2e91afcd2446eb7b05,www.stavanger.kommune.no/nn/politikk/finn-poli...,10b992d24d014f2e91afcd2446eb7b05,[],Finn politikar,15,Finn politikar - [Unknown INPUT type] [Unknow...,,,24,20584,6416132697179666110,"[-0.014194659888744354, -0.025898069143295288,..."
8,b5a9882bbae44ed091d79768ee02c863,www.stavanger.kommune.no/nn/samfunnsutvikling/...,e759c99d04824338a129e2a923dc84b3,[],5.1 Eksisterende og framtidige risikofaktorer...,42,5.1 Eksisterende og framtidige risikofaktorer...,,19.10.2021 12,20,18205,-1832517965159402464,"[-0.0029011061415076256, -0.022215526551008224..."
9,6483d522703d4c0cb8b2a504df357d71,www.stavanger.kommune.no/helse-og-omsorg/infor...,5967c4924faf4df5a35f52351b272209,[],Januar 2021,49,Januar 2021 - – – – – – – – – – – – – – – – –...,,22.02.2023,20,13062,-4929450937363099814,"[0.0050765410996973515, -0.008910640142858028,..."


In [598]:
df.to_feather('cleand.feather')

In [599]:
pd.read_feather('cleand.feather')

Unnamed: 0,id,address,parent,children,header,paragraph_number,text,author,date,star_count,char_count,text_hash,embedding
0,3da26c24565d41d98be7203b5e3d04b5,www.stavanger.kommune.no/nn/nettstedskart/inde...,3da26c24565d41d98be7203b5e3d04b5,[],Nettstedskart,15,Nettstedskart - + Åpne alle\n * [Unknown I...,,,24,222220,3727822541725835575,
1,8376763e4fb945489500b025ec8c6d59,www.stavanger.kommune.no/nn/politikk/finn-poli...,e486fd853a294f348ddebef81cb242b5,[],H,26,"H - Hagen, Chanita Bjørnholdt (Rødt)\nHagen, ...",,,20,31681,-7911132056985941786,
2,87930f91a80d40b7ac515359b5427b15,www.stavanger.kommune.no/nn/politikk/finn-poli...,e486fd853a294f348ddebef81cb242b5,[],S,37,"S - Sakariassen, Eirik Faret (SV)\nSakariasse...",,,20,28899,8509952105144895095,
3,70776f73d0694211900b7fb488bbf7c2,www.stavanger.kommune.no/nn/politikk/finn-poli...,e486fd853a294f348ddebef81cb242b5,[],B,20,"B - Bakka, Bård Urbanski (SV)\nBakka, Bård Ur...",,,20,24410,4456622933094449403,
4,8b853fb08f5e49e694d546728ea58ea8,www.stavanger.kommune.no/om-stavanger-kommune/...,5477ddfaf234486a9c42c8ab546608d0,[],Seriøsitetsbestemmelser Stavanger kommune,18,Seriøsitetsbestemmelser Stavanger kommune - ...,,03.06.2022,20,23596,-7424391898506248631,"[-0.013609503395855427, -0.0005882888217456639..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17818,104afcd891694f279f9c0a97be465e5c,www.stavanger.kommune.no/nn/om-stavanger-kommu...,5f2e51f992b1405796bcdf6f6a90ee66,[],,63,-,,09.06.2022,4,8,-2807085881422901991,
17819,c0f2f24b3af14c2a8bc30577b83d2894,www.stavanger.kommune.no/nn/naring-og-arbeidsl...,4c2620c220b94a45b68c976f386e75b6,[],,18,-,,27.01.2022,10,4,7052270201380712647,
17820,021d150b74714eed8d442f1d2e991b55,www.stavanger.kommune.no/nn/bolig-og-bygg/kart...,ec4923e6f64344a9b56ca312a1abf5b1,[],,21,-,,18.10.2022,8,49,-6297959306800227963,
17821,a18ca61b4e2d4d839cc4b65b90d4b1c7,www.stavanger.kommune.no/nn/bolig-og-bygg/kart...,2b5c98f69bc94597a97f2b36ebfe353b,[],...,24,...,,18.10.2022,16,74,3788553208836580623,
