# Lawlingo
Import the libraries used for scraping, PDF text extraction, and word counting.

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import os
import urllib.request
import regex as re
import pandas as pd
from joblib import Memory
from datetime import datetime
import textract
from collections import Counter
# joblib cache stored under ./cache; applied below as the @memory.cache
# decorator on the scraping functions.
memory = Memory('./cache', verbose=1)

In [193]:
# Site root; PDF links scraped from decision pages are combined with this prefix.
BASE_URL = "https://www.domstol.se/"
# Listing page that decision_urls() opens; the query string filters the
# listing on DecisionType_list:decision.
URL = BASE_URL + "hogsta-domstolen/avgoranden/?f=DecisionType_list:decision"

@memory.cache
def decision_urls():
    """Collect the link of every decision shown on the listing page at ``URL``.

    Opens the page in Chrome, dismisses the banner, then keeps pressing the
    "show more" button for as long as it becomes clickable again within the
    wait timeout. Once the button stops appearing, the ``href`` of every
    result element is read.

    Returns:
        list: ``href`` attribute values of the result elements.

    Raises:
        selenium.common.exceptions.TimeoutException: if the banner button
            does not become clickable within the wait timeout.
    """
    # Local import so the block stays self-contained.
    from selenium.common.exceptions import TimeoutException

    BANNER_CLASS_NAME = "banner__button"
    MORE_CLASS_NAME = "search-result-item__show-more-btn"
    RESULT_CLASS_NAME = "u-pr-huge--large"

    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 10)
        driver.get(URL)

        # Wait first, then click the element the wait returns: looking the
        # button up before it is rendered would raise NoSuchElementException.
        banner_button = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, BANNER_CLASS_NAME))
        )
        banner_button.click()

        while True:
            try:
                # Re-locate the button on every round; a reference taken
                # earlier can go stale once the page re-renders the list.
                more_button = wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, MORE_CLASS_NAME))
                )
            except TimeoutException:
                # No clickable "show more" button any more: everything is loaded.
                break
            more_button.click()

        results = driver.find_elements(By.CLASS_NAME, RESULT_CLASS_NAME)
        return [item.get_attribute("href") for item in results]
    finally:
        # Always end the browser session, even when scraping fails midway.
        driver.quit()


@memory.cache
def data(url):
    """Fetch one decision page and extract its date and the text of its PDF.

    Args:
        url: address of a decision page.

    Returns:
        dict | None: ``{"date": date, "url": url, "text": text}`` where
        ``date`` is a ``datetime.date`` parsed from the page byline and
        ``text`` is the text textract extracts from the linked PDF.
        ``None`` when the page contains no link to a ``.pdf`` file.
    """
    # Local imports so the block stays self-contained.
    import tempfile
    from urllib.parse import urljoin

    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    pdf_element = soup.find("a", href=re.compile(r".+\.pdf"))
    if pdf_element is None:
        # Nothing to extract; skip before touching the byline so pages
        # without a PDF never fail on date parsing.
        return None

    date_element = soup.find("span", class_="publisher__byline")
    # The last character of the byline text is dropped before ISO parsing.
    date = datetime.fromisoformat(date_element.text[:-1]).date()

    # urljoin handles root-relative ("/x.pdf"), relative and absolute hrefs;
    # plain concatenation yields "//" or a malformed URL for the first/last.
    pdf_url = urljoin(BASE_URL, pdf_element["href"])

    # A private temporary directory avoids clobbering a shared temp.pdf and
    # cleans the download up afterwards. The .pdf suffix is kept because the
    # file is handed to textract by path.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "decision.pdf")
        urllib.request.urlretrieve(pdf_url, pdf_path)
        text = textract.process(pdf_path, encoding='utf-16').decode('utf-16')

    return {"date": date, "url": url, "text": text}
    
# Characters deleted from the text before splitting: / . ? , ! : ( ) _
_WORDS_DELETE_TABLE = str.maketrans("", "", "/.?,!:()_")


def words(text):
    """Split *text* into lower-cased words with basic punctuation removed.

    The characters ``/ . ? , ! : ( ) _`` are deleted (not replaced by a
    space, so ``"a/b"`` becomes ``"ab"``), and the result is split on any
    whitespace, newlines included.

    Args:
        text: the string to tokenize.

    Returns:
        list[str]: the words in order of appearance; empty for blank input.
    """
    return text.lower().translate(_WORDS_DELETE_TABLE).split()

def n_matches(word_stem, count_dict):
    """Add up the counts of all words in *count_dict* that match *word_stem*.

    *word_stem* is used as a regular expression prefixed with ``^``.

    Args:
        word_stem: pattern a word has to match.
        count_dict: mapping from word to its number of occurrences.

    Returns:
        int: sum of the counts of the matching words, 0 when none match.
    """
    pattern = r"^" + word_stem
    return sum(
        occurrences
        for candidate, occurrences in count_dict.items()
        if re.search(pattern, candidate)
    )

In [194]:
# Fetch each decision once and drop the pages that had no PDF (data() -> None).
records = [record for record in map(data, decision_urls()) if record is not None]
df = pd.DataFrame(records)
# Per-document word frequencies.
df['word_count'] = [Counter(words(text)) for text in df['text']]
# Occurrences of words starting with the stem, read from the 'word_count'
# column created above (there is no 'word_dict' column).
df['anhängigg'] = [n_matches('anhängigg', counts) for counts in df['word_count']]
# Last expression of the cell: shows the frame indexed by date (df itself is unchanged).
df.set_index('date')

Unnamed: 0_level_0,url,text,word_count,anhängigg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-08,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (8)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMål...,"{'sida': 9, '1': 9, '8': 5, 'högsta': 14, 'dom...",0
2020-04-08,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (17)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMå...,"{'sida': 21, '1': 10, '17': 3, 'högsta': 28, '...",0
2020-04-03,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (15)\n\nHÖGSTA DOMSTOLENS\nDOM\n\nMål n...,"{'sida': 15, '1': 5, '15': 4, 'högsta': 28, 'd...",0
2020-04-03,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (11)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMå...,"{'sida': 12, '1': 4, '11': 4, 'högsta': 18, 'd...",0
2020-04-02,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (12)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMå...,"{'sida': 13, '1': 8, '12': 4, 'högsta': 25, 'd...",0
...,...,...,...,...
2004-12-15,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (4)\n\nHÖGSTA DOMSTOLENS\nDOM\n\nMål nr...,"{'sida': 4, '1': 2, '4': 5, 'högsta': 9, 'doms...",0
2004-12-13,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (4)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMål...,"{'sida': 3, '1': 8, '4': 3, 'högsta': 10, 'dom...",0
2004-12-09,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (11)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMå...,"{'sida': 12, '1': 4, '11': 12, 'högsta': 45, '...",0
2004-12-07,https://www.domstol.se/hogsta-domstolen/avgora...,Sida 1 (3)\n\nHÖGSTA DOMSTOLENS\nBESLUT\n\nMål...,"{'sida': 4, '1': 6, '3': 4, 'högsta': 10, 'dom...",0
