In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from six.moves import urllib
import os
import nltk
from nltk.corpus import wordnet as wn


## 1. Get synsets for target words

In [3]:
# Dutch lemma names for one hand-picked WordNet sense of each target word.
# The sense indices ([0], [1], [1]) are positional picks from wn.synsets(...).
# NOTE(review): they depend on the installed multilingual WordNet data — confirm
# they still select the intended senses after an nltk data update.
homo_syns = wn.synsets('homo',lang='nld')[0].lemma_names('nld')
zigeuner_syns = wn.synsets('zigeuner', lang='nld')[1].lemma_names('nld')
neger_syns = wn.synsets('neger',lang='nld')[1].lemma_names('nld')
# Fixed: the original read `neger+_syns`, which parses as `neger + _syns`
# and raises NameError instead of concatenating the three lemma lists.
all_syns = homo_syns + neger_syns + zigeuner_syns

In [4]:
# Lemmas that are dropped from the search terms before querying the archive.
woorden_eruit = [
    'zwarte',
    'poot',
    'sodemiet',
    'sodemieter',
    'reetkever',
    'holtor',
    'geïnverteerde',
    'mietje',
    'bruinwerker',
    'nicht',
    'nikker',
    'roetmop',
]
# Keep the original order of all_syns, minus the excluded lemmas.
all_syns = [lemma for lemma in all_syns if lemma not in woorden_eruit]

['Roma', 'zigeunerin', 'gipsy', 'zigeuner']

## 2. Download data from KB API

In [5]:
# API key for the KB SRU endpoint. Read it from the environment so the key is
# never committed with the notebook; set KB_API_KEY before starting the kernel
# (or replace the fallback "" with your key locally).
# Fixed: the original line `apikey = #fill in your API key` was a SyntaxError.
apikey = os.environ.get("KB_API_KEY", "")
# Example date bounds in ISO form. NOTE(review): the download loop further
# down builds its own "dd-mm-yyyy" bounds per decade, so confirm whether
# these two values are still needed.
startdate = "1990-01-01"
enddate = "1999-12-31"

In [6]:
def get_article_metadata(apikey,startdate,enddate,search_term,ppn,max_records,decade):
    """Collect metadata for every article matching a search term in one title.

    Pages through the SRU endpoint (collection ``DDD_artikel``), writes the
    collected rows to ``article_metadata_{decade}_{search_term}_{ppn}_2107.csv``
    in the current working directory, and returns the article identifiers.

    Parameters
    ----------
    apikey : str
        Key inserted into the request URL path.
    startdate, enddate : str
        Date bounds; joined with ``%20`` inside the ``date within "..."`` clause.
    search_term : str
        Term inserted verbatim into the query (it is not URL-encoded here).
    ppn : str
        Value for the ``ppn=`` clause; also stored in the ``ppn`` column.
    max_records : str or int
        Page size requested through ``maximumRecords``.
    decade
        Only used to build the CSV filename.

    Returns
    -------
    list of str
        Unique article identifiers, in the order they were first returned.

    Raises
    ------
    RuntimeError
        If the first response contains no ``srw:numberOfRecords`` element.
    """
    daterange = str(startdate + "%20" + enddate)

    def fetch_page(start_record):
        # One SRU request starting at `start_record`, parsed as XML.
        query = f"http://jsru.kb.nl/sru/sru/{apikey}?operation=SearchRetrieve&query={search_term}%20and%20(date%20within%20\"{daterange}\")%20and%20(ppn={ppn})&x-collection=DDD_artikel&recordSchema=dc&startRecord={start_record}&maximumRecords={max_records}"
        response = requests.get(query)
        return BeautifulSoup(response.content,"xml")

    # The first page doubles as the "how many records are there" request, so
    # it is fetched once instead of twice.
    # NOTE(review): startRecord starts at 0 as in the original code; SRU is
    # nominally 1-based, so confirm how the server treats 0.
    start_record = 0
    soup = fetch_page(start_record)
    total_node = soup.find('srw:numberOfRecords')
    if total_node is None:
        # Previously this surfaced as an unbound-variable error further down.
        # The URL is deliberately left out of the message: it contains the key.
        raise RuntimeError(
            f"No srw:numberOfRecords in SRU response for search_term={search_term!r}, ppn={ppn!r}"
        )
    total_records = int(total_node.text)

    rows = []
    while True:
        page = soup.find_all('srw:recordData')
        for record in page:
            rows.append({
                'article_id': record.find('dc:identifier').text,
                'date': record.find('dc:date').text,
                'article_type': record.find('dc:type').text,
            })
        # Advance by what the server actually returned rather than a fixed
        # 1000: this stays correct whether the server honours max_records or
        # caps the page size, and never skips or re-reads whole pages.
        start_record += len(page)
        if not page or start_record > total_records:
            break
        soup = fetch_page(start_record)

    # Build the frame once (DataFrame.append was removed in pandas 2.0) and
    # drop any identifier that was returned on more than one page.
    recordsdf = pd.DataFrame(rows, columns = ['article_id','date','article_type'])
    recordsdf = recordsdf.drop_duplicates(subset='article_id').reset_index(drop=True)
    recordsdf['ppn'] = ppn
    recordsdf['search_term'] = search_term

    recordsdf.to_csv(f"article_metadata_{decade}_{search_term}_{ppn}_2107.csv")

    return recordsdf['article_id'].tolist()

In [7]:
def get_article_text(article_ids,search_term,decade,ppn):
    """Download the text of each article and save the results as a CSV.

    Every entry of ``article_ids`` is requested as a URL, parsed as XML, and
    the text of all ``<text>`` elements is collected into a list. The result
    is written to ``text_{decade}_{search_term}_{ppn}_2107.csv`` in the
    current working directory with columns ``article_id``, ``text`` and
    ``decade``. Nothing is returned.
    """
    def fetch_text_parts(identifier):
        # One request per article; returns the text of each <text> element.
        response = requests.get(identifier)
        parsed = BeautifulSoup(response.content, "xml")
        return [node.text for node in parsed.findAll('text')]

    collected = [(identifier, fetch_text_parts(identifier)) for identifier in article_ids]
    frame = pd.DataFrame(collected, columns = ['article_id','text'])
    frame['decade'] = decade
    frame.to_csv(f"text_{decade}_{search_term}_{ppn}_2107.csv")
In [9]:
# First year of each decade to download (1940 through 1990).
decades = list(range(1940, 2000, 10))
# ppn identifiers passed to the `ppn=` clause of the queries.
ppis = [
    "400367629",
    "412789353",
    "412869594",
    "832675288",
]

In [10]:
%%time

# For every decade, title (ppn) and search term: fetch the matching article
# metadata, then download and save the text of those articles.
for decade_start in decades:
    period_start = f"01-01-{decade_start}"
    period_end = f"31-12-{decade_start+9}"
    for title_ppn in ppis:
        for term in all_syns:
            articles = get_article_metadata(
                apikey, period_start, period_end, str(term), str(title_ppn), "2000", decade_start
            )
            get_article_text(articles, term, decade_start, title_ppn)



CPU times: user 2min 57s, sys: 19.1 s, total: 3min 16s
Wall time: 51min 43s


In [12]:
def get_total_results(apikey,startdate,enddate,ppn,max_records):
    """Return the ``srw:numberOfRecords`` value for one title and date range.

    The date bounds are joined with ``%20`` inside a ``date within`` clause
    and combined with the ``ppn`` filter. The count is returned as the
    element's text (a string), not converted to an integer.
    """
    daterange = "%20".join((startdate, enddate))
    query = f"http://jsru.kb.nl/sru/sru/{apikey}?operation=SearchRetrieve&query=date%20within%20\"{daterange}\"%20and%20(ppn={ppn})&x-collection=DDD_artikel&recordSchema=dc&maximumRecords={max_records}"
    response = requests.get(query)
    count_node = BeautifulSoup(response.content,"xml").find('srw:numberOfRecords')
    return count_node.text

def get_total_results_na(apikey,startdate,enddate,ppn,max_records):
    """Return the ``srw:numberOfRecords`` value restricted to ``type=artikel``.

    Same query as the unrestricted count (date range plus ``ppn`` filter) with
    an extra ``(type=artikel)`` clause. The count is returned as the element's
    text (a string), not converted to an integer.
    """
    daterange = "%20".join((startdate, enddate))
    query = f"http://jsru.kb.nl/sru/sru/{apikey}?operation=SearchRetrieve&query=date%20within%20\"{daterange}\"%20and%20(ppn={ppn})%20and%20(type=artikel)&x-collection=DDD_artikel&recordSchema=dc&maximumRecords={max_records}"
    response = requests.get(query)
    count_node = BeautifulSoup(response.content,"xml").find('srw:numberOfRecords')
    return count_node.text


In [14]:
%%time
# Per decade and title (ppn): total record count, and the count restricted to
# type=artikel. Each entry is [decade, ppn, count-as-string].
decades = list(range(1940, 2000, 10))
rows, na_rows = [], []
for decade_start in decades:
    first_day = f"01-01-{decade_start}"
    last_day = f"31-12-{decade_start+9}"
    for title_ppn in ppis:
        ppn_text = str(title_ppn)
        rows.append([decade_start, title_ppn, get_total_results(apikey, first_day, last_day, ppn_text, "100000")])
        na_rows.append([decade_start, title_ppn, get_total_results_na(apikey, first_day, last_day, ppn_text, "100000")])

CPU times: user 25 s, sys: 863 ms, total: 25.9 s
Wall time: 1min 30s


In [15]:
# Tabulate the collected counts; both frames share the same column layout.
count_columns = ['decade','ppn','total_results']
all_rows_df = pd.DataFrame(rows, columns=count_columns)
na_rows_df = pd.DataFrame(na_rows, columns=count_columns)

In [1]:
# Write both count tables to CSV in the working directory.
# This cell needs all_rows_df and na_rows_df from the cell that builds them;
# running it first in a fresh kernel raises NameError.
for frame, csv_name in (
    (all_rows_df, 'total_number_articles.csv'),
    (na_rows_df, 'na_number_articles.csv'),
):
    frame.to_csv(csv_name)

NameError: name 'all_rows_df' is not defined