In [1]:
# This script is designed specifically for use in the JupyterHub environment; instructions for accessing it are available in the Digilab environment: https://digilab.rara.ee/en/tools/access-to-dea-texts/#uagb-tabs__tab1. With modifications, the script can also be used to create bigrams from locally downloaded material.
# The scripts are based on the scripts of the previous project, available here: https://doi.org/10.17605/OSF.IO/EBW24.

In [2]:
suppressPackageStartupMessages(library(tidyverse,lib.loc="/gpfs/space/projects/digar_txt/R/4.3/"))
suppressPackageStartupMessages(library(tidytext,lib.loc="/gpfs/space/projects/digar_txt/R/4.3/"))

In [3]:
suppressPackageStartupMessages(library(digar.txts,lib.loc="/gpfs/space/projects/digar_txt/R/4.3/"))

In [4]:
library(tidyverse)
library(tidytext)

In [5]:
# Load the issue-level metadata table. Later cells filter it on the columns
# DocumentType, year and keyid and join on DocumentID.
# NOTE(review): get_digar_overview() presumably comes from digar.txts - confirm.
all_issues <- get_digar_overview()

[1] "Issue metadata read"


In [6]:
# Read the stop-word list (one element per line of the file) and wrap it in a
# one-column data frame; the `word` column is used later to drop bigrams that
# contain a stop word.
stopwords <- readLines(con = "estonian-stopwords-lemmas.txt")
stopsonad <- data.frame(
  word = stopwords,
  stringsAsFactors = FALSE
)

In [7]:
# Keep only NEWSPAPER issues dated 1944-1991 (inclusive) whose keyid is one of
# the five listed titles. Conditions passed to a single filter() are combined
# with AND, which matches applying them one after another.
exile_newspapers <- all_issues %>%
  filter(
    DocumentType == "NEWSPAPER",
    year >= 1944 & year <= 1991,
    keyid %in% c(
      "estdagbladet",
      "eestiteatajastock",
      "stockholmstid",
      "teatajapoliit",
      "valiseesti"
    )
  )

In [8]:
# Alternative search without lemmas (kept for reference, not run):
# searchterm <- "uus kodu"
# searchfile <- "kodu.txt"
# do_subset_search(searchterm = searchterm, searchfile = searchfile, exile_newspapers, source = "pages")

# Lemma-based search for "kodu" on the page-level texts of the selected
# newspapers.
# NOTE(review): the results are presumably written to `searchfile`, which the
# next cell reads back - confirm against the digar.txts documentation.
searchterm <- "kodu"
searchfile <- "kodu.txt"
do_subset_search(
  searchterm = searchterm,
  searchfile = searchfile,
  exile_newspapers,
  searchtype = "lemmas",
  source = "pages"
)

In [9]:
# Read "kodu.txt" as a headerless, tab-separated table with quoting disabled,
# keeping the first column as `id` and the second as `lemmas`.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
texts <- fread("kodu.txt", header = FALSE, sep = "\t", quote = "")
texts <- texts[, .(id = V1, lemmas = V2)]

# DocumentID is the first run of non-"." characters in `id`; it is the key
# used to join with the issue metadata below.
texts[, DocumentID := str_extract(id, "[^\\.]+")]

# Attach issue-level metadata (title id, date parts and zip paths) to every
# text id; ids without a matching issue keep NA metadata (left join).
fileids_w_meta <- texts %>%
  select(id, DocumentID) %>%
  left_join(
    exile_newspapers %>%
      select(keyid, year, month, DocumentID, zippath_sections, zippath_pages),
    by = "DocumentID"
  )

In [10]:
# Build concordances for `searchterm` from the `lemmas` column of `texts`,
# identified by the `id` column. Despite the object's name, later cells use it
# as a concordance table (its `context` column is tokenised into bigrams).
# NOTE(review): before/after = 100 is the context window on each side; the
# unit (characters vs. words) is not visible here - confirm.
kodu_bigrams <- get_concordances(
  searchterm = searchterm,
  texts = texts,
  before = 100,
  after = 100,
  txt = "lemmas",
  id = "id"
)

In [11]:
# Drop the bookkeeping columns and keep the rest of the concordance table.
# NOTE(review): `[, !c(...)]` with a character vector is data.table syntax, so
# this assumes kodu_bigrams is a data.table - confirm.
kodu_concs <- kodu_bigrams[, !c("id", "nr", "begin", "end")]

# Save the concordances as a tab-separated file without quoting or row names.
# `FALSE` is spelled out because `F` is a reassignable variable, not a
# reserved word.
write.table(
  kodu_concs,
  file = "kodu_concs.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [12]:
# Replace every literal "|" in the concordance text with "_".
# NOTE(review): presumably this keeps forms joined by "|" together as a single
# token in the bigram step - confirm.
kodu_concs2 <- str_replace_all(kodu_concs$context, "\\|", "_")

# Wrap the cleaned text in a one-column data frame named `context`.
# The former `as.data.frame(..., header = TRUE)` passed `header`, which is not
# an argument of as.data.frame() and was silently ignored; building the data
# frame directly gives the same result without the stray argument.
kodu_concs3 <- data.frame(context = kodu_concs2, stringsAsFactors = FALSE)

In [13]:
# Tokenise each context into word bigrams (n = n_min = 2), split every bigram
# into its two words, and keep only the pairs in which at least one word
# contains "kodu".
bigrams_separated <- unnest_tokens(
  kodu_concs3,
  bigram,
  context,
  token = "ngrams",
  n = 2,
  n_min = 2
) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    grepl("kodu", word1) | grepl("kodu", word2)
  )

In [15]:
# Discard every bigram in which either word is a stop word, i.e. keep only the
# pairs where neither word1 nor word2 appears in stopsonad$word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !(word1 %in% stopsonad$word | word2 %in% stopsonad$word)
  )

In [16]:
# Frequency of every (word1, word2) pair, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# Most frequent bigrams: the leading floor(0.0001 * number of rows) rows after
# sorting by descending count.
# NOTE(review): 0.0001 selects 0.01 % of the rows, not the 1 % the variable
# name suggests (that would be 0.01) - confirm which one is intended. With
# fewer than 10,000 distinct bigrams this yields zero rows.
top_1_percent <- slice_head(
  arrange(bigram_counts, desc(n)),
  n = floor(0.0001 * nrow(bigram_counts))
)

In [20]:
# Show the ten most frequent bigrams.
head(bigram_counts, n = 10)

Unnamed: 0_level_0,word1,word2,n
Unnamed: 0_level_1,<chr>,<chr>,<int>
1,eesti,kodu,5943
2,kodumaa,lahkumine,1718
3,okupeerima_okupeeritud,kodumaa,1169
4,uus,kodu,938
5,suvikodu,kuratoorium,873
6,laps,suvekodu,774
7,vaba,kodumaa,760
8,kodu,ruum,717
9,kodu,eesti,715
10,kodumaa,vabastamine,672
