# PC Session 1

**Author:**
[Helge Liebert](https://hliebert.github.io/)

# Reading the content of the PDF files as data

### Libraries

In [None]:
library(stringr)
library(readr)

### Get file names, with and without paths

In [None]:
## list.files() expects a regular expression, not a shell glob: anchor on the
## literal ".txt" extension so names that merely contain "txt" are not matched
files <- list.files(path = "txt/", pattern = "\\.txt$", full.names = TRUE)
## bare file names (no directory part), guaranteed to be in the same order
## as `files` because they are derived from it
names <- basename(files)
head(files)

### Read files

In [None]:
## keep only the first 100 characters of each file to save memory
content <- lapply(files, readChar, nchars = 100)

## to read the full text instead, use one of:
## content <- lapply(files, readr::read_file)
## content <- lapply(files, function(f) readChar(f, nchars = file.info(f)$size))

head(content)

### Read as data

In [None]:
## Build the data frame column by column. cbind() on a character vector and a
## list yields a list-matrix, which would turn both columns into list columns;
## flattening `content` first gives plain character columns instead.
data <- data.frame(
  names = names,
  content = vapply(
    content,
    ## a file that yielded no text becomes NA so the row count still matches
    function(x) if (length(x) == 1L) x else NA_character_,
    character(1)
  ),
  stringsAsFactors = FALSE
)
head(data)

### Extract more info from file name

In [None]:
## strip the file extension, then keep everything in front of the first
## " - " separator as the author string
data[["names"]] <- sub("\\.txt$", "", data[["names"]])
data[["author"]] <- sub(" - .*$", "", data[["names"]])
head(data)

In [None]:
## cleaner variant: lazily match up to and including the first " - "
## separator, then drop the separator; names without a separator give NA
## rather than a false positive (compare the first observation)
data[["author"]] <- gsub(" - ", "", str_extract(data[["names"]], "^.*?( - )"))
head(data)

In [None]:
## same approach for the year: pull out a four-digit 19xx/20xx value framed
## by " - " separators, then strip the separators
data[["year"]] <- gsub(
  " - ", "",
  str_extract(data[["names"]], " - (20|19)[0-9][0-9] - ")
)
head(data)

In [None]:
## same approach for the title
## naive attempt: take everything from the first " - " onwards; not good,
## because the title itself may contain a hyphen
data[["title"]] <- str_extract(data[["names"]], " - .*$")
## better: anchor on the " - <year> - " separator and keep what follows it
data[["title"]] <- gsub(
  "^ - (20|19)[0-9][0-9] - ", "",
  str_extract(data[["names"]], " - (20|19)[0-9][0-9] - .*$")
)
head(data)

In [None]:
## strip leading and trailing whitespace from every extracted field
data[c("author", "year", "title")] <- lapply(
  data[c("author", "year", "title")],
  trimws
)
head(data)

# Read csv-converted example file 

### Read file

In [None]:
## strongly prefer fixed over scientific notation when printing numbers
options(scipen = 9999)

## semicolon-separated file; with no header the columns are named V1, V2, V3
example <- read.table("example.csv", sep = ";")

## the third column is expected to contain only NAs; verify before dropping it
stopifnot(is.na(example[["V3"]]))
example[["V3"]] <- NULL

names(example) <- c("id", "ad")
example[["ad"]] <- trimws(example[["ad"]])
head(example)

### Fix character encoding

In [None]:
## declare the ad text as UTF-8; this only sets the encoding mark on the
## strings, the underlying bytes are left untouched
Encoding(example[["ad"]]) <- "UTF-8"
head(example)