# PC Session 1

**Author:**
[Helge Liebert](https://hliebert.github.io/)

# Reading the PDF files' content as data

### Libraries

In [None]:
library("stringr")
library("readr")

### Extract source files

You will need to extract the source files to a folder. Pick the character encoding that fits your operating system.

In [None]:
## use txt-utf-8.zip if you are on MacOS or Linux
## unzip("txt-utf-8.zip")

In [None]:
## use txt-latin-1.zip if you are on Windows
## unzip("txt-latin-1.zip")

### Get file names, with and without paths

In [None]:
## list the text files in "txt/" together with their relative paths;
## `pattern` is a regular expression (not a shell glob), so the dot is
## escaped and the extension is anchored at the end of the file name
files <- list.files(path = "txt/", pattern = "\\.txt$", full.names = TRUE)
head(files)

In [None]:
## bare file names without the directory part; derived from `files` instead
## of scanning the folder a second time, so both vectors always have the
## same length and order
names <- basename(files)
head(names)

### Read files

In [None]:
## load at most the first 5000 characters of each file to keep memory use low
## (readChar counts characters unless useBytes = TRUE)
content <- lapply(files, readChar, nchars = 5000)

## alternatives that load every file completely:
## content <- lapply(files, readr::read_file)
## content <- lapply(files, function(f) readChar(f, nchars = file.info(f)$size))

head(content)

### Read as data

In [None]:
## combine file names and text into a data frame with plain character columns;
## cbind() of a character vector and a list would produce list columns instead.
## Each element of `content` is collapsed to exactly one string, so a file
## that yielded nothing becomes "" rather than breaking the column length.
data <- data.frame(
  names = names,
  content = vapply(
    content, paste0, character(1),
    collapse = "", USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
head(data)

### Extract more info from file name

In [None]:
## drop the ".txt" extension, then take everything in front of the first
## " - " separator as the author
data[["names"]] <- sub("\\.txt$", "", data[["names"]])
data[["author"]] <- sub(" - .*$", "", data[["names"]])
head(data)

In [None]:
## cleaner variant (compare the first observation): lazily match from the
## start up to and including the first " - ", then remove that separator;
## names without a separator yield NA instead of being kept whole
data[["author"]] <- sub(
  " - ", "",
  str_extract(data[["names"]], "^.*?( - )"),
  fixed = TRUE
)
head(data)

In [None]:
## year: a four-digit number starting with 19 or 20 that sits between two
## " - " separators; both separators are removed from the match
data[["year"]] <- gsub(
  " - ", "",
  str_extract(data[["names"]], " - (20|19)[0-9][0-9] - "),
  fixed = TRUE
)
head(data)

In [None]:
## title: everything that follows the " - <year> - " part of the file name
## data$title <- str_extract(data$names, " - .*?$") ## not good, title may contain hyphen
data[["title"]] <- sub(
  "^ - (20|19)[0-9][0-9] - ", "",
  str_extract(data[["names"]], " - (20|19)[0-9][0-9] - .*$")
)
head(data)

In [None]:
## strip leading and trailing whitespace from all extracted fields at once
data[c("author", "year", "title")] <- lapply(
  data[c("author", "year", "title")],
  trimws
)
head(data)

### Filter/clean content

In [None]:
## keep only the rows whose text does not start with "Supplemental"
## (i.e. drop supplementary material)
data <- data[grep("^Supplemental", data[["content"]], invert = TRUE), ]

In [None]:
## inspect the text of the fifth row to see the metadata at its beginning
data[["content"]][5]

In [None]:
## remove the JSTOR metadata page: everything from the start of the text up to
## the first period after " are collaborating with JSTOR to digitize"
data[["content"]] <- gsub(
  "^.* are collaborating with JSTOR to digitize.*?\\.",
  "",
  data[["content"]]
)
data[["content"]][5]

In [None]:
## More
## ...

# Read a single text file and transform it into a data frame

In [None]:
## load the whole file into `jobs` in one go
jobs <- readr::read_file("example-unix.txt")
## jobs <- readr::read_file("example.txt")

In [None]:
## TASK: Create a data frame with ids in one column and job ad text in another
## ...