# Introduction to R

## Basic arithmetic

In [None]:
# Plain arithmetic on numeric literals; the value of the last expression is displayed.
1 + 3.14159

In [None]:
# `pi` is a built-in constant.
pi / 2

In [None]:
# exp(x) is the exponential function, so exp(1) is Euler's number e.
exp(1)

`exp(1)` evaluates $e^1 = e \approx 2.71828$.

In [None]:
# `1i` is the imaginary unit, so this evaluates e^i and returns a complex number.
exp(1i)

In [None]:
# A comparison returns a logical value (TRUE or FALSE).
exp(1) < pi

## Vectors

In [None]:
# c() combines its arguments into a vector.
c(1, 2, 3, 4)

In [None]:
# Assign the value 3 to the name `aa` using `=`.
aa = 3

In [None]:
# Evaluating a name on its own displays its value.
aa

In [None]:
# `<-` is the other assignment operator; here it has the same effect as `aa = 3`.
aa <- 3

In [None]:
# Define a numeric vector to work with.
xx = c(2, 3, 5, 7)

In [None]:
# Arithmetic with a single number is applied to every element.
xx + 1

In [None]:
2*xx

In [None]:
xx^2

In [None]:
# A second vector with the same length as xx.
yy = c(1, 2, 3, 4)

In [None]:
# Arithmetic between two vectors works element by element.
xx * yy

In [None]:
mean(xx)

In [None]:
# Sum of squared deviations of xx from 1.
sum((xx - 1)^2)

In [None]:
# Sum of absolute deviations of xx from 1.
sum(abs(xx - 1))

In [None]:
# Comparisons are element-wise too and give a logical vector.
xx < 5

In [None]:
xx

In [None]:
# Indexing starts at 1: this is the first element.
xx[1]

In [None]:
# Index with a vector of positions to pick several elements.
xx[c(2, 3)]

In [None]:
# Index with a logical vector: elements where the index is TRUE are kept.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
xx[c(TRUE, TRUE, FALSE, FALSE)]

In [None]:
# A comparison produces such a logical vector, so this keeps the elements below 5.
xx[xx < 5]

## Plotting (1)

In [None]:
# Open the help page for plot().
?plot

In [None]:
# With a single vector, the values are plotted against their position (1, 2, 3, ...).
plot(xx)

In [None]:
plot(c(5, 7, 1, 9, 9, 3))

In [None]:
# `a:b` builds the integer sequence from a to b.
1:4

In [None]:
# With two vectors, the first gives the x coordinates and the second the y coordinates.
plot(c(2, 4, 6, 8), xx)

In [None]:
plot(xx, xx)

In [None]:
# xlab / ylab set the axis labels.
plot(xx, xlab="My x-axis", ylab="My y-axis")

In [None]:
# type='l' draws a line instead of points.
plot(xx, type='l')

In [None]:
# Open the help page for points(), which documents the plotting symbols (pch).
?points

In [None]:
# pch selects the plotting symbol and cex scales its size.
# NOTE(review): cex=50 makes the symbols extremely large; confirm this is intentional.
plot(xx, pch=10, cex=50)

In [None]:
# Total absolute deviation sum(|xx - m|) evaluated on a grid of candidate centres m.
# The grid is built once and reused for the plot, and the result gets its own name
# so the earlier 4-element vector `yy` is not overwritten by a 100-element one.
mm_grid = seq(0, 10, length.out=100)
abs_dev = sapply(mm_grid, function(mm) sum(abs(xx - mm)))

In [None]:
# Plot the deviation curve against the candidate centres.
plot(mm_grid, abs_dev, xlab="mm", ylab="sum(abs(xx - mm))")

# Air Passenger Data

In [None]:
# Load the built-in AirPassengers dataset into the workspace.
data(AirPassengers)

In [None]:
# Open the dataset's help page.
?AirPassengers

In [None]:
# Plot the series.
plot(AirPassengers)

In [None]:
# Histogram of the differences between consecutive observations.
hist(diff(AirPassengers))

In [None]:
# Number of observations in the series.
length(AirPassengers)

In [None]:
# Average month-over-month growth factor: each observation divided by the one before it.
# Negative indices drop the first / last element, so nothing depends on the series
# having exactly 144 observations.
mean(AirPassengers[-1] / AirPassengers[-length(AirPassengers)])

In [None]:
# Overlay a constant-growth curve on the observed series.
# The average month-over-month growth factor is computed here rather than pasted in
# from earlier cell output; raising it to 12 * k gives the growth after k years.
monthly_growth = mean(AirPassengers[-1] / AirPassengers[-length(AirPassengers)])
plot(AirPassengers)
# The curve starts from a level of 100 in 1949 (the baseline used by the original cell).
lines(1949:1961, 100 * monthly_growth^(12 * (0:12)))

Annual peaks

In [None]:
# Display the full series.
AirPassengers

In [None]:
# time() returns the time stamp of every observation.
# NOTE(review): `times` is not referenced by any later cell shown here, which calls
# time(AirPassengers) again instead.
times = time(AirPassengers)

In [None]:
# Annual maximum and minimum of the monthly series.
# aggregate() on a time series collapses each year's monthly values with FUN, which
# avoids growing vectors inside a loop and comparing fractional time stamps against
# year boundaries. as.numeric() drops the time-series attributes so the results are
# plain numeric vectors, one element per year from 1949 to 1960.
annmax = as.numeric(aggregate(AirPassengers, nfrequency=1, FUN=max))
annmin = as.numeric(aggregate(AirPassengers, nfrequency=1, FUN=min))

In [None]:
# Annual maxima, one value per year.
annmax

In [None]:
# Annual minima, one value per year.
annmin

In [None]:
length(annmax)

In [None]:
# Average year-over-year growth factor of the annual maxima
# (each year's maximum divided by the previous year's).
mean(annmax[2:12] / annmax[1:11])

In [None]:
# The same average growth factor for the annual minima.
mean(annmin[2:12] / annmin[1:11])

In [None]:
# Overlay exponential growth curves through the first-year peak and trough.
# The starting levels and the average year-over-year growth factors are computed from
# annmax / annmin instead of being pasted in from earlier cell output.
max_growth = mean(annmax[-1] / annmax[-length(annmax)])
min_growth = mean(annmin[-1] / annmin[-length(annmin)])
plot(AirPassengers)
lines(1949:1961, annmax[1] * max_growth^(0:12))
lines(1949:1961, annmin[1] * min_growth^(0:12))

In [None]:
# Extend the same constant-growth curves out to 2025.
# Starting levels and growth factors come from annmax / annmin rather than from
# numbers copied out of earlier cell output.
max_growth = mean(annmax[-1] / annmax[-length(annmax)])
min_growth = mean(annmin[-1] / annmin[-length(annmin)])
proj_years = 1949:2025
years_ahead = proj_years - proj_years[1]
plot(proj_years, annmax[1] * max_growth^years_ahead,
     xlab="Year", ylab="Projected annual max / min")
lines(proj_years, annmin[1] * min_growth^years_ahead)

In [None]:
# Plot the series on a log scale.
plot(log(AirPassengers))

In [None]:
# Put the series into a data frame: `month` holds the time stamp of each observation
# and `pass` the observed value.
# NOTE(review): the name `df` is reused for unrelated data frames later in the notebook.
df = data.frame(month=time(AirPassengers), pass=AirPassengers)

In [None]:
head(df)

In [None]:
# Linear regression of log(pass) on time.
summary(lm(log(pass) ~ month, data=df))

In [None]:
# Lag-1 passengers: drop the last observation and pad the front with one NA.
df$pass1 = c(NA, head(as.numeric(df$pass), -1))

In [None]:
# Lag-2 passengers: drop the last two observations and pad the front with two NAs.
df$pass2 = c(rep(NA, 2), head(as.numeric(df$pass), -2))

In [None]:
# Regress log(pass) on time plus the logs of the one- and two-step lagged values.
summary(lm(log(pass) ~ month + log(pass1) + log(pass2), data=df))

In [None]:
# Fit the same model again, this time keeping the fitted object for prediction.
mod = lm(log(pass) ~ month + log(pass1) + log(pass2), data=df)
# Predictions are on the log scale.
# NOTE(review): presumably NA for the first two rows, where the lags are NA; confirm.
df$pred = predict(mod, df)

In [None]:
# Observed series with the back-transformed predictions as points.
plot(AirPassengers)
points(time(AirPassengers), exp(df$pred))

## Data frames

In [None]:
# A small example data frame with two numeric columns.
df = data.frame(toc=c(3, 5, 7), salinity=c(5, 6, 7))

In [None]:
# row.names=FALSE keeps the automatic row numbers out of the file; otherwise they are
# written as an extra first column that read.csv() brings back as a column named `X`.
write.csv(df, "myfile.csv", row.names=FALSE)

In [None]:
# Read the file back in and display the result.
read.csv("myfile.csv")

In [None]:
# `$` extracts one column as a vector.
df$toc

In [None]:
# Element-wise comparison of the column against a vector.
df$toc == c(3, 5, 7)

In [None]:
# Third element of the column.
df$toc[3]

In [None]:
# Salinity values for the rows where toc is below 7.
df$salinity[df$toc < 7]

## Plotting (2)

In [None]:
# Install tidyverse only when it is missing, so re-running the notebook does not
# download and reinstall it every time.
if (!requireNamespace("tidyverse", quietly=TRUE)) {
    install.packages("tidyverse")
}

In [None]:
# Attach the tidyverse packages; the cells below use ggplot() and related functions from it.
library(tidyverse)

We're going to use a lot of `ggplot2`.

Reference sheet (PDF): <https://nyu-cdsc.github.io/learningr/assets/data-visualization-2.1.pdf>

In [None]:
# Show the whole (small) data frame.
df

In [None]:
# Layers are drawn in the order they are added, so the bars go first and the line and
# points are drawn on top of them instead of being hidden underneath.
ggplot(df, aes(x=toc, y=salinity)) + geom_col() + geom_line() + geom_point() +
  xlab("My great x-axis") + theme_bw()

# Discoveries data

In [None]:
# Load the built-in discoveries dataset into the workspace.
data(discoveries)

In [None]:
# Open the dataset's help page.
?discoveries

In [None]:
# Display the full series.
discoveries

In [None]:
# Plot the series.
plot(discoveries)

In [None]:
# One row per year from 1860 to 1959 with that year's count.
# NOTE(review): this replaces the earlier `df`; cells above that use `df` will see
# different data if they are re-run after this point.
df = data.frame(year=1860:1959, count=discoveries)

In [None]:
df

In [None]:
# Average count per year.
mean(df$count)

In [None]:
# Average year-to-year change in the count.
mean(diff(df$count))

In [None]:
# Product of each year's count with the previous year's count, plotted against index.
# The parentheses matter: `1:nrow(df)-1` parses as `(1:nrow(df)) - 1`, i.e. 0:99, which
# only selects the intended elements because R silently ignores a zero index.
plot(df$count[2:nrow(df)] * df$count[1:(nrow(df) - 1)])

In [None]:
# Distribution of the yearly counts.
hist(df$count)

In [None]:
# Distribution of the year-to-year changes.
hist(diff(df$count))

In [None]:
# Distribution of log(count).
# log(0) is -Inf and hist() discards non-finite values, so any year with a count of 0
# would silently vanish from the histogram. Dropping those years explicitly makes the
# exclusion visible and says so in the title.
positive_counts = df$count[df$count > 0]
hist(log(positive_counts),
     main="Histogram of log(count), zero-count years excluded",
     xlab="log(count)")

In [None]:
# Distribution of the year-to-year change in log(count), with about 30 bins.
# A zero count makes log() return -Inf, so the differences next to it are Inf, -Inf or
# NaN. Keep only the finite changes explicitly instead of relying on hist() to drop them.
log_changes = diff(log(df$count))
hist(log_changes[is.finite(log_changes)], 30,
     main="Histogram of diff(log(count)), finite values only",
     xlab="diff(log(count))")

In [None]:
# Scatter plot of count against year.
ggplot(df, aes(year, count)) + geom_point()

In [None]:
# Open the help page for geom_smooth().
?geom_smooth

In [None]:
# Same scatter plot with a smoothed trend added (default settings).
ggplot(df, aes(year, count)) + geom_point() + geom_smooth()

In [None]:
# One-row data frame holding the overall mean count.
sumdf = data.frame(count=mean(df$count))

In [None]:
# Draw the overall mean as a horizontal line over the scatter plot.
ggplot(df, aes(year, count)) + geom_point() + geom_hline(data=sumdf, aes(yintercept=count))

But the discovery rate actually seems to change over time. Let's compute decadal averages!

In [None]:
# First year of each decade.
seq(1860, 1959, by=10)

In [None]:
# Logical vector marking the rows that fall in the 1860s.
df$year >= 1860 & df$year < 1870

In [None]:
# Use that logical vector to pull out the counts for the 1860s.
df$count[df$year >= 1860 & df$year < 1870]

In [None]:
# Mean count per decade: one row per decade with its first year and its mean.
# The means are computed in one pass and the data frame is built once, rather than
# growing it with rbind() on every loop iteration.
decade_starts = seq(1860, 1959, by=10)
sumdf = data.frame(
    decade=decade_starts,
    count=sapply(decade_starts, function(decade) mean(df$count[df$year >= decade & df$year < decade + 10]))
)

In [None]:
# Yearly counts as points, with the decadal means as translucent bars.
# Bars are placed at decade + 5 so they sit in the middle of their decade.
ggplot(data=df, mapping=aes(x=year, y=count)) +
  geom_point() +
  geom_col(data=sumdf, mapping=aes(x=decade + 5, y=count), alpha=0.5)

Rolling 10-year windows

In [None]:
# Rolling 10-year mean: for every start year, average the counts of that year and the
# following nine. The last start year is 1950, whose window ends with the final year
# of data (1959). The data frame is built once instead of being grown with rbind().
window_starts = 1860:1950
sumdf = data.frame(
    year1=window_starts,
    count=sapply(window_starts, function(year1) mean(df$count[df$year >= year1 & df$year < year1 + 10]))
)

In [None]:
# First few rows of the rolling summary.
head(sumdf)

In [None]:
# Yearly counts as points, with the rolling mean as a line.
# The line is shifted by 5 years so each value sits near the middle of its 10-year window.
ggplot(df, aes(year, count)) + geom_point() + geom_line(data=sumdf, aes(x=year1 + 5, count))

In [None]:
# Rolling 20-year mean: for every start year, average the counts of that year and the
# following nineteen. The last start year is 1940, whose window ends with the final
# year of data (1959). The data frame is built once instead of being grown with rbind().
window_starts = 1860:1940
sumdf = data.frame(
    year1=window_starts,
    count=sapply(window_starts, function(year1) mean(df$count[df$year >= year1 & df$year < year1 + 20]))
)

In [None]:
# Rolling 20-year median over the same windows as the 20-year mean: start years
# 1860 to 1940, each covering that year and the following nineteen.
# The data frame is built once instead of being grown with rbind().
window_starts = 1860:1940
sumdf2 = data.frame(
    year1=window_starts,
    count=sapply(window_starts, function(year1) median(df$count[df$year >= year1 & df$year < year1 + 20]))
)

In [None]:
# Yearly counts with the rolling 20-year mean (sumdf) and median (sumdf2) as lines.
# Both summaries use 20-year windows starting at year1, so both lines are shifted by
# 10 years to sit at the window centre. A + 5 shift on the median line would misalign
# it with the mean by five years.
ggplot(df, aes(year, count)) + geom_point() +
  geom_line(data=sumdf, aes(x=year1 + 10, y=count)) +
  geom_line(data=sumdf2, aes(x=year1 + 10, y=count))