# R: Data Structures

## 1. Vectors
R's vectors consist of elements of a single (homogeneous) data type and support element-wise calculation. They are similar to NumPy arrays.

### 1.1. Creating vectors

In [1]:
c(1, 3, 6)

In [2]:
0:9

In [3]:
month.name

In [4]:
# homogeneous data type: mixed inputs are coerced to one common type
# (character here, because of 'R')
c(100, 'R', pi, TRUE)

In [5]:
# `:` already returns a vector, so no c() wrapper is needed around a range
set1 <- 1:5
set2 <- 7:9

# c() concatenates the two vectors into a single vector
c(set1, set2)

### 1.2. Vector manipulation

In [6]:
price <- c(23, 35, 18)
quantity <- c(2, 4, 7)

In [7]:
price * quantity

In [8]:
sum(price)

In [9]:
mean(quantity)

In [10]:
min(price)

In [11]:
'data' %in% c('jupyter', 'math', 'data')

In [12]:
sort(month.name, decreasing=TRUE)

In [13]:
nchar(month.name)

In [14]:
length(month.abb)

#### Working with 2 vectors

In [15]:
# set1 deliberately contains duplicates (1 and 2 appear twice)
set1 <- c(1:2, 1:7)
set2 <- 5:9

In [16]:
union(set1, set2)

In [17]:
intersect(set1, set2)

### 1.3. Slicing

#### Index slicing

In [18]:
month.name[5]

In [19]:
month.name[3:5]

In [20]:
# negative index: drop element in this position
month.name[-12]

#### Boolean slicing

In [21]:
x <- 1:9
# x < 5 is a logical vector; indexing with it keeps the elements where it is TRUE
x[x < 5]

In [22]:
x <- 1:9
# %% is the modulo operator, so this keeps the even elements
x[x %% 2 == 0]

### 1.4. Factors
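A factor is a vector for categorical data: its values are restricted to a fixed set of levels, and the order of those levels determines how the factor is sorted.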

In [23]:
# levels= fixes the set of categories and the order they are kept in
x <- c('easy', 'medium', 'easy', 'hard', 'hard', 'easy', 'medium')
x <- factor(x, levels=c('easy', 'medium', 'hard'))

# sorting follows the level order (easy, medium, hard), not alphabetical order
sort(x)

## 2. Matrices

### 2.1. Creating matrices

In [24]:
matrix(1:20, nrow=4, ncol=5)

0,1,2,3,4
1,5,9,13,17
2,6,10,14,18
3,7,11,15,19
4,8,12,16,20


In [25]:
matrix(1:20, nrow=4, ncol=5, byrow=TRUE)

0,1,2,3,4
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
16,17,18,19,20


In [26]:
matrix(1:10, nrow=2, byrow=TRUE)

0,1,2,3,4
1,2,3,4,5
6,7,8,9,10


In [27]:
# ncol is inferred from the 10 values and nrow = 2
x <- matrix(1:10, nrow = 2)
# dim() returns c(number of rows, number of columns)
dim(x)

### 2.2. Matrix slicing
Convention: the vertical axis is the first dimension, the horizontal axis is the second dimension.

In [28]:
# byrow = TRUE fills the matrix row by row instead of the default column by column
x <- matrix(1:20, nrow = 4, ncol = 5, byrow = TRUE)
x

0,1,2,3,4
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
16,17,18,19,20


In [29]:
# element in row 2, column 3
x[2, 3]

In [30]:
# rows 1 and 4, all columns (the column index is left empty)
x[c(1,4),]

0,1,2,3,4
1,2,3,4,5
16,17,18,19,20


In [31]:
# item assignment: overwrite every element of row 2 with 0
# (the single value is recycled across the whole row)
x[2, ] <- 0
x

0,1,2,3,4
1,2,3,4,5
0,0,0,0,0
11,12,13,14,15
16,17,18,19,20


### 2.3. Matrix calculation

#### Addition

In [32]:
# x and y are both 2x3, so `+` adds them element by element
x <- matrix(1:6, nrow=2, ncol=3, byrow=TRUE)
y <- matrix(10:15, nrow=2, ncol=3, byrow=TRUE)

x + y

0,1,2
11,13,15
17,19,21


#### Multiplication

In [33]:
# x is 2x3 and y is 3x4: the inner dimensions match, so the product is 2x4
x <- matrix(1:6, nrow=2, ncol=3, byrow=TRUE)
y <- matrix(1:12, nrow=3, ncol=4, byrow=TRUE)

# %*% is matrix multiplication (plain `*` would multiply element by element)
x %*% y

0,1,2,3
38,44,50,56
83,98,113,128


#### The transpose

In [34]:
# x is 4x2 (nrow is inferred from the 8 values and ncol=2)
x <- matrix(1:8, ncol=2, byrow=TRUE)
# t() swaps rows and columns, giving a 2x4 matrix
t(x)

0,1,2,3
1,3,5,7
2,4,6,8


#### The diagonal

In [35]:
x <- matrix(1:9, ncol=3, byrow=TRUE)
# diag() extracts the main diagonal of the 3x3 matrix: 1, 5, 9
diag(x)

#### Eigenvalues and eigenvectors

In [36]:
x <- matrix(1:9, nrow = 3, byrow = TRUE)
# eigen() returns a list with components $values and $vectors
eig <- eigen(x)
eig

eigen() decomposition
$values
[1]  1.611684e+01 -1.116844e+00 -1.303678e-15

$vectors
           [,1]        [,2]       [,3]
[1,] -0.2319707 -0.78583024  0.4082483
[2,] -0.5253221 -0.08675134 -0.8164966
[3,] -0.8186735  0.61232756  0.4082483


## 3. List
A list is a data structure whose components may have mixed data types. R lists are similar to Python dictionaries.

### 3.1. Creating lists

In [37]:
list('jupyter', 75, TRUE, pi)

In [38]:
name <- 'Hung'
title <- 'Data Analyst'
age <- 25
skills <- c('Python', 'SQL', 'R', 'Tableau', 'Excel')

# inside list(), `=` names each component (it is not an assignment)
list(name = name, title = title, age = age, skills = skills)

### 3.2. List slicing

In [39]:
name <- 'Hung'
title <- 'Data Analyst'
age <- 25
skills <- c('Python', 'SQL', 'R', 'Tableau', 'Excel')

# named list with mixed component types: three scalars and a character vector
me <- list(name = name, title = title, age = age, skills = skills)
me

#### Index slicing

In [40]:
# select a smaller list
me[2]

In [41]:
# select an element
me[[2]]

#### Name slicing

In [42]:
# select a smaller list
me['skills']

In [43]:
# select an element
me[['skills']]

In [44]:
# select an element
me$skills

In [45]:
# item assignment: sets the component named gender
# (it is added to the list if it does not exist yet)
me$gender <- 'male'

In [46]:
me

## 4. Dataframe
A dataframe is a labeled table for storing tabular data; unlike a matrix, its columns may have different data types.

### 4.1. Creating dataframes

In [47]:
# three equal-length numeric vectors, one per column
year <- c(2001, 2002, 2003, 2004, 2005)
price <- c(100, 110, 120, 130, 140)
sales <- c(100000, 105000, 109000, 120000, 125000)

# data.frame() takes the column names from the variable names
report <- data.frame(year, price, sales)
report

year,price,sales
<dbl>,<dbl>,<dbl>
2001,100,100000
2002,110,105000
2003,120,109000
2004,130,120000
2005,140,125000


In [48]:
names(report)

In [49]:
summary(report)

      year          price         sales       
 Min.   :2001   Min.   :100   Min.   :100000  
 1st Qu.:2002   1st Qu.:110   1st Qu.:105000  
 Median :2003   Median :120   Median :109000  
 Mean   :2003   Mean   :120   Mean   :111800  
 3rd Qu.:2004   3rd Qu.:130   3rd Qu.:120000  
 Max.   :2005   Max.   :140   Max.   :125000  

In [50]:
ncol(report)
nrow(report)

### 4.2. Reading files

In [51]:
df <- read.table('../data/mtcars.txt', header=TRUE, sep='\t')
head(df)

Unnamed: 0_level_0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
4,Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
5,Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
6,Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [52]:
# Spell out TRUE: `T` is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word.
df <- read.csv('../data/mtcars.csv', header = TRUE)
head(df)

Unnamed: 0_level_0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
4,Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
5,Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
6,Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [53]:
library(readxl)
df <- read_excel('../data/mtcars.xlsx')
head(df)

"package 'readxl' was built under R version 3.6.3"


model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


#### Saving file
The `save()` function saves R objects, such as a dataframe, to a file. The `.rda` extension or other file types may be used.

In [54]:
save(mtcars, file='../data/mtcars.rda')

In [55]:
load('../data/mtcars.rda')

### 4.3. Dataframe slicing

In [58]:
# row 3, column 1
df[[3,1]]

In [59]:
df[,1]

model
<chr>
Mazda RX4
Mazda RX4 Wag
Datsun 710
Hornet 4 Drive
Hornet Sportabout
Valiant
Duster 360
Merc 240D
Merc 230
Merc 280


In [60]:
df[1,]

model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4


In [61]:
df[1:4,]

model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1


In [62]:
df$model

In [63]:
df[df$cyl == 6,]

model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6
