# Data Frame Manipulation -- R dplyr
Basic data frame manipulation using base R data frames.

In [1]:
library(magrittr)
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



## Load csv

In [2]:
df <- read.csv("../_data/titanic.csv")
head(df, 3)

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
<int>,<int>,<fct>,<fct>,<dbl>,<int>,<int>,<fct>,<dbl>,<fct>,<fct>,<fct>,<int>,<fct>
1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Summary

In [3]:
# Rows
nrow(df)

In [4]:
# Columns
length(df)

In [5]:
# Inspect types
str(df)

'data.frame':	1309 obs. of  14 variables:
 $ pclass   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ survived : int  1 1 0 0 0 1 1 0 1 0 ...
 $ name     : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
 $ sex      : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
 $ age      : num  29 0.917 2 30 25 ...
 $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
 $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
 $ ticket   : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
 $ fare     : num  211 152 152 152 152 ...
 $ cabin    : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
 $ embarked : Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 4 4 2 ...
 $ boat     : Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
 $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
 $ home.dest: Factor w/ 370 levels "","?Havana, Cuba",..: 310 232 232 232 232 238 163 25 23 230 ...


In [6]:
# Descriptive statistics
summary(df)

     pclass         survived                                   name     
 Min.   :1.000   Min.   :0.000   Connolly, Miss. Kate            :   2  
 1st Qu.:2.000   1st Qu.:0.000   Kelly, Mr. James                :   2  
 Median :3.000   Median :0.000   Abbing, Mr. Anthony             :   1  
 Mean   :2.295   Mean   :0.382   Abbott, Master. Eugene Joseph   :   1  
 3rd Qu.:3.000   3rd Qu.:1.000   Abbott, Mr. Rossmore Edward     :   1  
 Max.   :3.000   Max.   :1.000   Abbott, Mrs. Stanton (Rosa Hunt):   1  
                                 (Other)                         :1301  
     sex           age              sibsp            parch      
 female:466   Min.   : 0.1667   Min.   :0.0000   Min.   :0.000  
 male  :843   1st Qu.:21.0000   1st Qu.:0.0000   1st Qu.:0.000  
              Median :28.0000   Median :0.0000   Median :0.000  
              Mean   :29.8811   Mean   :0.4989   Mean   :0.385  
              3rd Qu.:39.0000   3rd Qu.:1.0000   3rd Qu.:0.000  
              Max.   :80.0

## Slice data

In [7]:
# Select rows by number
df[2:4,]

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
Unnamed: 0_level_1,<int>,<int>,<fct>,<fct>,<dbl>,<int>,<int>,<fct>,<dbl>,<fct>,<fct>,<fct>,<int>,<fct>
2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"


In [8]:
# Select rows by condition
df[df[, "Survived"] == 1,] %>% head(3)

ERROR: Error in `[.data.frame`(df, , "Survived"): undefined columns selected


In [None]:
# Select rows by multiple conditions
df[df[, "Survived"] == 1 & df[, "Sex"] == "female",] %>% head(3)

In [None]:
# Select columns by name
df[, c("Survived", "Pclass")] %>% head(3)

## Assign values

In [None]:
# Assign a (new) column
df$sparkles <- 8
head(df, 3)

In [None]:
# Assign value to some rows
df[df[, "Survived"] == 1, "sparkles"] <- 3
head(df, 3)

## Group by

## Reshape

In [None]:
# Gather / melt

In [None]:
# Spread / cast / pivot

## Join

## Creation

In [None]:
df2 <- data.frame(
    x = c(1,2,3)
    , y = c('a', 'b', 'c')
    , z = c(TRUE, FALSE, TRUE)
)
df2