# Loading in Data From Compressed File
I load the data directly from the compressed file and read only the subset of columns I need, which keeps memory use low.

In [1]:
# <INPUT Required> Put path to compressed file here
fpath = "/home/jake/Dropbox/classes/3rd Year/Teaching/ECON210_Fall2017/Data/psid-base.zip"

# Peek at the header and the first two data rows only, so the full set of
# column names (and their count) is known without loading the whole file.
df.temp = read.table(
    file = unz(fpath, "psid-base.csv"),
    header = TRUE,
    sep = ",",
    nrows = 2
)
ncols = ncol(df.temp)      # number of columns in the csv
cols.all = names(df.temp)  # every column name, in file order

# <INPUT Required> Insert desired variables here
# yvars: variables whose column names carry a year suffix (e.g. edu1968)
yvars = c(
    "inc_labor", "edu", "hhchildren", "married", "fam_inc", "works",
    "age", "state", "workexp", "f_edu", "m_edu"
)
# yvars = c("inc_labor", "edu", "hhchildren", "married", "fam_inc", "works", "age", "m_edu", "f_edu", "state", "workexp")
# ivars: variables without a year suffix (keep birthyear and f_id)
ivars = c("birthyear", "f_id", "black", "white", "id")

# Every year from 1968 to 1997, then every second year from 1999 to 2011
years = c(1968:1997, seq(1999, 2011, by = 2))

# Candidate <variable><year> names in year-major order (all variables for the
# first year, then all variables for the next year, ...). Only the names that
# actually exist in the file are kept, after the un-suffixed variables.
candidates = paste0(
    rep(yvars, times = length(years)),
    rep(years, each = length(yvars))
)
cols = c(ivars, candidates[candidates %in% cols.all])

# Report requested columns that are absent from the file.
# match() returns one entry per element of `cols` (NA for a miss), so comparing
# lengths can never detect a missing column; compare the names directly instead.
# The year-suffixed names were already filtered against cols.all when `cols`
# was built, so anything reported here comes from `ivars`.
missing.cols = setdiff(cols, cols.all)
if (length(missing.cols) > 0) {
    print('Some columns not found, check columns')
    print(missing.cols)
}

# colClasses vector for read.table, one entry per column of the file:
# "NULL" makes read.table skip the column, "numeric" reads it as numeric.
cols.arg = rep("NULL", ncols)
cols.arg[cols.all %in% cols] = "numeric"

# Second pass over the file: read every row, but only the columns that
# cols.arg marks as "numeric" (the rest are skipped).
df.yr = read.table(
    file = unz(fpath, "psid-base.csv"),
    header = TRUE,
    sep = ",",
    colClasses = cols.arg
)
# From here on cols.all lists only the columns that were actually loaded
cols.all = names(df.yr)

You can use the dataset above (`df.yr`) as it is. Below, I construct a dataset whose variables are indexed by age instead of by year.

# Converting Variables from by Year to by Age

In [2]:
# Keep only the rows with a non-missing birthyear; the age-based variables
# below are located by comparing birthyear with (year - age).
has.birthyear = complete.cases(df.yr$birthyear)
df.yr.cc = df.yr[has.birthyear, ]

In [3]:
# <INPUT Required> Desired age range
ages = 25:50

# Start from the un-suffixed variables, then add one column per
# (variable, age) pair named <variable>_age<age>.
df.age = df.yr.cc[, ivars]
birth = df.age$birthyear
for (v in yvars[yvars != "age"]) {
    # Years for which this variable has a column in the loaded data
    v.years = years[paste0(v, years) %in% cols.all]
    for (a in ages) {
        avar = paste0(v, "_age", a)
        # NaN unless one of the available years supplies a value below
        df.age[, avar] = NaN
        for (y in v.years) {
            # Rows whose birthyear equals y - a take their value from year y
            at.age = birth == y - a
            df.age[at.age, avar] = df.yr.cc[at.age, paste0(v, y)]
        }
    }
}

# Reshaping Age and Year Datasets into Panel Datasets
## Panel Data set by Age

In [4]:
library(data.table)

# Reshape wide -> long: one row per (person, age), one value column per variable.
# Each pattern "^<variable>_age" selects that variable's age columns; the k-th
# matched column of every variable ends up in the row with age index k.
age.vars = yvars[yvars != "age"]
age.patterns = sprintf("^%s_age", age.vars)
df.age.panel = melt(
    as.data.table(df.age),
    measure.vars = patterns(age.patterns),
    value.name = age.vars,
    variable.name = "age"
)
# Back to a plain data.frame, then relabel the index 1, 2, ... with the ages.
# NOTE(review): this relies on df.age holding each variable's columns in the
# same order as `ages`.
df.age.panel = as.data.frame(df.age.panel)
df.age.panel$age = factor(df.age.panel$age, labels = ages)

In [5]:
# Display the first 10 rows of the age panel that have no missing value
# in any of the yvars columns
age.complete = complete.cases(df.age.panel[, yvars])
df.age.panel[age.complete, ][1:10, ]

Unnamed: 0,birthyear,f_id,black,white,id,age,inc_labor,edu,hhchildren,married,fam_inc,works,state,workexp,f_edu,m_edu
1,1970,1335001,0,1,1335030,25,41340.0,12,0,1,41652.0,1,1,1,9,11
2,1955,1335001,0,1,1335006,25,46240.0,12,1,1,46240.0,1,24,2,10,11
5,1962,1335001,0,1,1335010,25,42000.0,11,2,1,58170.0,1,24,4,9,11
6,1958,1335001,0,1,1335008,25,11950.0,12,1,1,22705.0,1,24,3,10,11
7,1959,1335001,0,1,1335009,25,27571.6,12,2,1,29380.7,1,24,1,10,11
10,1964,1335001,0,1,1335011,25,8640.0,14,0,0,8640.0,1,21,4,9,11
11,1967,1411001,1,0,1411013,25,27582.5,16,2,0,32908.6,1,29,5,11,11
12,1966,1411001,1,0,1411012,25,24500.0,12,0,0,24500.0,0,29,6,11,11
15,1954,1411001,1,0,1411005,25,30176.0,14,0,1,55104.0,1,29,5,12,11
16,1962,1411001,1,0,1411010,25,37590.0,13,1,1,85940.4,1,29,7,11,11


## Panel Data Set by Year

In [6]:
# Build a long panel with one row per (person, year) and one value column per
# variable in yvars.
df.yr.full = df.yr

# A variable with no column for some year gets a placeholder NaN column, so
# that every variable has exactly one column per entry of `years`.
for (y in years) {
    for (v in yvars) {
        ycol = paste0(v, y)
        if (!(ycol %in% colnames(df.yr.full))) {
            df.yr.full[, ycol] = NaN
        }
    }
}

# Name the measure columns explicitly, in the order of `years`, one character
# vector per variable. The placeholder columns above are appended at the END of
# the data frame; selecting columns by regex (patterns("^<variable>")) takes
# them in data-frame order, which would shift later years into earlier slots
# and make the year labels below wrong for any variable with a missing year.
# Explicit names also avoid a prefix pattern picking up an unrelated column.
year.cols = lapply(yvars, function(v) paste0(v, years))

df.yr.panel = melt(as.data.table(df.yr.full),
     measure.vars = year.cols,
     value.name = yvars, variable.name = "year")
# melt numbers the measure columns 1, 2, ...; relabel that index with the years
df.yr.panel$year = factor(df.yr.panel$year, labels=years)
df.yr.panel = as.data.frame(df.yr.panel)

In [7]:
# Display the first 10 rows of the year panel that have no missing value
# in any of the yvars columns
yr.complete = complete.cases(df.yr.panel[, yvars])
df.yr.panel[yr.complete, ][1:10, ]

Unnamed: 0,id,birthyear,f_id,black,white,year,inc_labor,edu,hhchildren,married,fam_inc,works,age,state,workexp,f_edu,m_edu
3959,2808001,1923,897001,0,1,1968,51300,12,0,1,80712.0,1,44,34,32,6,8
35080,1342001,1924,1343001,0,1,1968,47880,10,1,1,48222.0,1,44,36,31,5,8
35776,1694001,1934,1694002,0,1,1968,34884,12,0,0,57264.48,1,33,2,22,8,8
39201,2392001,1941,2391001,0,1,1968,23940,8,4,1,30711.6,1,26,45,13,5,7
41435,6652001,1944,6651001,0,1,1968,44460,15,0,0,44460.0,1,24,21,12,14,12
41919,1210001,1928,1212001,0,1,1968,82080,10,2,1,82080.0,1,39,31,25,12,12
42684,57001,1933,56001,0,1,1968,42408,11,3,1,64296.0,1,35,39,23,3,3
69163,1485001,1943,1046001,0,1,1968,47880,12,2,1,51833.52,1,24,21,13,7,12
73144,2019001,1935,2020001,0,1,1968,68400,12,4,1,68400.0,1,33,6,21,12,16
75223,242003,1945,242001,0,1,1969,27907,12,0,1,34397.0,1,24,41,12,8,8
