* This is a simple R script to make converting your columns a bit easier

In [12]:
library(dplyr)
library(ggplot2)
library(tidyr)

In [2]:
# Load the survey data. The first two lines of the file are skipped, so this
# read yields data rows only, with placeholder column names
# (presumably line 1 is the header and line 2 a second header/description
# row -- confirm against the file).
nych17 <- read.csv("../nycHousing2017.csv", skip = 2, header = FALSE)

# Read the file again with header = TRUE purely to obtain the column names
# from its first line, and apply them to the data-only frame. The second
# frame is never bound to a name, so there is nothing to clean up afterwards.
names(nych17) <- names(read.csv("../nycHousing2017.csv", header = TRUE))

head(nych17)

recordtype,borough,X_d12,X_d3,X_d4,X_d5,X_d6,X_e1,X_e2,X_e3,⋯,FW77,FW78,FW79,FW80,IL30PER,IL50PER,IL80PER,year,sba,geo_id2
1,1,9,9,9,1,9,9,9,9,⋯,11446240,32161256,9745805,13739972,21800,36250,58000,2017,101,3603710
1,1,9,9,9,1,9,9,9,9,⋯,19594332,8525261,36330660,28940264,27200,45300,72500,2017,101,3603710
1,1,9,9,9,1,9,9,9,9,⋯,19594332,8525261,36330660,28940264,24500,40800,65250,2017,101,3603710
1,1,9,9,9,1,9,9,9,9,⋯,57167296,43401080,10664017,66729560,21800,36250,58000,2017,101,3603710
1,1,9,9,9,1,9,9,9,9,⋯,70148672,42706528,10137630,64932696,21800,36250,58000,2017,101,3603710
1,1,9,9,9,1,9,9,9,9,⋯,37264384,20239716,6243940,29680360,21800,36250,58000,2017,101,3603710


In [8]:
# Lookup tables ("codebooks"): each pairs the numeric codes of one column
# with a human-readable label. The first column carries the name of the coded
# column; the second carries the name wanted for the labelled column.

# Borough code -> borough name.
codebook1 <- data.frame(
  borough = c(1, 2, 3, 4, 5),
  boroughName = c(
    "Bronx",
    "Brooklyn",
    "Manhattan",
    "Queens",
    "Staten Island"
  )
)

# X_32a code -> whether a heating breakdown was reported.
codebook2 <- data.frame(
  X_32a = c(0, 1, 8),
  heating_breakdown = c("Yes", "No", "Not Reported")
)

# X_32b code -> number of heating breakdowns (kept as text because of the
# "4+" bucket and the two non-numeric categories).
codebook3 <- data.frame(
  X_32b = c(2, 3, 4, 5, 8, 9),
  num_heat_breakdowns = c(
    "1",
    "2",
    "3",
    "4+",
    "Not Reported",
    "No Breakdowns"
  )
)

In [11]:
# The following is a function to change numerical values to appropriate
# categorical (named) values.
#
# The inputs are as follows:
# orig_df - This is the unaltered dataframe from your .csv import (df object)
# codebook_df - This is the dataframe that represents your 'dictionary' (df object)
# orig_name - the original name of the column (string)
# new_name - a new (meaningful) name you would like for the column
# 
# Note that orig_name and new_name have to match the column names specified in your
# codebook dataframe object

# Translate a coded column of `orig_df` into its labels and keep `year`
# alongside it.
#
# Arguments:
#   orig_df     - data frame holding the coded column `orig_name` and a
#                 column called `year`
#   codebook_df - data frame with one row per code: column `orig_name` holds
#                 the codes, column `new_name` holds the labels
#   orig_name   - name of the coded column (string); must exist in both
#                 `orig_df` and `codebook_df`
#   new_name    - name of the label column in `codebook_df` (string); also
#                 the name of the label column in the result
#
# Returns a two-column data frame (`new_name`, `year`) with exactly one row
# per row of `orig_df`, in the same order. Codes that do not appear in the
# codebook are labelled NA.
rf_func_year <- function(orig_df,
                         codebook_df,
                         orig_name,
                         new_name) {
  stopifnot(
    is.data.frame(orig_df),
    is.data.frame(codebook_df),
    all(c(orig_name, "year") %in% names(orig_df)),
    all(c(orig_name, new_name) %in% names(codebook_df))
  )

  # Look every code up in the codebook rather than joining with the codebook
  # on the left: a codebook-first join returns rows in codebook order and
  # drops rows whose code is not listed, so the result can no longer be
  # lined up row-for-row with `orig_df` or with other converted columns.
  code_pos <- match(orig_df[[orig_name]], codebook_df[[orig_name]])

  out <- data.frame(
    codebook_df[[new_name]][code_pos],
    orig_df[["year"]],
    stringsAsFactors = FALSE
  )
  names(out) <- c(new_name, "year")
  out
}

# Translate a coded column of `orig_df` into its labels using a codebook.
#
# Arguments:
#   orig_df     - data frame holding the coded column `orig_name`
#   codebook_df - data frame with one row per code: column `orig_name` holds
#                 the codes, column `new_name` holds the labels
#   orig_name   - name of the coded column (string); must exist in both
#                 `orig_df` and `codebook_df`
#   new_name    - name of the label column in `codebook_df` (string); also
#                 the name of the single column in the result
#
# Returns a one-column data frame named `new_name` with exactly one row per
# row of `orig_df`, in the same order. Codes that do not appear in the
# codebook are labelled NA.
rf_func <- function(orig_df,
                    codebook_df,
                    orig_name,
                    new_name) {
  stopifnot(
    is.data.frame(orig_df),
    is.data.frame(codebook_df),
    orig_name %in% names(orig_df),
    all(c(orig_name, new_name) %in% names(codebook_df))
  )

  # Look every code up in the codebook rather than joining with the codebook
  # on the left: a codebook-first join returns rows in codebook order and
  # drops rows whose code is not listed, so the result can no longer be
  # lined up row-for-row with `orig_df` or with other converted columns.
  code_pos <- match(orig_df[[orig_name]], codebook_df[[orig_name]])

  out <- data.frame(
    codebook_df[[new_name]][code_pos],
    stringsAsFactors = FALSE
  )
  names(out) <- new_name
  out
}



# Convert three coded columns of nych17 to their labels, one codebook each.
# Only the last call uses the *_year variant, so `year` appears once in the
# combined result.
df1 <- rf_func(
  orig_df = nych17,
  codebook_df = codebook1,
  orig_name = "borough",
  new_name = "boroughName"
)
df2 <- rf_func(
  orig_df = nych17,
  codebook_df = codebook2,
  orig_name = "X_32a",
  new_name = "heating_breakdown"
)
df3 <- rf_func_year(
  orig_df = nych17,
  codebook_df = codebook3,
  orig_name = "X_32b",
  new_name = "num_heat_breakdowns"
)

# Place the converted columns side by side.
# NOTE(review): cbind pairs rows purely by position, so the combined frame is
# only meaningful if all three results list the records in the same order --
# confirm against the helper functions.
df <- cbind(df1, df2, df3)

head(df)

boroughName,heating_breakdown,num_heat_breakdowns,year
Bronx,Yes,1,2017
Bronx,Yes,1,2017
Bronx,Yes,1,2017
Bronx,Yes,1,2017
Bronx,Yes,1,2017
Bronx,Yes,1,2017
