/
business-logic.R
182 lines (164 loc) · 6.83 KB
/
business-logic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
library(gdata)
library(caret)
library(rjson)
# Business logic for the application that investigates how poverty
# can be explained by factors outside human control
# read.data
# return
# data.frame with one row per country and with the following columns:
# Country.Code: ISO 3166-1 alpha-3 (cca3) country code
# Region: Country continent
# Avg.Temperature: Average annual temperature
# GDP: mean GDP over the years provided in the data
# Poverty: mean poverty rate over the years provided in the data
# Percent.Natural.Resources: mean natural resource rent as a percentage over the years provided in the data
# Natural.Resources: Percent.Natural.Resources multiplied by GDP
# Landlocked: whether the country is land-locked
# Data references
# GDP per capita, Poverty index:
# http://databank.worldbank.org/data/reports.aspx?Code=NY.GDP.PCAP.CD&id=af3ce82b&report_name=Popular_indicators&populartype=series&ispopular=y#
# Temperature data:
# http://data.worldbank.org/data-catalog/cckp_historical_data
# Natural Resource Rent data:
# http://data.worldbank.org/indicator/NY.GDP.TOTL.RT.ZS
# Landlocked status
# http://mledoze.github.io/countries/
read.data = function() {
# economic data
df.econ = read.csv('Popular indicators/Popular indicators_Data.csv')
# average columns over all years
ss = subset(x = df.econ, select = grep('^X', colnames(df.econ)))
df.econ$mean = rowMeans(ss, na.rm = TRUE)
# get GDP
gdp.per.capita = subset(x = df.econ,
subset = (Series.Name == 'GDP per capita (current US$)'))
gdp.per.capita = gdp.per.capita[,c('Country.Code', 'mean')]
names(gdp.per.capita)= c('Country.Code', 'GDP')
# get Poverty index
poverty = subset(x = df.econ,
subset = (Series.Name == 'Poverty headcount ratio at national poverty lines (% of population)'))
poverty = poverty[,c('Country.Code', 'mean')]
names(poverty)[names(poverty) == 'mean'] <- 'Poverty'
# Temperature data
temp.df = read.xls(xls = 'cckp_historical_data_0.xls', sheet = 4, na.strings = 'NA')
temp.df = temp.df[,c('ISO_3DIGIT', 'Annual_temp')]
names(temp.df) = c('Country.Code', 'Avg.Temperature')
# Natural resources
nr.df = read.csv('ny.gdp.totl.rt.zs_Indicator_en_csv_v2/ny.gdp.totl.rt.zs_Indicator_en_csv_v2.csv', skip=4)
# average over all years
ss = subset(x = nr.df,
select = grep('^X', colnames(nr.df)))
nr.df$Percent.Natural.Resources = rowMeans(ss, na.rm = TRUE)
nr.df = nr.df[,c('Country.Code', 'Percent.Natural.Resources')]
# read regional data
json_file <- "mledoze-countries-6757eef/countries.json"
json_data <- fromJSON(paste(readLines(json_file), collapse=""))
region.df = data.frame(Country.Code =
sapply(json_data, function(row) row$cca3),
Region=sapply(json_data, function(row) row$region),
Landlocked=sapply(json_data, function(row) row$landlocked))
# combine into one frame
comb.df = merge(gdp.per.capita, temp.df,
by='Country.Code', all=FALSE, sort=TRUE)
comb.df = merge(comb.df, poverty,
by='Country.Code', all=FALSE, sort=TRUE)
comb.df = merge(comb.df, nr.df,
by='Country.Code', all=FALSE, sort=TRUE)
comb.df = merge(comb.df, region.df,
by='Country.Code', all=FALSE, sort=TRUE)
# generate final data frame
comb.df$Natural.Resources = comb.df$Percent.Natural.Resources * comb.df$GDP
# if there's no GDP, the country is not usable
comb.df = comb.df[complete.cases(comb.df$GDP),]
return (comb.df)
}
# predictor.info
# data.frame containing information for each
predictor.info = data.frame(
row.names = c('Avg.Temperature', 'log(Natural.Resources)', 'Landlocked'),
label = c('Average Annual Temperature',
'log Natural Resources',
'Land-locked'),
vertical.label = c('Average\nAnnual\nTemperature',
'log\nNatural\nResources',
'Land-locked')
)
# get.possible.predictors
# return: all possible predictors for this application
get.possible.predictors = function() {
p = row.names(predictor.info)
names(p) = predictor.info$label
return (p)
}
# get.predictor.label
# param: pred, a predictor, one of the row.names of predictor.info
# return: a label, possibly wide
get.predictor.label = function(pred) {
return (predictor.info[pred,]$label)
}
# get.predictor.vertical.label
# param: pred, a predictor, one of the row.names of predictor.info
# return: a label, made to be narrower than the common label
get.predictor.vertical.label = function(pred) {
return (predictor.info[pred,]$vertical.label)
}
# what.is.not.in
# params:
# vec: a vector
# possibilities: all possibilities we want to check vec against
# return: all items of vec that are not in possibilities
what.is.not.in = function(vec, possibilities) {
return (vec[sapply(vec, function(p) !(p %in% possibilities))])
}
# get.formula
# param: predictors to include in the model
# return: a formula to predict log(mean.gdp) with the predictors
get.formula = function(predictors){
if(length(what.is.not.in(predictors, get.possible.predictors())) > 0) {
stop(simpleError(paste('unknown predictors', predictors)))
}
formula = paste('log(GDP) ~', paste(predictors,collapse = '+'))
return (formula)
}
# get.model
# params:
# formula.str: a formula as string
# df: the data
# return: a model, with the data preprocessed by standardizing
get.model = function(formula.str, df, seed=1) {
if(!is.na(seed)) {
set.seed(seed)
}
fit = train(as.formula(formula.str), data=df, method='lm',
preProcess = c('center', 'scale'))
return (fit)
}
# percent.explained
# param: a model
# partition the R^2 among the various predictors
# (careful: this may depend on the order of predictors.
# It only really works if the predictors are orthogonal.
# In this application, they appear to be)
# return: a data.frame with the anova results and an additional
# column (PctExp) that provides the percentage of variation explained
# by the predictors and residuals
percent.explained = function(model) {
an <- anova(model)
anss <- an$"Sum Sq"
pe = cbind(an,PctExp=anss/sum(anss)*100)
# row.names(pe) = c('Annual Temperature', 'Natural Resources', 'Landlocked', 'Residuals')
return (pe)
}
# get.region.names
# return: all possible region names
get.region.names = function() {
c("Africa", "Americas", "Asia", "Europe", "Oceania")
}
# filter.data.by.region
# params:
# df: a data.frame
# regions: region names
# return: the rows of df for the regions
filter.data.by.region = function(df, regions) {
return (subset(df, subset = Region %in% regions))
}