initial commit

hilaryparker · Jan 30, 2013 · 2196269 · 2196269
commit 2196269
Show file tree

Hide file tree

Showing 28 changed files with 528 additions and 0 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,22 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+# Custom for Visual Studio
+*.cs     diff=csharp
+*.sln    merge=union
+*.csproj merge=union
+*.vbproj merge=union
+*.fsproj merge=union
+*.dbproj merge=union
+
+# Standard to msysgit
+*.doc	 diff=astextplain
+*.DOC	 diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot  diff=astextplain
+*.DOT  diff=astextplain
+*.pdf  diff=astextplain
+*.PDF	 diff=astextplain
+*.rtf	 diff=astextplain
+*.RTF	 diff=astextplain
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,163 @@
+#################
+## Eclipse
+#################
+
+*.pydevproject
+.project
+.metadata
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.classpath
+.settings/
+.loadpath
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# CDT-specific
+.cproject
+
+# PDT-specific
+.buildpath
+
+
+#################
+## Visual Studio
+#################
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+
+# Build results
+[Dd]ebug/
+[Rr]elease/
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.vspscc
+.builds
+*.dotCover
+
+## TODO: If you have NuGet Package Restore enabled, uncomment this
+#packages/
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+
+# Visual Studio profiler
+*.psess
+*.vsp
+
+# ReSharper is a .NET coding add-in
+_ReSharper*
+
+# Installshield output folder
+[Ee]xpress
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish
+
+# Others
+[Bb]in
+[Oo]bj
+sql
+TestResults
+*.Cache
+ClientBin
+stylecop.*
+~$*
+*.dbmdl
+# added for RIA/Silverlight projects
+Generated_Code
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+
+
+
+############
+## Windows
+############
+
+# Windows image file caches
+Thumbs.db
+
+# Folder config file
+Desktop.ini
+
+
+#############
+## Python
+#############
+
+*.py[co]
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+
+#Translations
+*.mo
+
+#Mr Developer
+.mr.developer.cfg
+
+# Mac crap
+.DS_Store
diff --git a/README.markdown b/README.markdown
@@ -0,0 +1,13 @@
+# Analysis of poisoned names 
+
+This project is described more completely in [my blog post](http://hilaryparker.com/2013/01/30/hilary-the-most-poisoned-baby-name-in-us-history/).
+
+For access to the code for scraping the data from the Social Security Administration baby names website, look in the `munge` and `lib` directories.
+
+For access to the data that has been pulled from the website and formatted as (rows=names, columns=years), go to the `cache` directory.
+
+For the code for the analysis, go to the `src` directory.
+
+I organized the code using [ProjectTemplate](http://projecttemplate.net/), an R package that provides a systematic template for organizing code. ProjectTemplate also allows for easy loading of the project.
+
+Have fun!!
diff --git a/cache/.gitignore b/cache/.gitignore
diff --git a/cache/female.nums.RData b/cache/female.nums.RData
diff --git a/cache/female.percents.RData b/cache/female.percents.RData
diff --git a/cache/female.ranks.RData b/cache/female.ranks.RData
diff --git a/cache/male.nums.RData b/cache/male.nums.RData
diff --git a/cache/male.percents.RData b/cache/male.percents.RData
diff --git a/cache/male.ranks.RData b/cache/male.ranks.RData
diff --git a/config/.gitignore b/config/.gitignore
diff --git a/config/global.dcf b/config/global.dcf
@@ -0,0 +1,8 @@
+data_loading: on
+cache_loading: on
+munging: off
+logging: off
+load_libraries: on
+libraries: RCurl, XML, RColorBrewer
+as_factors: on
+data_tables: off
diff --git a/graphs/.gitignore b/graphs/.gitignore
diff --git a/graphs/bigdrop.png b/graphs/bigdrop.png
diff --git a/graphs/bigdrops.pdf b/graphs/bigdrops.pdf
diff --git a/graphs/more_names_trimmed.png b/graphs/more_names_trimmed.png
diff --git a/graphs/names.png b/graphs/names.png
diff --git a/graphs/names_trimmed.png b/graphs/names_trimmed.png
diff --git a/lib/getNames.R b/lib/getNames.R
@@ -0,0 +1,81 @@
+# Retrieve the top-1000 baby names per year from the SSA website.
+#
+# Arguments:
+#   year.ind  vector of years to request, one POST per year
+#             (the site was noted to cover 1880 through 2011)
+#   number    "p" requests percentages, "n" requests raw counts
+#   female    TRUE for female names, FALSE for male names
+#
+# Returns an unnamed list of two matrices whose rows are the unique names
+# seen across all requested years and whose columns are the years:
+#   [[1]] the raw counts or percentages (depending on `number`)
+#   [[2]] the ranks (position of the name in that year's table)
+# A cell is NA when the name is not in that year's table.
+#
+# Requires postForm() (RCurl) and readHTMLTable() (XML) to be loaded.
+
+getNames <- function(year.ind = seq(1950, 2011), number = "p", female = TRUE) {
+  # validate up front so a bad argument fails before any request is sent
+  if (length(year.ind) == 0) {
+    stop("year.ind must contain at least one year", call. = FALSE)
+  }
+  if (length(number) != 1 || !(number %in% c("p", "n"))) {
+    stop("number must be \"p\" (percent) or \"n\" (raw number)", call. = FALSE)
+  }
+  female <- as.logical(female)
+  if (length(female) != 1 || is.na(female)) {
+    stop("female must be TRUE or FALSE", call. = FALSE)
+  }
+
+  # column headers as they come out of readHTMLTable for the SSA table
+  if (female) {
+    name.col <- "Female name"
+    value.col <- if (number == "n") {
+      "Number of females"
+    } else {
+      "Percent oftotal females"
+    }
+  } else {
+    name.col <- "Male name"
+    value.col <- if (number == "n") {
+      "Number of males"
+    } else {
+      "Percent oftotal males"
+    }
+  }
+  # counts carry thousands separators, percentages carry a percent sign
+  strip.char <- if (number == "n") "," else "%"
+
+  # scrape from website: one table per requested year #
+  nametable <- vector("list", length(year.ind))
+  for (i in seq_along(year.ind)) {
+    # get the data from the website as a POST form #
+    raw <- postForm("http://www.ssa.gov/cgi-bin/popularnames.cgi",
+                    year = year.ind[i], top = 1000, number = number,
+                    style = "post")
+
+    # read the HTML output into an R table; row 1001 is dropped as before
+    year.table <- readHTMLTable(raw, which = 3)[-1001, ]
+    nametable[[i]] <- cbind(year.table,
+                            "Year" = rep(year.ind[i], nrow(year.table)))
+  }
+
+  # names listed in each year (exact = FALSE keeps the partial matching
+  # that `$` performed in the earlier version)
+  yearly.names <- lapply(nametable, function(tab) {
+    as.character(tab[[name.col, exact = FALSE]])
+  })
+
+  # unique names from all of the years you looked at
+  unique.names <- unique(unlist(yearly.names))
+
+  # results matrices: rows are the unique names, columns are the years #
+  names.mat <- matrix(nrow = length(unique.names), ncol = length(year.ind))
+  rownames(names.mat) <- unique.names
+  colnames(names.mat) <- as.character(year.ind)
+  ranks.mat <- names.mat
+
+  for (i in seq_along(year.ind)) {
+    raw.values <- as.character(nametable[[i]][[value.col, exact = FALSE]])
+    year.values <- as.numeric(gsub(strip.char, "", raw.values))
+
+    # position of every unique name in this year's table (NA if absent);
+    # the table is ordered by rank, so the position is the rank
+    pos <- match(unique.names, yearly.names[[i]])
+    found <- !is.na(pos)
+
+    # fill in that year's data for every name that appears in it #
+    names.mat[found, i] <- year.values[pos[found]]
+    ranks.mat[found, i] <- pos[found]
+  }
+
+  list(names.mat, ranks.mat)
+}
diff --git a/munge/.gitignore b/munge/.gitignore
diff --git a/munge/01-A-scrapingdata.R b/munge/01-A-scrapingdata.R
@@ -0,0 +1,22 @@
+# Scrape the SSA baby-name tables with getNames() and cache each result. #
+# getNames() returns list(values, ranks). Ranks are the same whichever
+# `number` is requested, so they are kept from the percentage runs only.
+
+# female names: percentages and ranks
+tmp <- getNames(year.ind = seq(1880, 2011), number = "p", female = TRUE)
+female.percents <- tmp[[1]]
+female.ranks <- tmp[[2]]
+ProjectTemplate::cache("female.percents")
+ProjectTemplate::cache("female.ranks")
+
+# male names: percentages and ranks
+tmp <- getNames(year.ind = seq(1880, 2011), number = "p", female = FALSE)
+male.percents <- tmp[[1]]
+male.ranks <- tmp[[2]]
+ProjectTemplate::cache("male.percents")
+ProjectTemplate::cache("male.ranks")
+
+# female names: raw counts
+tmp <- getNames(year.ind = seq(1880, 2011), number = "n", female = TRUE)
+female.nums <- tmp[[1]]
+ProjectTemplate::cache("female.nums")
+
+# male names: raw counts
+tmp <- getNames(year.ind = seq(1880, 2011), number = "n", female = FALSE)
+male.nums <- tmp[[1]]
+ProjectTemplate::cache("male.nums")
diff --git a/reports/.gitignore b/reports/.gitignore
diff --git a/reports/bigdrops.csv b/reports/bigdrops.csv
@@ -0,0 +1,15 @@
+"name","loss","yearlost"
+"Clementine","69","1881"
+"Celestine","67","1881"
+"Minna","68","1883"
+"Dewey","74","1899"
+"Deneen","72","1965"
+"Katina","69","1974"
+"Catina","74","1974"
+"Farrah","78","1978"
+"Renata","69","1981"
+"Infant","67","1991"
+"Iesha","69","1992"
+"Hilary","70","1993"
+"Khadijah","72","1995"
+"Ashanti","68","2003"
diff --git a/reports/bigdrops.xlsx b/reports/bigdrops.xlsx
diff --git a/src/.gitignore b/src/.gitignore
diff --git a/src/analysis-scraping.Rout b/src/analysis-scraping.Rout
@@ -0,0 +1,62 @@
+
+R version 2.15.2 Patched (2012-10-28 r61038) -- "Trick or Treat"
+Copyright (C) 2012 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+Platform: x86_64-unknown-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+  Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> setwd("/home/bst/student/hiparker/names")
+> library('ProjectTemplate')
+Loading required package: testthat
+> load.project()
+Loading project configuration
+Autoloading helper functions
+ Running helper script: getNames.R
+Autoloading packages
+ Loading package: RCurl
+Loading required package: bitops
+ Loading package: XML
+Autoloading data
+ Loading cached data set: female.nums
+ Loading cached data set: female.percents
+ Loading cached data set: female.ranks
+ Loading cached data set: male.nums
+ Loading cached data set: male.percents
+ Loading cached data set: male.ranks
+Munging data
+ Running preprocessing script: 01-A-scrapingdata.R
+> 
+> 
+> #tmp1
+> 
+> #dim(tmp1)
+> #tmp2<-tmp1[,-1]
+> #for(i in 1:dim(tmp1)[2]-1){
+> #  tmp2[,i]<-tmp1[,i+1]/tmp1[,i]
+> #}
+> #min(tmp2,na.rm=TRUE)
+> #which(tmp2<0.5,arr.ind=TRUE)
+> #tmp2
+> 
+> #?lapply
+> 
+> #names.mat["Hilary",]
+> #names.mat["Hillary",]
+> 
+> 
+> proc.time()
+    user   system  elapsed 
+ 623.337    0.805 1249.528