# Get the data
This notebook downloads each of the Stack Overflow surveys from 2011 to 2020, extracts the data, and cleans up afterwards.

When all is done, the `./data/` directory (relative to this notebook) will contain one csv for each year.

In [1]:
import requests
from zipfile import ZipFile
import os

In [2]:
# create ./data and ./data/archives in one go; exist_ok makes the cell safe to re-run
os.makedirs(
    './data/archives',
    mode=0o777,
    exist_ok=True,
)

In [3]:
# specify urls of the data archives
# every archive is fetched through the same Google Drive download endpoint,
# only the file id differs from year to year
drive_download_url = "https://drive.google.com/uc?export=download&id={}"
drive_ids = [
    ('2020', "1dfGerWeWkcyQ9GX9x20rdSGj7WtEpzBB"),
    ('2019', "1QOmVDpd8hcVYqqUXDXf68UMDWQZP0wQV"),
    ('2018', "1_9On2-nsBQIw3JiY43sWbrF8EjrqrR4U"),
    ('2017', "0B6ZlG_Eygdj-c1kzcmUxN05VUXM"),
    ('2016', "0B0DL28AqnGsrV0VldnVIT1hyb0E"),
    ('2015', "0B0DL28AqnGsra1psanV1MEdxZk0"),
    ('2014', "0B0DL28AqnGsrempjMktvWFNaQzA"),
    ('2013', "0B0DL28AqnGsrenpPNTc5UE1PYW8"),
    ('2012', "0B0DL28AqnGsrX3JaZWVwWEpHNWM"),
    ('2011', "0Bx0LyhBTBZQgUGVYaGx3SzdUQ1U"),
]
# year -> download url, newest survey first
data = {year: drive_download_url.format(file_id) for year, file_id in drive_ids}

In [4]:
# download the data archives
# each archive is saved as data/archives/<year>.zip (the directory is created
# by the os.makedirs cell above)
for year, url in data.items():
    print(f"getting {year} from {url}")
    response = requests.get(url, allow_redirects=True)
    # fail loudly on an HTTP error instead of silently saving an error page
    # under a .zip name that only breaks later during extraction
    response.raise_for_status()
    # the context manager guarantees the file is flushed and closed before
    # the extraction cell opens it
    with open(f'data/archives/{year}.zip', 'wb') as archive_file:
        archive_file.write(response.content)

getting 2020 from https://drive.google.com/uc?export=download&id=1dfGerWeWkcyQ9GX9x20rdSGj7WtEpzBB
getting 2019 from https://drive.google.com/uc?export=download&id=1QOmVDpd8hcVYqqUXDXf68UMDWQZP0wQV
getting 2018 from https://drive.google.com/uc?export=download&id=1_9On2-nsBQIw3JiY43sWbrF8EjrqrR4U
getting 2017 from https://drive.google.com/uc?export=download&id=0B6ZlG_Eygdj-c1kzcmUxN05VUXM
getting 2016 from https://drive.google.com/uc?export=download&id=0B0DL28AqnGsrV0VldnVIT1hyb0E
getting 2015 from https://drive.google.com/uc?export=download&id=0B0DL28AqnGsra1psanV1MEdxZk0
getting 2014 from https://drive.google.com/uc?export=download&id=0B0DL28AqnGsrempjMktvWFNaQzA
getting 2013 from https://drive.google.com/uc?export=download&id=0B0DL28AqnGsrenpPNTc5UE1PYW8
getting 2012 from https://drive.google.com/uc?export=download&id=0B0DL28AqnGsrX3JaZWVwWEpHNWM
getting 2011 from https://drive.google.com/uc?export=download&id=0Bx0LyhBTBZQgUGVYaGx3SzdUQ1U


In [5]:
# specify csv files within the archives
# the 2017-2020 archives all use the same member name ...
shared_csv_name = 'survey_results_public.csv'
files = dict.fromkeys(['2020', '2019', '2018', '2017'], shared_csv_name)
# ... while each older archive has its own (2016 keeps its csv inside a folder)
files.update([
    ('2016', '2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Responses.csv'),
    ('2015', '2015 Stack Overflow Developer Survey Responses.csv'),
    ('2014', '2014 Stack Overflow Survey Responses.csv'),
    ('2013', '2013 Stack Overflow Survey Responses.csv'),
    ('2012', '2012 Stack Overflow Survey Results.csv'),
    ('2011', '2011 Stack Overflow Survey Results.csv'),
])

In [6]:
# extract the csv files and rename them to the year the survey was conducted
for year, csv_filename in files.items():
    zip_filename = f"data/archives/{year}.zip"
    # bound to `archive` rather than `zip` so the builtin zip() is not shadowed
    # for the rest of the notebook
    with ZipFile(zip_filename, 'r') as archive:
        # print(archive.namelist())  # uncomment to inspect the archive contents
        print(f"extracting {csv_filename} from {zip_filename} and renaming to {year}.csv")
        # extract() writes into the current working directory (recreating any
        # folder contained in the member name) and returns the path it created
        extracted_path = archive.extract(csv_filename)
    # os.replace overwrites an existing data/<year>.csv on every platform, so
    # re-running this cell works; os.rename raises on Windows in that case
    os.replace(extracted_path, f'data/{year}.csv')

extracting survey_results_public.csv from data/archives/2020.zip and renaming to 2020.csv
extracting survey_results_public.csv from data/archives/2019.zip and renaming to 2019.csv
extracting survey_results_public.csv from data/archives/2018.zip and renaming to 2018.csv
extracting survey_results_public.csv from data/archives/2017.zip and renaming to 2017.csv
extracting 2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Responses.csv from data/archives/2016.zip and renaming to 2016.csv
extracting 2015 Stack Overflow Developer Survey Responses.csv from data/archives/2015.zip and renaming to 2015.csv
extracting 2014 Stack Overflow Survey Responses.csv from data/archives/2014.zip and renaming to 2014.csv
extracting 2013 Stack Overflow Survey Responses.csv from data/archives/2013.zip and renaming to 2013.csv
extracting 2012 Stack Overflow Survey Results.csv from data/archives/2012.zip and renaming to 2012.csv
extracting 2011 Stack Overflow Survey Results.csv from data/archives/201

In [7]:
# cleanup: the 2016 csv sits inside a folder in its archive, so extracting it
# leaves that folder behind in the working directory
stray_directory = '2016 Stack Overflow Survey Results'
os.rmdir(stray_directory)

In [8]:
# done