# Census Data Preparation
This notebook takes the raw .tsv files from ICPSR 35206 "United States Agriculture Data, 1840 - 2012" and transforms them into the data used in the [data_cleaning.ipynb](./data_cleaning.ipynb) notebook.

The "year.txt" files (e.g. "1920.txt") list the variables that we want in the analysis; for each year, we grab only these variables from the corresponding .tsv file and save the subset to a .csv file.  Thus, to add more variables to the dataset this notebook creates, we just add them to those files.

In [1]:
import pandas as pd
import numpy as np
import zipfile as z
import os
import re
import glob

# data_dir.txt holds the root data directory.  Later paths are built by plain
# string concatenation onto this value (e.g. data_dir + "agcensus/"), so it is
# expected to end with a path separator.  Strip surrounding whitespace so that
# a trailing newline in the file does not end up inside those paths.
with open("data_dir.txt") as f:
    data_dir = f.read().strip()

In [None]:
files_dir = data_dir + "agcensus/"

# Maps each variable-list file to the raw data file it describes.  As read
# below, column 0 of a variable list holds the raw variable code (compared
# against the upper-cased data column name) and column 2 holds the name that
# column is given in the output.
filelist = {"1920.txt":"1920.tsv","1925.txt":"1925.tsv",
            "1930_01.txt":"1930_01.tsv","1930_02.txt":"1930_02.tsv",
            "1935.txt":"1935.tsv","1940_01.txt":"1940_01.tsv",
            "1940_02.txt":"1940_02.tsv"}

# NOTE(review): never used in the visible cells; kept only in case another
# cell relies on the name existing.
agcensus = pd.DataFrame()

# For every (variable list, data file) pair: keep only the listed columns,
# rename them, tag the rows with the census year and write <stem>.csv next to
# the inputs.
for v, d in filelist.items():
    print('reading ' + v + ' and ' + d)
    var = pd.read_csv(files_dir + v, header=None)
    data = pd.read_table(files_dir + d)

    # Raw code -> output name.  setdefault keeps the first row for a code
    # that is listed more than once.
    wanted = {}
    for code, new_name in zip(var.iloc[:, 0], var.iloc[:, 2]):
        wanted.setdefault(code, new_name)

    # Build the selection/rename map once instead of copying the whole frame
    # for every single column.  Dict order follows the data's column order.
    renames = {col: wanted[col.upper()].strip()
               for col in data.columns if col.upper() in wanted}
    data = data[list(renames)].rename(columns=renames)

    # The year is the part of the file stem before any "_NN" part suffix.
    stem = os.path.splitext(v)[0]
    data['year'] = float(stem.split('_')[0])

    title = stem + '.csv'
    data.to_csv(files_dir + title, index=False)
    print("Wrote %s to %s" % (v, title))
print("Done writing")

# Names of the .csv files written above.  Only the extension is swapped
# (the old replace('t', 'c') would have rewritten any "t" in the name).  The
# former second pass, replace('1930_*', '1930.'), was a literal match that
# never hit, so it is dropped; the resulting list is unchanged.
csvlist = [os.path.splitext(d)[0] + '.csv' for d in filelist.values()]

In [None]:
# 1930 and 1940 are each stored as two part files (_01 and _02); join every
# pair into a single <year>.csv.  merge() is called without arguments, so
# pandas picks the join keys (the columns both parts share) and the join type
# (inner) by default.

s1940_01, s1940_02 = [pd.read_csv("%s1940_%s.csv" % (files_dir, part))
                      for part in ("01", "02")]
s1940 = s1940_01.merge(s1940_02)
s1940.to_csv("%s1940.csv" % files_dir, index=False)

s1930_01, s1930_02 = [pd.read_csv("%s1930_%s.csv" % (files_dir, part))
                      for part in ("01", "02")]
s1930 = s1930_01.merge(s1930_02)
s1930.to_csv("%s1930.csv" % files_dir, index=False)


In [None]:
# Now stack all years together into one long table keyed by (FIPS, year).

csvlist = [files_dir+x+".csv" for x in ["1920","1925","1930","1935","1940"]]

# Read every year first and concatenate once.  DataFrame.append inside a loop
# recopies the accumulated frame on each pass and no longer exists in
# pandas >= 2.0.
frames = [pd.read_csv(cv).set_index(['FIPS', 'year']) for cv in csvlist]
agsurvey = pd.concat(frames)

# NOTE(review): unlike the per-year files, this is written without
# index=False, so the fresh integer index becomes an unnamed first column.
# Left as is because the downstream notebook may depend on that layout.
agsurvey.sort_index().reset_index().to_csv(data_dir+'clean_data/agcensus_20-40.csv')