In [85]:
import pandas as pd
import numpy as np
import re

In [86]:
# Open the workbook once; sheets are read from this handle afterwards.
workbook_path = 'data/meat_statistics.xlsx'
xlsx = pd.ExcelFile(workbook_path)

The original Excel file was designed to be human-readable: it uses merged cells for the first-level category (i.e., Commercial vs. Federally inspected, shown below) and individual cells for the secondary category (i.e., row 0 below).

In [80]:
# header=1 makes the second spreadsheet row (the merged top-level
# categories) the column labels; the secondary categories then land in row 0.
raw_data = xlsx.parse(
    sheet_name='RedMeatPoultry_Prod-Full',
    header=1,
)
raw_data.head()

Unnamed: 0,Type 1/,Commercial 2/,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Federally inspected,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,Beef 3/,Veal 3/,Pork 3/,Lamb and mutton 3/,,Total red meat 3/ 4/,Beef 3/,Veal 3/,Pork 3/,Lamb and mutton 3/,Total red meat 3/ 4/,Broilers 5/,Other chicken 5/,Turkey 5/,Total poultry 4/ 5/ 6/,Total red meat and poultry 4/,
1,Jan-Sep 2022,21237.3,40.2,20075.5,98.6,,41451.6,20893.5,39.2,19983.1,87.6,41003.3,34348.1,424.116,3912.59,38787.7,79791,
2,Jan-Sep 2021,20842.1,38.8,20489.8,103.1,,41473.9,20490.1,37.7,20388.7,90.5,41007.3,33706.1,407.63,4192.11,38397.9,79405.2,
3,Sep-2022,2390.9,4.2,2259.4,10.7,,4665.2,2350.5,4.1,2247.7,9.6,4611.8,4003.83,45.102,420.551,4481.28,9093.08,
4,Aug-2022,2506.5,4.7,2303.3,11.7,,4826.1,2466.8,4.5,2291.1,10.3,4772.8,4210.98,55.285,449.156,4727.37,9500.17,


There are two types: Commercial and Federally inspected. Their numbers are very close. I decided to use the Federally inspected numbers because that group covers more meat types (it also includes the poultry categories).

In [81]:
# Position of the first column belonging to the 'Federally inspected' group.
idx = raw_data.columns.tolist().index('Federally inspected')
# Keep column 0 (the period labels) plus every column from idx onwards.
idxs = [0, *range(idx, len(raw_data.columns))]

In [82]:
# Positional column selection using the indices collected in idxs.
# NOTE: this rebinds raw_data to the narrower frame, while idxs holds
# positions in the frame as originally loaded, so re-running this cell on
# its own will not reproduce the result -- re-run the load cell first.
raw_data = raw_data.iloc[:, idxs]
raw_data.head()

Unnamed: 0,Type 1/,Federally inspected,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,Beef 3/,Veal 3/,Pork 3/,Lamb and mutton 3/,Total red meat 3/ 4/,Broilers 5/,Other chicken 5/,Turkey 5/,Total poultry 4/ 5/ 6/,Total red meat and poultry 4/,
1,Jan-Sep 2022,20893.5,39.2,19983.1,87.6,41003.3,34348.1,424.116,3912.59,38787.7,79791,
2,Jan-Sep 2021,20490.1,37.7,20388.7,90.5,41007.3,33706.1,407.63,4192.11,38397.9,79405.2,
3,Sep-2022,2350.5,4.1,2247.7,9.6,4611.8,4003.83,45.102,420.551,4481.28,9093.08,
4,Aug-2022,2466.8,4.5,2291.1,10.3,4772.8,4210.98,55.285,449.156,4727.37,9500.17,


Replace the current header with the first row (i.e., the secondary categories) and drop the empty last column.

In [83]:
# Row 0 holds the secondary categories that become the new column labels.
# Both slices stop before the last column (":-1"), which is the empty one.
new_header = raw_data.iloc[0, :-1]
# .copy() gives meat_prod its own data instead of a slice of raw_data, so
# relabelling or editing it later cannot trigger chained-assignment warnings
# or depend on raw_data's state.
meat_prod = raw_data.iloc[1:, :-1].copy()
meat_prod.columns = new_header
meat_prod.head()

Unnamed: 0,NaN,Beef 3/,Veal 3/,Pork 3/,Lamb and mutton 3/,Total red meat 3/ 4/,Broilers 5/,Other chicken 5/,Turkey 5/,Total poultry 4/ 5/ 6/,Total red meat and poultry 4/
1,Jan-Sep 2022,20893.5,39.2,19983.1,87.6,41003.3,34348.1,424.116,3912.59,38787.7,79791.0
2,Jan-Sep 2021,20490.1,37.7,20388.7,90.5,41007.3,33706.1,407.63,4192.11,38397.9,79405.2
3,Sep-2022,2350.5,4.1,2247.7,9.6,4611.8,4003.83,45.102,420.551,4481.28,9093.08
4,Aug-2022,2466.8,4.5,2291.1,10.3,4772.8,4210.98,55.285,449.156,4727.37,9500.17
5,Jul-2022,2214.4,4.0,1960.5,8.9,4187.8,3684.82,45.709,393.913,4134.36,8322.16


Transform the header by lowercasing it, replacing spaces with underscores, and removing the footnote markers (e.g. `3/`), which are explained below:
- 1/ Excludes slaughter on farms.																
- 2/ Production in federally inspected and other plants.															
- 3/ Based on packers' dressed weights.																
- 4/ Totals may not add due to rounding.																
- 5/ Ready-to-cook.																
- 6/ Includes geese, guineas, ostriches, emus, rheas, squab, and other poultry.																

In [84]:
# The first column has no label (NaN) and holds the period, so it is named
# 'Month' directly; every other label is cleaned up below.
current_header = meat_prod.columns[1:]
transformed_header = ['Month']
for label in current_header:
    # Drop footnote markers such as '3/' and turn each space into '_'.
    # The space in front of every marker is kept, which is why the names end
    # in one '_' per footnote (e.g. 'Beef 3/' -> 'beef_',
    # 'Total red meat 3/ 4/' -> 'total_red_meat__').
    word = re.sub(r'\d+/', '', label).replace(' ', '_')
    transformed_header.append(word.lower())
meat_prod.columns = transformed_header
meat_prod.head()

Unnamed: 0,Month,beef_,veal_,pork_,lamb_and_mutton_,total_red_meat__,broilers_,other_chicken_,turkey_,total_poultry___,total_red_meat_and_poultry_
1,Jan-Sep 2022,20893.5,39.2,19983.1,87.6,41003.3,34348.1,424.116,3912.59,38787.7,79791.0
2,Jan-Sep 2021,20490.1,37.7,20388.7,90.5,41007.3,33706.1,407.63,4192.11,38397.9,79405.2
3,Sep-2022,2350.5,4.1,2247.7,9.6,4611.8,4003.83,45.102,420.551,4481.28,9093.08
4,Aug-2022,2466.8,4.5,2291.1,10.3,4772.8,4210.98,55.285,449.156,4727.37,9500.17
5,Jul-2022,2214.4,4.0,1960.5,8.9,4187.8,3684.82,45.709,393.913,4134.36,8322.16
