## Renewable and Total Energy Production by State

**Data Source: US Energy Information Administration (EIA) – State Energy Data System (SEDS)**

Website: https://www.eia.gov/renewable/data.php 

PDF URL: https://www.eia.gov/state/seds/sep_prod/SEDS_Production_Report.pdf

In [1]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import requests
import pdfplumber
from io import BytesIO
import pandas as pd
import numpy as np

In [17]:
def extract_pdf(pdf_url, start_page, end_page, rows=59, cols=6, timeout=60):
    """Download a PDF and scrape one fixed-size table of values from each page.

    For every page index in ``range(start_page, end_page)`` the page text is
    scanned for numeric tokens (optional thousands separators and decimals)
    and for the placeholders ``NA`` and ``(s)``.  Numeric tokens are converted
    to ``float`` (dropping the commas); placeholders are kept as strings.  The
    first ``rows * cols`` tokens of the page are reshaped into a
    ``rows`` x ``cols`` NumPy array.  When a page mixes floats and
    placeholders, NumPy stores every cell of that page as a string.

    Args:
        pdf_url: URL of the PDF to download.
        start_page: Zero-based index of the first page to read.
        end_page: Zero-based index one past the last page to read.
        rows: Number of table rows kept per page.
        cols: Number of table columns kept per page.
        timeout: Seconds ``requests`` waits on the server before raising.

    Returns:
        A list with one array per page, or the string
        ``"Something went wrong"`` when the HTTP status code is not 200.

    Raises:
        ValueError: If a page yields fewer than ``rows * cols`` tokens.
        requests.RequestException: Propagated from the download.

    NOTE(review): tokens are taken in reading order with no row/column
    detection, so the arrays only line up with the printed table if every
    printed row contributes exactly ``cols`` tokens -- verify against the PDF
    and adjust ``rows``/``cols`` if the scraped rows look shifted.
    """
    response = requests.get(pdf_url, timeout=timeout)

    if response.status_code != 200:
        # Kept as a string return (not an exception) so existing callers that
        # compare against this value keep working.
        return "Something went wrong"

    # Compiled once; the same pattern is applied to every page.
    entry_pattern = re.compile(r'NA|\(s\)|\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b')
    placeholders = ('NA', '(s)')
    cells_per_page = rows * cols

    all_tables = []

    with pdfplumber.open(BytesIO(response.content)) as pdf:
        for i in range(start_page, end_page):
            # A page without extractable text is treated as having no tokens.
            text = pdf.pages[i].extract_text() or ''

            cleaned_entries = [
                num if num in placeholders else float(num.replace(',', ''))
                for num in entry_pattern.findall(text)
            ]

            if len(cleaned_entries) < cells_per_page:
                raise ValueError(
                    f"page index {i}: found {len(cleaned_entries)} values, "
                    f"need {cells_per_page} for a {rows}x{cols} table"
                )

            # Anything past the first rows*cols tokens is discarded.
            one_page = cleaned_entries[:cells_per_page]
            all_tables.append(np.array(one_page).reshape(rows, cols))

    return all_tables

# Location of the EIA SEDS production report that is scraped below.
pdf_url = 'https://www.eia.gov/state/seds/sep_prod/SEDS_Production_Report.pdf'

# Zero-based page indices 17 through 118 (the end index is exclusive),
# i.e. 102 pages, one scraped table per page.
data = extract_pdf(pdf_url, 17, 119)

# Display the table scraped from the last of those 102 pages.
data[101]

array([['35.2', '198.0', '776.7', '0.0', 'NA', '1.6'],
       ['2.0', '1013.0', '56.7', '257.1', '802.2', '0.0'],
       ['NA', '1.6', '3.0', '1120.0', '63.8', '265.4'],
       ['779.9', '0.0', 'NA', '1.5', '3.0', '1113.0'],
       ['62.4', '261.8', '790.6', '0.0', 'NA', '1.4'],
       ['2.0', '1118.0', '66.5', '270.9', '836.7', '0.0'],
       ['NA', '1.6', '3.0', '1179.0', '80.0', '330.9'],
       ['898.7', '0.0', 'NA', '1.5', '3.0', '1314.0'],
       ['125.5', '369.1', '930.0', '0.0', 'NA', '1.6'],
       ['3.0', '1429.0', '139.9', '413.2', '859.1', '0.0'],
       ['NA', '1.6', '4.0', '1418.0', '189.9', '414.7'],
       ['812.1', '0.0', 'NA', '1.3', '4.0', '1422.0'],
       ['275.6', '394.3', '823.1', '0.0', 'NA', '1.5'],
       ['4.0', '1498.0', '386.1', '352.1', '812.0', '0.0'],
       ['NA', '1.5', '4.0', '1556.0', '434.6', '320.1'],
       ['788.5', '0.0', 'NA', '1.6', '3.0', '1548.0'],
       ['562.9', '339.3', '778.1', '0.0', 'NA', '1.7'],
       ['3.0', '1685.0', '829.7', '338

In [28]:
# The 50 states plus the District of Columbia, in alphabetical order.
# The order matters: the loop below assigns scraped tables to states purely
# by position in this list.
states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "District of Columbia",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
]

# Per-state frames, split by which of the two tables they came from.
physical_units, thermal_units = [], []

for j, table in enumerate(data):
    df = pd.DataFrame(table)

    # Tables come in consecutive pairs that share one state, so floor-dividing
    # the position by two gives the state index for both members of a pair.
    state_index = j // 2
    df.insert(0, 'State', states[state_index])

    # Even positions are collected as physical units, odd ones as thermal.
    target = thermal_units if j % 2 else physical_units
    target.append(df)


# Headers for the six positional columns (0-5) of each physical-units table.
physical_columns = [
    'Coal (K short tons)',
    'Natural Gas (M cubic ft)',
    'Crude Oil (K barrels)',
    'Fuel Ethanol (K barrels)',
    'Biodiesel (K barrels)',
    'Renewable Diesel (K barrels)',
]

# Stack every state's frame into one table with a fresh 0..n-1 index,
# tag each row with its unit system, then swap the integer column labels
# for the descriptive headers above.
physical_units_df = pd.concat(physical_units, ignore_index=True)
physical_units_df.insert(1, 'Units', "Physical")
physical_units_df.rename(columns=dict(enumerate(physical_columns)), inplace=True)


# Headers for the six positional columns (0-5) of each thermal-units table.
thermal_columns = [
    'Coal (T Btu)',
    'Natural Gas (T Btu)',
    'Crude Oil (T Btu)',
    'Fuel Ethanol (T Btu)',
    'Biodiesel (T Btu)',
    'Renewable Diesel (T Btu)',
]

# Stack every state's frame into one table with a fresh 0..n-1 index,
# tag each row with its unit system, then swap the integer column labels
# for the descriptive headers above.
thermal_units_df = pd.concat(thermal_units, ignore_index=True)
thermal_units_df.insert(1, 'Units', "Thermal")
thermal_units_df.rename(columns=dict(enumerate(thermal_columns)), inplace=True)

# Write both combined tables to CSV files in the working directory,
# leaving out the DataFrame's row index.
physical_units_df.to_csv('renewable_energy_data(physical units).csv', index = False)
thermal_units_df.to_csv('renewable_energy_data(thermal units).csv', index = False)

# Read the files back as a quick check. In a notebook cell only the value of
# the last expression (the thermal-units table) is displayed; the result of
# the first read is discarded.
pd.read_csv('renewable_energy_data(physical units).csv')
pd.read_csv('renewable_energy_data(thermal units).csv')

Unnamed: 0,State,Units,Coal (T Btu),Natural Gas (T Btu),Crude Oil (T Btu),Fuel Ethanol (T Btu),Biodiesel (T Btu),Renewable Diesel (T Btu)
0,Alabama,Thermal,318.8,0.1,42.5,0.0,,45.7
1,Alabama,Thermal,21.0,428.0,363.4,0.3,46.8,0.0
2,Alabama,Thermal,,47.6,24.0,482.0,348.4,0.4
3,Alabama,Thermal,46.6,0.0,,49.1,23.0,468.0
4,Alabama,Thermal,379.5,0.4,42.6,0.0,,49.1
...,...,...,...,...,...,...,...,...
3004,Wyoming,Thermal,0.7,0.9,4.0,8989.0,7019.8,1811.9
3005,Wyoming,Thermal,300.3,0.0,0.7,2.4,6.0,9141.0
3006,Wyoming,Thermal,7740.0,1990.7,307.2,0.0,0.7,2.1
3007,Wyoming,Thermal,6.0,10046.0,7847.6,2231.3,313.9,0.0
