In [1]:
from tika import parser 
import re
import dateparser
from datetime import datetime as dt, timedelta as td
import numpy as np
import pandas as pd
from glob import glob
import os

In [2]:
# Canonical column name -> every spelling accepted for it.
# Matching elsewhere in this notebook is by exact string membership, so
# whitespace variants (e.g. the double space in "Arunachal  Pradesh") are
# listed explicitly. The canonical name is always the first alias.
alt_state_names = {
	"Andaman and Nicobar Islands": ["Andaman and Nicobar Islands"],
	"Andhra Pradesh": ["Andhra Pradesh"],
	"Arunachal Pradesh": ["Arunachal Pradesh", "Arunachal  Pradesh"],
	"Assam": ["Assam"],
	"Bihar": ["Bihar"],
	"Chandigarh": ["Chandigarh"],
	"Chhattisgarh": ["Chhattisgarh"],
	"Dadra and Nagar Haveli": ["Dadra and Nagar Haveli", "DNH"],
	"Daman and Diu": ["Daman and Diu", "DD"],
	"Delhi": ["Delhi", "The NCT of Delhi"],
	"Goa": ["Goa"],
	"Gujarat": ["Gujarat"],
	"Haryana": ["Haryana"],
	"Himachal Pradesh": ["Himachal Pradesh", "HP"],
	"Jammu and Kashmir": [
		"Jammu and Kashmir",
		"J&K",
		"J&K(UT) & Ladakh(UT)",
		"J&K(UT) and Ladakh(UT)",
		"J&K(UT) &",
	],
	"Jharkhand": ["Jharkhand"],
	"Karnataka": ["Karnataka"],
	"Kerala": ["Kerala"],
	"Lakshadweep": ["Lakshadweep"],
	"Madhya Pradesh": ["Madhya Pradesh", "MP"],
	"Maharashtra": ["Maharashtra"],
	"Manipur": ["Manipur"],
	"Meghalaya": ["Meghalaya"],
	"Mizoram": ["Mizoram"],
	"Nagaland": ["Nagaland"],
	"Odisha": ["Odisha"],
	"Puducherry": ["Puducherry", "Pondy"],
	"Punjab": ["Punjab"],
	"Rajasthan": ["Rajasthan"],
	"Sikkim": ["Sikkim"],
	"Tamil Nadu": ["Tamil Nadu"],
	"Telangana": ["Telangana"],
	"Tripura": ["Tripura"],
	"Uttar Pradesh": ["Uttar Pradesh", "UP"],
	"Uttarakhand": ["Uttarakhand"],
	"West Bengal": ["West Bengal"],
	"AMNSIL": ["AMNSIL", "Essar steel"],
	"Damodar Valley Corporation": ["Damodar Valley Corporation", "DVC"],
}

In [3]:
# Accumulators filled by the file-processing loop.
dates = []
unknown_states = []

# One independent {column name: []} mapping per metric. Each has a column for
# every canonical name in alt_state_names, followed by a "Total" column.
peak_met, peak_unmet, daily_energy_met = (
	{**{state: [] for state in alt_state_names}, "Total": []}
	for _ in range(3)
)

In [4]:
# Collect the downloaded report files; the two patterns correspond to the two
# file-name shapes present in the raw-scripts directory.
fnames = [
	path
	for pattern in (
		'../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP*',
		'../../demand/posoco/raw-scripts/index.html?wpdmdl=*',
	)
	for path in glob(pattern)
]

In [5]:
# Sanity check: show the first ten matched paths to confirm the glob patterns found files.
print(fnames[:10])

['../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=07.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=08.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=09.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=10.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=11.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=12.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=13.12.22_NLDC_PSP.pdf', '../../demand/posoco/raw-scripts/index.php?p=Daily+Report%2FPSP+Report%2F2022-2023%2FDecember+2022&dl=14.12.22_NLDC_P

In [6]:
from tqdm.auto import tqdm 

In [7]:
# Parse every downloaded report and append one row per report to `dates`,
# `peak_met`, `peak_unmet` and `daily_energy_met`.
#
# A report is only committed to the accumulators after its whole table has
# been parsed, so a bad file can never leave the lists with unequal lengths.

# Patterns are compiled once from raw strings so that regex escapes such as
# \d, \S and \s are not parsed as (invalid) Python string escapes.
REPORT_DATE_RE = re.compile(r'\d{1,2}-[a-zA-Z]{3}-\d{2,4}\n')
# Quick fix for tables garbled between Maharashtra and Goa (and likewise
# between DD and DNH): the captured text between the two rows is dropped.
GARBLED_SPAN_RES = (
	re.compile(r'(?:Maharashtra.*\n)([\S\s]*)(?:Goa)'),
	re.compile(r'(?:DD.*\n)([\S\s]*)(?:DNH)'),
)
DATA_BLOCK_RE = re.compile(r'Punjab(?:[\S\s]*)Tripura.*(?:\r?\n(?!\r?\n).*)*', re.MULTILINE)
# Split a row on whitespace that is followed by a digit or '-', which keeps
# multi-word names in the first field intact.
FIELD_SPLIT_RE = re.compile(r"\s(?=[\d-])")
# Row fields feeding peak_met, peak_unmet and daily_energy_met respectively.
VALUE_COLUMNS = (1, 2, 3)

for fname in tqdm(fnames, desc='Processing files', unit='file'):

	try:
		raw = parser.from_file(fname)
	except Exception:
		# The file is deliberately NOT deleted here: a parser failure (for
		# example the Tika backend being unavailable) says nothing about the
		# file itself, and deleting on error could wipe the whole input set.
		# `Exception` (not a bare except) lets Ctrl-C interrupt the run.
		tqdm.write("could not open file")
		continue

	dump = raw['content']

	if dump is None:
		tqdm.write("file empty")
		os.remove(fname)
		continue

	if ("don't have permission" in dump) or ("File not found or deleted from server" in dump):
		tqdm.write("404 file")
		os.remove(fname)
		continue

	if 'xmpTPg:NPages' not in raw['metadata']:
		# Only reported; the file still goes through the layout check below.
		tqdm.write("not pdf")
	elif isinstance(raw['metadata']['xmpTPg:NPages'], list):
		tqdm.write("multi-document file, skipping")
		os.remove(fname)
		continue
	# NOTE: a further check rejecting files with more than 3 pages existed
	# here as disabled code; it is intentionally not applied.

	if "NR WR SR ER" not in dump:
		tqdm.write("wrong file layout")
		rejected_path = fname.replace("raw-scripts/", "raw-scripts/rejected/")
		# os.rename fails if the destination directory is missing.
		os.makedirs(os.path.dirname(rejected_path), exist_ok=True)
		os.rename(fname, rejected_path)
		continue

	# dateparser.parse returns None when it cannot interpret the text, so both
	# "no date-like text" and "unparseable date" are handled together.
	date_matches = REPORT_DATE_RE.findall(dump)
	date_report = dateparser.parse(date_matches[0]) if date_matches else None
	if date_report is None:
		tqdm.write("unable to find date info")
		continue
	# The data is filed under the day before the date printed on the report
	# (presumably the report is issued the day after the day it covers -- confirm).
	date = date_report - td(days=1)

	# Drop every non-ASCII character before pattern matching.
	dump = dump.encode("ascii", "ignore").decode()

	for garbled_span_re in GARBLED_SPAN_RES:
		catch = garbled_span_re.findall(dump)
		if catch and catch[0] not in ["\n", "\r"]:
			dump = dump.replace(catch[0], "")

	find_data_block = DATA_BLOCK_RE.findall(dump)
	if not find_data_block:
		tqdm.write("unable to find state energy info")
		continue

	# Re-join the J&K / Ladakh name that is split across lines, then strip the
	# region labels so each remaining line starts with a state name.
	data_block = find_data_block[0].replace('J&K(UT) & \n\nLadakh(UT)\n', 'J&K(UT) & Ladakh(UT) ')
	for region in ["NER ","ER ","SR ","WR ","NR "]:
		data_block = data_block.replace(region, "")

	# Blank lines carry no data and would otherwise produce a one-field row
	# that makes the whole table unparseable.
	state_strings = [
		line
		for line in data_block.replace("\n\n","\n").split("\n")
		if line.strip()
	]

	# Parse the complete table into local variables first; nothing is appended
	# to the shared accumulators until every conversion has succeeded.
	try:
		table = np.array([FIELD_SPLIT_RE.split(line) for line in state_strings])
		table[table=='-']=np.nan

		totals = [
			np.sum([float(row[col]) for row in table])
			for col in VALUE_COLUMNS
		]

		state_values = {}
		unmatched_names = []
		for row in table:
			state_name_raw = row[0]
			for canonical, aliases in alt_state_names.items():
				if state_name_raw in aliases:
					if canonical in state_values:
						# A second row for the same state would put two values
						# in one column for this date.
						raise ValueError("duplicate row for {}".format(canonical))
					state_values[canonical] = tuple(float(row[col]) for col in VALUE_COLUMNS)
					break
			else:
				unmatched_names.append(state_name_raw)
	except Exception:
		tqdm.write("Error parsing table; skipping")
		continue

	# Commit: exactly one value per column for this report.
	dates.append(date)
	metrics = (peak_met, peak_unmet, daily_energy_met)
	for metric, total in zip(metrics, totals):
		metric['Total'].append(total)

	missing = (np.nan, np.nan, np.nan)
	for canonical in alt_state_names:
		# States absent from this report get NaN so all columns stay aligned.
		for metric, value in zip(metrics, state_values.get(canonical, missing)):
			metric[canonical].append(value)

	unknown_states.extend(unmatched_names)

Processing files:   0%|          | 0/4277 [00:00<?, ?file/s]

Error parsing table; skipping
Error parsing table; skipping


In [8]:
# Write one CSV per metric, with a leading Date column and rows sorted by date.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("outputs", exist_ok=True)

for series, label in zip(
	[peak_met, peak_unmet, daily_energy_met],
	['peak_met_MW', 'peak_unmet_MW', 'daily_energy_met_MU'],
):
	df = pd.DataFrame.from_dict(series)
	df.insert(0, 'Date', dates)
	# Reassign instead of sorting in place, so the cell is easier to follow and re-run.
	df = df.sort_values(by=['Date'])
	df.to_csv("outputs/{}.csv".format(label), index=False)
