In [1]:
import pandas as pd

In [35]:
# Load the processed HVDC flows and report timestamps that occur more than once.
input_file = r'Final\HVDC_Processed.csv'
data = pd.read_csv(input_file)

# keep=False flags every member of a repeated RUN_TIME group, so the count
# below covers all rows involved in a duplicate, not just the extras.
duplicated_data = data.loc[data.duplicated(subset=['RUN_TIME'], keep=False)]
print(f"Total duplicated rows: {len(duplicated_data)}")

# Keep the last row per RUN_TIME, then discard rows where every column is missing.
data = data.drop_duplicates('RUN_TIME', keep='last').dropna(how='all')
print(len(data))
print(duplicated_data)

Total duplicated rows: 0
210240
Empty DataFrame
Columns: [RUN_TIME, FLOW_MIN, FLOW_VIS, FLOW_LUZ]
Index: []


In [26]:
# Stream the LMP file in chunks, keep only the 'NL' rows with the columns needed,
# clamp negative prices to zero, and group the rows into 5-minute bins per region.
# NOTE(review): absolute, machine-specific path — consider a path relative to the
# notebook so this cell can run on another machine.
input_file = r'D:\School\ADMU\4Y\SEM 1\MATH 199.11\Final\LMP_Complete.csv'
output_file = 'tryLWAP.csv'

# Rows read per chunk (adjust based on your system's capacity)
chunk_size = 500000

# Only these columns are parsed; every other column is skipped at read time
usecols = ['RUN_TIME', 'RESOURCE_TYPE', 'REGION_NAME', 'LMP', 'SCHED_MW']

# Filtered chunks are collected here and concatenated once after the loop
chunks = []

# Read and process file in chunks
for chunk in pd.read_csv(input_file, usecols=usecols, chunksize=chunk_size, parse_dates=['RUN_TIME']):
    # Keep only rows whose RESOURCE_TYPE is 'NL'
    # (presumably load resources, given the LWAP output — TODO confirm).
    # .copy() makes the filtered rows an independent frame, so the assignment
    # below is never a write into a slice of the reader's chunk.
    chunk = chunk[chunk['RESOURCE_TYPE'] == 'NL'].copy()

    # Set negative LMP prices to 0
    chunk.loc[chunk['LMP'] < 0, 'LMP'] = 0

    # Append processed chunk to the list
    chunks.append(chunk)

# Concatenate all processed chunks into one DataFrame
data = pd.concat(chunks, ignore_index=True)

# Group by REGION_NAME and RUN_TIME with 5-minute frequency
grouped = data.groupby(['REGION_NAME', pd.Grouper(key='RUN_TIME', freq='5min')])

# Weighted average price (LWAP) for one group of rows
def calculate_weighted_avg(x):
    """Return the SCHED_MW-weighted mean of LMP for the rows in ``x``.

    Parameters
    ----------
    x : DataFrame
        Rows of a single group; must provide 'LMP' and 'SCHED_MW' columns.

    Returns
    -------
    Series
        One entry labelled 'LWAP'. The value is 0 when the group's total
        SCHED_MW is 0, which avoids dividing by zero.
    """
    mw_sum = x['SCHED_MW'].sum()
    # With no scheduled MW there is nothing to weight by; report 0 instead.
    lwap = 0 if mw_sum == 0 else x['LMP'].mul(x['SCHED_MW']).sum() / mw_sum
    return pd.Series({'LWAP': lwap})

# Compute LWAP per (REGION_NAME, 5-minute bin). Selecting the two value columns
# keeps the grouping keys out of the frames handed to calculate_weighted_avg,
# so apply no longer operates on the grouping columns.
result = grouped[['LMP', 'SCHED_MW']].apply(calculate_weighted_avg).reset_index()

# Create a complete 5-minute time index shared by every region. The bounds come
# from the binned timestamps in `result`, so the grid lines up with the bin
# labels even if raw RUN_TIME values do not fall exactly on 5-minute marks.
min_time = result['RUN_TIME'].min()
max_time = result['RUN_TIME'].max()
complete_time_index = pd.date_range(start=min_time, end=max_time, freq='5min')

# Reindex each region onto the complete grid; intervals with no rows get LWAP = 0
reindexed_chunks = []
for region in result['REGION_NAME'].unique():
    region_data = (
        result.loc[result['REGION_NAME'] == region, ['RUN_TIME', 'LWAP']]
        .set_index('RUN_TIME')
        .reindex(complete_time_index, fill_value=0)
        # Name the index before reset_index so the timestamp column comes back
        # as 'RUN_TIME'; renaming columns by position here would attach the
        # 'REGION_NAME' label to the timestamps and 'RUN_TIME' to the regions.
        .rename_axis('RUN_TIME')
        .reset_index()
    )
    region_data['REGION_NAME'] = region
    reindexed_chunks.append(region_data)

# Concatenate all reindexed chunks into one DataFrame
final_result = pd.concat(reindexed_chunks, ignore_index=True)

# Output column order: price (LWAP), RUN_TIME, REGION_NAME
final_result = final_result[['LWAP', 'RUN_TIME', 'REGION_NAME']]


# Write the result to the output file in chunks
final_result.to_csv(output_file, index=False, chunksize=50000)

print("Processing complete.")

  result = grouped.apply(calculate_weighted_avg).reset_index()


Processing complete.
