In [None]:
from src.ThesisProject.CF_table.CF_interval_detect import detect_and_plot
import os
from src.DatabaseGetInfo import DatabasePlotter as dbPlot
from src.DatabaseGetInfo import DatabaseAnalyzer
import sqlite3
import pandas as pd
from src.GLOBAL_VARS import NotTrustedPlants

# Database path
path = os.path.abspath('/home/wheatley/WFD/wfd.db')
analyzer = DatabaseAnalyzer.WindFarmAnalyzer(path)
plotter = dbPlot

# Ids of every wind farm whose license_status is 'Yürürlükte', in ascending order.
#wf_id_query = """select wf_id from wf where wf.city = 'İZMİR' and wf.license_status = 'Yürürlükte' order by wf_id"""
wf_id_query = """select wf_id from wf where wf.license_status = 'Yürürlükte' order by wf_id"""

# Close the connection explicitly: a bare sqlite3.connect(...).execute(...)
# leaves the handle open until garbage collection.
connection = sqlite3.connect(path)
try:
    wf_ids = [row[0] for row in connection.execute(wf_id_query).fetchall()]
finally:
    connection.close()

#wf_ids = wf_ids[:3]
#wf_ids = [264]

start_date = '2020-01-01'
end_date = '2024-01-01'

# Number of hourly rows a plant would have if it covered the whole requested
# window. Derived from the dates so it stays correct when the window changes
# and accounts for leap days (a fixed 4 * 365 * 24 misses 2020-02-29).
expected_hours = (pd.Timestamp(end_date) - pd.Timestamp(start_date)) / pd.Timedelta(hours=1)

trends = {}
# wf_id -> (completeness from row count, completeness from first/last timestamp)
data_completeness = {}

#NotTrustedPlants = []

# print(len(wf_ids))

# Per-plant frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated data every iteration.
filtered_frames = []

for wf_id in wf_ids:
    if wf_id in NotTrustedPlants:
        continue
    productions_filtered = detect_and_plot(wf_id, start_date, end_date, plot=False)
    try:
        # Share of the expected hourly rows that are actually present
        # (each row is one hour).
        number_of_rows = len(productions_filtered)
        completeness = number_of_rows / expected_hours

        # Same ratio, but measured from the first to the last timestamp:
        # the timedelta is converted to hours and divided by the window length.
        date_diff = productions_filtered['timestamp'].iloc[-1] - productions_filtered['timestamp'].iloc[0]
        date_diff = date_diff / pd.Timedelta(hours=1)
        date_diff = date_diff / expected_hours

        #print(f"Number of rows: {number_of_rows}, completeness_from_rows: {completeness}, completeness_from_dates: {date_diff}")

        data_completeness[wf_id] = completeness, date_diff
    except (TypeError, IndexError):
        # TypeError: detect_and_plot returned something without rows (e.g. None).
        # IndexError: the returned frame is empty, so there is no first/last row.
        print(f"Error for wf_id {wf_id}")

    if isinstance(productions_filtered, pd.DataFrame):
        filtered_frames.append(productions_filtered)
    elif productions_filtered is not None:
        # Anything that is neither a frame nor None cannot be concatenated.
        print(f"Error for wf_id {wf_id}: {productions_filtered}")

# pd.concat raises on an empty list, so fall back to an empty frame.
all_data = pd.concat(filtered_frames, ignore_index=True) if filtered_frames else pd.DataFrame()

print(data_completeness)

In [None]:
# Dump the completeness table as header-less CSV rows:
# wf_id, ratio from row count, ratio from first/last timestamp
with open('data_completeness.csv', 'w') as f:
    f.writelines(
        "%s,%s,%s\n" % (wf_id, from_rows, from_dates)
        for wf_id, (from_rows, from_dates) in data_completeness.items()
    )

In [None]:
# Columns that are not carried into the combined hourly table
unused_columns = ['period', 'below_threshold', 'capacity_threshold', 'installed_capacity', 'availability_percentage']

# Reducer applied to each column when rows sharing a timestamp are merged
agg_functions = {
    'capacity_factor': 'mean',      # averaged across rows of the same timestamp
    'capacity_factor_ava': 'mean',
    'production': 'sum',            # totalled across rows of the same timestamp
    'ws100': 'mean',
    'availability': 'sum'
    # Add more columns as needed
}

# One row per timestamp: drop the unused columns, aggregate, discard rows
# without a capacity factor, then order chronologically with a fresh index.
all_data2 = (
    all_data.copy()
    .drop(columns=unused_columns)
    .groupby('timestamp')
    .agg(agg_functions)
    .reset_index()
    .dropna(subset=['capacity_factor'])
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

# Make sure the timestamp column holds datetimes
all_data2['timestamp'] = pd.to_datetime(all_data2['timestamp'])

In [None]:
# Hourly export: written with the positional index in the first column and
# 'timestamp' as a regular column.
all_data2.to_csv('CF_data_HH.csv', index=True)

# Monthly export: index by timestamp and average every column per calendar
# month ('MS' labels each bin with the month start).
all_data2 = all_data2.set_index('timestamp').resample('MS').mean()
all_data2.to_csv('CF_data_MS.csv', index=True)

# NOTE(review): 'CF_data_QE.csv' is read further down but is not written by
# this cell - confirm where the quarterly file is produced.

GRAPH


In [None]:
import pandas as pd

# The hourly CSV carries a positional index in its first column and the
# timestamps in a 'timestamp' column. Index on that column (parsed as dates)
# so downstream code gets real timestamps rather than row numbers.
CF_data_hh = pd.read_csv('CF_data_HH.csv', index_col='timestamp', parse_dates=True)

CF_data_hh

In [None]:
import pandas as pd

# The first column of the monthly CSV holds the timestamps; parse it so the
# frame is indexed by a DatetimeIndex instead of plain strings.
CF_data_ms = pd.read_csv('CF_data_MS.csv', index_col=0, parse_dates=True)

CF_data_ms

In [None]:
import pandas as pd

# Load the quarterly CSV with its first column as the index, parsed as dates.
# NOTE(review): assumes the first column holds the timestamps, as in the
# monthly file - confirm against whatever produces CF_data_QE.csv.
CF_data_qe = pd.read_csv('CF_data_QE.csv', index_col=0, parse_dates=True)

CF_data_qe

In [None]:
def plot_capacity_factor_trend(df, cf_column='capacity_factor', timestamp_column=None,
                              title_suffix="", figsize=(12, 8)):
    """
    Plot capacity factor with trendline and calculate percentage change per year.

    A straight line is fitted to the non-null capacity-factor values against
    their position (0, 1, 2, ...). The per-observation slope is converted to
    a per-year slope using the real time covered by the data
    (``(n - 1) / time_span_years`` observations per year), so the result is
    valid for any sampling frequency (hourly, monthly, quarterly, ...).
    The annual rate is expressed relative to the first observed value.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing capacity factor data
    cf_column : str, default 'capacity_factor'
        Name of the capacity factor column
    timestamp_column : str, optional
        Name of the timestamp column. If None, uses the index
    title_suffix : str, default ""
        Additional text to add to the plot title
    figsize : tuple, default (12, 8)
        Figure size as (width, height)

    Returns:
    --------
    dict : Dictionary containing analysis results:
        - 'annual_percentage_change': Annual percentage change rate
          (NaN when all timestamps are identical)
        - 'initial_cf': Initial capacity factor value
        - 'final_cf': Final capacity factor value
        - 'total_change': Total change over the period
        - 'time_span_years': Duration in years
        - 'r_squared': R² value of the trendline
        - 'p_value': Statistical significance
        - 'slope': Slope of the trendline (per observation)

    Raises:
    -------
    ValueError
        If fewer than two non-null capacity factor values are available.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy import stats
    import pandas as pd

    # Make a copy to avoid modifying the original
    data = df.copy()

    # Normalise the timestamp source to datetimes
    if timestamp_column is None:
        if not isinstance(data.index, pd.DatetimeIndex):
            data.index = pd.to_datetime(data.index)
    else:
        data[timestamp_column] = pd.to_datetime(data[timestamp_column])

    # Get capacity factor values
    cf_values = data[cf_column].dropna()
    n_points = len(cf_values)
    if n_points < 2:
        raise ValueError(
            "At least two non-null capacity factor values are required to fit a trend"
        )

    # Timestamps aligned with the non-null values. They are held in a
    # DatetimeIndex so that [0] / [-1] are positional; on a Series with an
    # integer index those lookups would be label-based and [-1] would fail.
    if timestamp_column is None:
        timestamps = pd.DatetimeIndex(cf_values.index)
    else:
        timestamps = pd.DatetimeIndex(data.loc[cf_values.index, timestamp_column])

    # Create the plot
    fig, ax1 = plt.subplots(figsize=figsize)

    # Plot capacity factor
    ax1.plot(timestamps, cf_values.to_numpy(), 'b-', linewidth=2, label='Capacity Factor', alpha=0.7)
    ax1.scatter(timestamps, cf_values.to_numpy(), color='blue', alpha=0.6, s=30)

    # Calculate trendline against the observation position
    x_numeric = np.arange(n_points)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x_numeric, cf_values.to_numpy())

    # Generate trendline
    trendline = slope * x_numeric + intercept
    ax1.plot(timestamps, trendline, 'r--', linewidth=2, label=f'Trendline (R² = {r_value**2:.3f})')

    # Real time covered by the data, in years
    time_span_years = (timestamps.max() - timestamps.min()) / pd.Timedelta(days=365.25)

    # Observations per year follow from the span instead of guessing the
    # frequency from the first gap (which treated hourly data as daily).
    if time_span_years > 0:
        periods_per_year = (n_points - 1) / time_span_years
    else:
        # All timestamps identical: an annual rate is undefined
        periods_per_year = float('nan')

    annual_slope = slope * periods_per_year
    initial_cf = cf_values.iloc[0]
    percentage_change_per_year = (annual_slope / initial_cf) * 100

    # Set labels and title
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_ylabel('Capacity Factor', fontsize=12, color='blue')

    main_title = f'Capacity Factor Over Time{title_suffix}'
    subtitle = f'Annual Change Rate: {percentage_change_per_year:.2f}%'
    ax1.set_title(f'{main_title}\n{subtitle}', fontsize=14)

    ax1.tick_params(axis='y', labelcolor='blue')
    ax1.grid(True, alpha=0.3)
    ax1.legend()

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45)

    # Add statistics text box
    textstr = f'''Statistics:
Initial CF: {initial_cf:.3f}
Final CF: {cf_values.iloc[-1]:.3f}
Total Change: {(cf_values.iloc[-1] - initial_cf):.3f}
Annual Change Rate: {percentage_change_per_year:.2f}%/year
Time Period: {time_span_years:.1f} years
R²: {r_value**2:.3f}
p-value: {p_value:.4f}'''

    props = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
    ax1.text(0.02, 0.98, textstr, transform=ax1.transAxes, fontsize=10,
             verticalalignment='top', bbox=props)

    plt.tight_layout()
    plt.show()

    # Return analysis results
    return {
        'annual_percentage_change': percentage_change_per_year,
        'initial_cf': initial_cf,
        'final_cf': cf_values.iloc[-1],
        'total_change': cf_values.iloc[-1] - initial_cf,
        'time_span_years': time_span_years,
        'r_squared': r_value**2,
        'p_value': p_value,
        'slope': slope
    }

# Draw the trend plot for each resolution in turn; `results` ends up holding
# the statistics of the last (hourly) run.
for frame, suffix in (
    (CF_data_qe, " (Quarterly Data)"),
    (CF_data_ms, " (Monthly Data)"),
    (CF_data_hh, " (Hourly Data)"),
):
    results = plot_capacity_factor_trend(frame, title_suffix=suffix)

# Caveat: all of these rates are wrong, because not every plant has data for
# the whole 4-year period.