In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import splrep, splev

In [None]:
# Spline Interpolation

import matplotlib.pyplot as plt

def interpolate_spline(df, column_name, plot=True):
    """Fill the missing values of one column with a spline fitted per day.

    The frame is sorted by ``date`` and re-indexed in place, then processed one
    ``date_only`` group at a time:

    1. A missing first row takes the first valid value of that day.
    2. A missing last row takes the last valid value of that day.
    3. Every remaining gap is evaluated on an interpolating spline fitted
       through the day's known points, using the row labels as x coordinates.

    Args:
        df: DataFrame with ``date``, ``date_only`` and ``column_name`` columns.
            It is modified in place (``date`` is converted to datetime, rows
            are sorted and re-indexed, gaps are filled).
        column_name: Name of the numeric column to fill.
        plot: When True (the default, matching the previous behaviour) a figure
            is shown for every day that needed interpolation. Requires the
            notebook-level ``plt`` and ``np`` imports.

    Returns:
        The same DataFrame object, after filling.

    Notes:
        * A day without a single valid value is left untouched, because there
          is nothing to anchor a fit on.
        * The spline degree is cubic when the day has at least four known
          points and is lowered otherwise, since ``splrep`` needs more points
          than the degree.
    """
    # Parse first so the sort is chronological even if the raw strings would
    # not sort lexicographically.
    df['date'] = pd.to_datetime(df['date'])
    df.sort_values(by='date', inplace=True)
    df.reset_index(drop=True, inplace=True)

    for day, group in df.groupby('date_only'):
        observed = group[column_name].dropna()
        if observed.empty:
            # No valid value on this day: nothing to copy or fit.
            continue

        # Cases 1 and 2: pin both ends of the day to the nearest valid value.
        first_label, last_label = group.index[0], group.index[-1]
        if pd.isna(group.loc[first_label, column_name]):
            df.loc[first_label, column_name] = observed.iloc[0]
        if pd.isna(group.loc[last_label, column_name]):
            df.loc[last_label, column_name] = observed.iloc[-1]

        # Case 3: interior gaps. Re-read the day so the pinned ends are included.
        day_values = df.loc[group.index, column_name]
        gap_mask = day_values.isna().to_numpy()
        if not gap_mask.any():
            # Nothing left to fill, so no spline is needed.
            continue

        labels = day_values.index
        known_x = labels[~gap_mask].to_numpy(dtype=float)
        known_y = day_values[~gap_mask].to_numpy(dtype=float)
        gap_labels = labels[gap_mask]
        gap_x = gap_labels.to_numpy(dtype=float)

        # Both ends are known here, so there are at least two points and the
        # degree is at least 1.
        degree = min(3, len(known_x) - 1)
        spline = splrep(known_x, known_y, k=degree)
        gap_y = splev(gap_x, spline)

        if plot:
            figure = plt.figure()
            plt.plot(known_x, known_y, 'o', label='Original points')

            # Dense sampling of the fitted curve for display only.
            finer_x = np.linspace(known_x.min(), known_x.max(), 100)
            plt.plot(finer_x, splev(finer_x, spline), '-', label='Spline curve')
            plt.plot(gap_x, gap_y, 'x', label='Interpolated values')

            plt.title(f"Spline Interpolation for {column_name} on {day}")
            plt.legend()
            plt.show()
            # Release the figure; many days can otherwise pile up in memory.
            plt.close(figure)

        df.loc[gap_labels, column_name] = gap_y

    print(f"final df : \n{df}")
    return df


In [None]:
# Linear Interpolation

import matplotlib.pyplot as plt

def interpolate_linear(df, column_name):
    """Fill the missing values of one column by per-day linear interpolation.

    The frame is sorted by ``date`` and re-indexed in place, then processed one
    ``date_only`` group at a time:

    * A day whose values are all missing is removed from the result.
    * A missing first row takes the first valid value of that day.
    * A missing last row takes the last valid value of that day.
    * Remaining gaps are filled linearly, using only that day's values.

    Args:
        df: DataFrame with ``date``, ``date_only`` and ``column_name`` columns.
            Sorting, re-indexing and filling are applied to this object.
        column_name: Name of the column to fill.

    Returns:
        The filled DataFrame. It is the same object as ``df`` unless some days
        had to be dropped; in that case a new frame is returned that keeps the
        surviving rows' labels (it is not re-indexed). Always use the return
        value.
    """
    df.sort_values(by='date', inplace=True)
    df.reset_index(drop=True, inplace=True)

    rows_to_drop = []
    for day, group in df.groupby('date_only'):
        values = group[column_name]
        observed = values.dropna()

        if observed.empty:
            # Nothing to anchor on: remember the rows and drop them once, after the loop.
            rows_to_drop.extend(group.index)
            continue
        if len(observed) == len(values):
            # The day is already complete.
            continue

        # First case - missing first row takes the closest later valid value.
        first_label = group.index[0]
        if pd.isna(values.loc[first_label]):
            df.loc[first_label, column_name] = observed.iloc[0]

        # Second case - missing last row takes the closest earlier valid value.
        last_label = group.index[-1]
        if pd.isna(values.loc[last_label]):
            df.loc[last_label, column_name] = observed.iloc[-1]

        # Third case - interior gaps, interpolated linearly within this day only.
        day_values = df.loc[group.index, column_name]
        if day_values.isna().any():
            df.loc[group.index, column_name] = day_values.interpolate(method='linear')

    if rows_to_drop:
        df = df.drop(index=rows_to_drop)

    return df

In [None]:
# Site names; the cells below build each CSV file name as "<name>.csv".
generator_set = [
    "samcheonpo",
    "yeongheung",
    "yeongdong",
    "gumi",
    "gwangyanghang",
    "dusan",
    "gyeongsangUni",
    "yecheon",
    "goheungman",
]

In [None]:
# For every generator file: drop the columns in which more than 40% of the
# values are missing, linearly interpolate every remaining column, and save.

for generator in generator_set:
    file_path = f'/content/drive/MyDrive/하늘/data files/averaged/{generator}.csv'
    path = f'/content/drive/MyDrive/하늘/data files/nullcheck_interpolated/{generator}.csv'
    df = pd.read_csv(file_path)
    print(f"***** file : {generator}{df.shape} *****")

    # Collect the sparse columns first, then remove them in one call.
    sparse_columns = [
        feature for feature in df.columns
        if df[feature].isnull().mean() > 0.4
    ]
    df = df.drop(columns=sparse_columns)

    print(f"\nafter null check - {df.columns}\n")

    # interpolate_linear may return a new frame (it drops days that have no
    # value at all for the column), so keep rebinding df to its result.
    for feature in df.columns:
        df = interpolate_linear(df, feature)

    # Default to_csv arguments: the index is written as the first column too.
    df.to_csv(path)

***** file : samcheonpo(7845, 14) *****

after null check - Index(['date', 'date_only', 'temperature', 'wind speed', 'humidity',
       'air pressure', 'sunshine', 'cloud', 'ground temperature',
       'total_weighted_average_power', 'power_plant'],
      dtype='object')

***** file : yeongheung(7793, 14) *****

after null check - Index(['date', 'date_only', 'temperature', 'wind speed', 'humidity',
       'air pressure', 'sunshine', 'insolation', 'cloud', 'ground temperature',
       'total_weighted_average_power', 'power_plant'],
      dtype='object')

***** file : yeongdong(7644, 14) *****

after null check - Index(['date', 'power_plant', 'total_weighted_average_power', 'date_only',
       'temperature', 'wind speed', 'humidity', 'air pressure', 'sunshine',
       'insolation', 'cloud', 'ground temperature'],
      dtype='object')

***** file : gumi(7547, 14) *****

after null check - Index(['date', 'power_plant', 'total_weighted_average_power', 'date_only',
       'temperature', 'wi

In [None]:
# Check whether any null value is left in the interpolated files: print each
# file's shape, plus the per-column null counts for files that still have gaps.

for generator in generator_set:
    path = '/content/drive/MyDrive/하늘/data files/nullcheck_interpolated/'+generator+'.csv'
    df = pd.read_csv(path)
    print(df.shape)

    # Keep only the columns that still contain at least one missing value.
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    if not null_counts.empty:
        print(f"{generator} - columns with null values left:\n{null_counts}")

(7779, 13)
(7779, 13)
(7644, 13)
(7547, 12)
(7529, 13)
(7516, 13)
(7330, 13)
(7096, 12)
(665, 12)
