In [6]:
#file to determine the percentage of missing values and to impute missing values if less than 10% for a column
import os
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Folder path containing the data files
folder_path = 'D:/internship 2022/data/regression_variables/variables_ts_clean_data_avail/'

# Get the list of file names in the folder
file_names = os.listdir(folder_path)

# List to store the imputed DataFrames
imputed_data = []

In [7]:

# Iterate through each file in the folder
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    reservoir_name = os.path.splitext(file_name)[0]
    
    # Load the data file into a DataFrame
    data = pd.read_csv(file_path)
    
    # Calculate the percentage of missing values in each column
    missing_percentage = data.isnull().sum() / len(data) * 100
    
    # Filter columns with missing percentage greater than X%
    filtered_columns = missing_percentage[missing_percentage > 20]
    
    # Print the reservoir name and filtered columns with missing percentage
    if len(filtered_columns) > 0:
        print(f'Reservoir: {reservoir_name}')
        for column, percentage in filtered_columns.items():
            print(f'Column: {column}, Missing Percentage: {percentage:.2f}%')
        
        print('-' * 50)

Reservoir: grupiara_var
Column: height, Missing Percentage: 28.69%
--------------------------------------------------
Reservoir: ilha_solteria
Column: height, Missing Percentage: 26.50%
--------------------------------------------------
Reservoir: irape_var
Column: tws, Missing Percentage: 20.55%
Column: volume, Missing Percentage: 100.00%
Column: discharge, Missing Percentage: 100.00%
Column: Res_time, Missing Percentage: 100.00%
Column: depth, Missing Percentage: 100.00%
Column: wshd_area, Missing Percentage: 100.00%
--------------------------------------------------
Reservoir: itaipu_var
Column: Unnamed: 15, Missing Percentage: 100.00%
--------------------------------------------------
Reservoir: itumbiara_var
Column: discharge, Missing Percentage: 100.00%
Column: Res_time, Missing Percentage: 100.00%
Column: depth, Missing Percentage: 100.00%
Column: wshd_area, Missing Percentage: 100.00%
Column: elevation, Missing Percentage: 100.00%
-----------------------------------------------

In [15]:
import os
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# Loop through each file in the folder
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    
    # Load the data file into a DataFrame
    data = pd.read_csv(file_path)
    
    # Select the columns for imputation
    columns_to_impute = ['height', 'tws']
    
    # Create an IterativeImputer object
    imputer = IterativeImputer()
    
    # Impute missing values in the selected columns
    imputed_values = imputer.fit_transform(data[columns_to_impute])
    
    # Create a DataFrame with the imputed values
    imputed_data = pd.DataFrame(imputed_values, columns=columns_to_impute)
    
    # Combine the imputed data with the non-imputed columns
    non_imputed_columns = [col for col in data.columns if col not in columns_to_impute]
    combined_data = pd.concat([imputed_data, data[non_imputed_columns]], axis=1)
    
    # Create a new file name for the combined data
    combined_file_name = f'combined_{file_name}'
    new_folder_path = 'D:/internship 2022/data/regression_variables/imputed_variables'
    combined_file_path = os.path.join(new_folder_path, combined_file_name)
    
    # Export the combined data to a new CSV file
    combined_data.to_csv(combined_file_path, index=False)
    
    print(f'Combined data (imputed and non-imputed) saved to: {combined_file_path}')


Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_embocaco_var.csv
Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_frunas_var.csv
Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_grupiara_var.csv
Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_iepe_var.csv
Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_iguazu_var.csv
Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_ilha_solteria.csv
Combined data (imputed and non-imputed) saved to: D:/internship 2022/data/regression_variables/imputed_variables\combined_irape_var.csv
Combined data (imputed and non-impute

In [11]:

# Loop through each file in the folder and impute missing values
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    data = pd.read_csv(file_path)
    
    # Store the dates column separately
    dates = data['date']
    
    # Remove the dates column from the DataFrame
    data = data.drop('date', axis=1)
    
    # Create an IterativeImputer object
    imputer = IterativeImputer()
    
    # Impute missing values in the data
    imputed_array = imputer.fit_transform(data)
    
    # Convert the imputed array back to a DataFrame
    imputed_df = pd.DataFrame(imputed_array, columns=data.columns)
    
    # Add the dates column back to the DataFrame
    imputed_df['date'] = dates
    
    # Add the imputed DataFrame to the list
    imputed_data.append(imputed_df)

# Merge the imputed DataFrames on the 'Date' column
combined_data = imputed_data[0]  # Start with the first DataFrame
for i in range(1, len(imputed_data)):
    combined_data = pd.merge(combined_data, imputed_data[i], on='date')

# Optional: Sort the DataFrame by the 'Date' column
combined_data = combined_data.sort_values(by='date').reset_index(drop=True)


TypeError: Index(...) must be called with a collection of some kind, 9 was passed