In [None]:
# Libraries required by this notebook
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Load the raw CSV export.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = r'C:\Users\ekuej\Downloads\trial\PPP.csv'
data = pd.read_csv(file_path)

# Build one timestamp per row: Year and Day are concatenated and parsed as
# "<year><day-of-year>" (format '%Y%j'); Hour is then added as an offset.
year_and_day = data['Year'].astype(str) + data['Day'].astype(str)
hour_offset = pd.to_timedelta(data['Hour'], unit='h')
data['Datetime'] = pd.to_datetime(year_and_day, format='%Y%j') + hour_offset

# Index by timestamp so time-based operations (shift, resample) can be used.
data = data.set_index('Datetime')

In [None]:
# These columns are not needed for this project, so they are removed from the data.
unused_columns = ['Year', 'Day', 'Hour', 'Ap Index', 'AL index', 'AE index', 'AU index']
data = data.drop(columns=unused_columns)

In [None]:
# Values above the cutoff appeared as isolated spikes (presumably fill/placeholder values — TODO confirm), so they are treated as missing.
# Every value above the cutoff is replaced by NaN. DataFrame.mask is used
# instead of assigning pd.NA: NaN is the native missing marker for float
# columns, whereas pd.NA can turn numeric columns into object dtype on some
# pandas versions, which breaks the linear interpolation below.
# NOTE(review): the same cutoff is applied to every column — confirm that no
# column has legitimate values above it.
cutoff_value = 900
data = data.mask(data > cutoff_value)

# Fill the gaps by linear interpolation between neighbouring valid rows.
# Gaps at the very start of a column have no earlier value and remain NaN.
data = data.interpolate(method='linear')
    

In [None]:

def calculate_time_lagged_correlation(data, lag, fields, indices):
    """Correlate each index column with each field column shifted by ``lag`` rows.

    ``data[field].shift(lag)`` moves the field values ``lag`` rows down, so for a
    positive ``lag`` the index at row ``t`` is compared with the field at row
    ``t - lag``; a negative ``lag`` compares it with later field values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``fields`` and ``indices``.
    lag : int
        Number of rows to shift the field columns by (may be negative or zero).
    fields : sequence of str
        Column names that are shifted; they become the rows of the result.
    indices : sequence of str
        Column names that stay in place; they become the columns of the result.

    Returns
    -------
    pandas.DataFrame
        float64 frame (rows = ``fields``, columns = ``indices``) holding the
        correlation returned by ``Series.corr`` for each pair.
    """
    # An explicit float dtype keeps the result numeric; without it the empty
    # frame is created as object dtype and stays that way after assignment.
    correlations = pd.DataFrame(index=fields, columns=indices, dtype=float)
    for field in fields:
        # The shifted series depends only on the field, so compute it once
        # per field rather than once per (field, index) pair.
        shifted_data = data[field].shift(lag)
        for index in indices:
            correlations.at[field, index] = data[index].corr(shifted_data)
    return correlations

# Sweep configuration: lags from -24 to +24 rows, the columns that get shifted
# (fields) and the columns they are correlated against (indices).
lags = list(range(-24, 25))
fields = ['B,nt', 'Proton Density', 'Proton Speed']
indices = ['Kp Index', 'Dst-Index']

# One field-by-index correlation table per lag, keyed by the lag value.
time_lagged_cross_correlation = {
    lag: calculate_time_lagged_correlation(data, lag, fields, indices)
    for lag in lags
}

# Step 3: Plot all time-lagged correlations
def plot_all_time_lagged_correlations(correlations, lags, fields, indices):
    """Plot correlation versus lag for every field/index pair on one set of axes.

    Parameters
    ----------
    correlations : mapping
        Maps each lag to a table that supports ``.loc[field, index]``.
    lags : sequence
        Lag values to plot on the x axis; each must be a key of ``correlations``.
    fields, indices : sequence of str
        Row and column labels to look up in each table; one line is drawn
        per (field, index) combination.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for field, index in ((f, i) for f in fields for i in indices):
        # Collect this pair's correlation at every lag into one curve.
        curve = pd.DataFrame({
            'Lag': lags,
            'Correlation': [correlations[lag].loc[field, index] for lag in lags],
        })
        sns.lineplot(data=curve, x='Lag', y='Correlation',
                     label=f"{field} vs {index}", ax=ax)

    # Dashed zero line to separate positive from negative correlation.
    ax.axhline(0, color='grey', linestyle='--')
    ax.set_title("Time-Lagged Cross-Correlations")
    ax.set_xlabel('Time Lag (Hours)')
    ax.set_ylabel('Correlation Coefficient')
    ax.legend()
    ax.grid(True)
    plt.show()

# Draw every field/index correlation curve across the full lag range.
plot_all_time_lagged_correlations(
    correlations=time_lagged_cross_correlation,
    lags=lags,
    fields=fields,
    indices=indices,
)


In [None]:
# Independent deep copy of the cleaned data for the analyses that follow.
df_heat = data.copy(deep=True)

# Function for time window analysis
def time_window_analysis(df, column_name):
    """Resample one column at several frequencies, derive change statistics, and plot them.

    For each of the hourly, daily, weekly, monthly, quarterly and yearly
    windows the column is summed per window and extended with:

    * ``<col>_%_Change``: fractional change from the previous row;
    * ``<col>_Log`` and ``<col>_%_Change_Log``: ``log(1 + x)`` of the two
      columns above;
    * ``<col>_Log_Diff<p>``: difference of ``<col>_Log`` over ``p`` rows
      (24 for hourly data, 1 otherwise), and ``<col>_Log_Diff<p>_Diff``, its
      first difference;
    * ``<col>_Rolling_Window_Mean`` / ``_Std``: rolling statistics over 7 rows
      (hourly, daily, weekly) or 4 rows (monthly, quarterly, yearly);
    * ``<col>_Cumulative``: running total.

    One figure (one subplot per derived column) is shown per frequency and the
    first rows of the derived table are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index, as required by ``resample``.
    column_name : str
        Name of the column in ``df`` to analyse.

    Returns
    -------
    None
    """
    # Resampling frequency -> number of rows to look back for the log difference.
    # NOTE: pandas >= 2.2 deprecates the 'H', 'M', 'Q' and 'Y' aliases in favour
    # of 'h', 'ME', 'QE' and 'YE'; the original aliases are kept so the notebook
    # still runs on older pandas.
    periods_diff = {
        "H": 24,  # hourly data: compare with the same hour of the previous day
        "D": 1,   # daily data: compare with the previous day
        "W": 1,   # weekly data: compare with the previous week
        "M": 1,   # monthly data: compare with the previous month
        "Q": 1,   # quarterly data: compare with the previous quarter
        "Y": 1    # yearly data: compare with the previous year
    }

    # Only the requested column is used, so select it before resampling
    # instead of aggregating every column of the frame at every frequency.
    source = df[[column_name]]

    for freq, period in periods_diff.items():
        df_resampled = source.resample(freq).sum()

        # Basic calculations. The log is applied after the join, so it covers
        # both the summed column and its percentage change.
        df_resampled = df_resampled.join(df_resampled.pct_change().add_suffix('_%_Change'))
        df_resampled = df_resampled.join(np.log1p(df_resampled).add_suffix('_Log'))

        # Difference calculations
        df_resampled[f'{column_name}_Log_Diff{period}'] = df_resampled[f'{column_name}_Log'].diff(periods=period)
        df_resampled[f'{column_name}_Log_Diff{period}_Diff'] = df_resampled[f'{column_name}_Log_Diff{period}'].diff()

        # Rolling window calculations. The window is counted in rows of the
        # resampled frame: 7 rows for the shorter periods, 4 for the longer ones.
        window = 7 if freq in ['H', 'D', 'W'] else 4
        rolling = df_resampled[column_name].rolling(window=window)
        df_resampled[f'{column_name}_Rolling_Window_Mean'] = rolling.mean()
        df_resampled[f'{column_name}_Rolling_Window_Std'] = rolling.std()
        df_resampled[f'{column_name}_Cumulative'] = df_resampled[column_name].cumsum()

        # Plotting: one subplot per derived column.
        df_resampled.plot(subplots=True, figsize=(10, 10), title=f'{column_name}_{freq}_Time_Window')
        plt.show()

        print(f'{column_name}_{freq}_Time_Window Analysis')
        print(df_resampled.head(), '\n')

# Run the time-window analysis once for each column of interest.
columns_to_analyze = [
    'B,nt',
    'Proton Density',
    'Proton Speed',
    'Kp Index',
    'Dst-Index',
]
for column in columns_to_analyze:
    time_window_analysis(df=df_heat, column_name=column)

In [None]:

def _plot_correlation_heatmap(correlations, title, save_path=None):
    """Show an annotated heatmap of a correlation matrix.

    Parameters
    ----------
    correlations : pandas.DataFrame
        Square correlation matrix to draw.
    title : str
        Title placed above the heatmap.
    save_path : str or path-like, optional
        When given, the figure is also written to this path before being shown.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title(title)
    # Tilt the tick labels on both axes by 45 degrees.
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.setp(ax.get_yticklabels(), rotation=45)
    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()


# Columns that take part in the correlation analysis.
extended_indices = data[['Kp Index', 'Dst-Index']]
magnetic_fields = data[['BX,nt', 'BY,nt', 'BZ,nt']]

# Concatenate the two DataFrames side by side for correlation analysis.
combined_data = pd.concat([extended_indices, magnetic_fields], axis=1)

# Pearson and Spearman correlation matrices over the combined columns.
pearson_correlations = combined_data.corr(method='pearson')
spearman_correlations = combined_data.corr(method='spearman')

# One heatmap per correlation method; pass save_path=... to also write a PNG.
_plot_correlation_heatmap(pearson_correlations, 'Pearson Correlation Matrix')
_plot_correlation_heatmap(spearman_correlations, 'Spearman Correlation Matrix')


In [None]:
# Function to create scatter plots for comparison
def compare_parameters(df, param1, param2, time_intervals):
    """Scatter-plot ``param2`` against ``param1`` after averaging over each time interval.

    One panel is drawn per entry of ``time_intervals``; all panels share the
    y axis and sit side by side in a single figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index (required by ``resample``) containing
        both ``param1`` and ``param2``.
    param1 : str
        Column plotted on the x axis.
    param2 : str
        Column plotted on the y axis.
    time_intervals : sequence of str
        Resampling frequencies passed to ``df.resample``; each one gets a panel.

    Raises
    ------
    ValueError
        If ``time_intervals`` is empty.
    """
    time_intervals = list(time_intervals)
    if not time_intervals:
        raise ValueError("time_intervals must contain at least one resampling interval")

    # squeeze=False always yields a 2-D array of Axes. With the default, a
    # single interval returns a bare Axes object, which cannot be zipped.
    fig, axes = plt.subplots(nrows=1, ncols=len(time_intervals), figsize=(15, 5),
                             sharey=True, squeeze=False)
    fig.suptitle(f'Comparison of {param1} and {param2} over Different Time Intervals')

    # Only the compared columns need averaging; avoid a duplicated column
    # label when both parameters are the same.
    columns = [param1] if param1 == param2 else [param1, param2]
    subset = df[columns]

    for ax, interval in zip(axes[0], time_intervals):
        # Resampling data
        resampled_data = subset.resample(interval).mean()

        # Scatter plot
        ax.scatter(resampled_data[param1], resampled_data[param2], alpha=0.5)
        ax.set_title(f'{interval} Interval')
        ax.set_xlabel(param1)
        ax.set_ylabel(param2)

    fig.tight_layout()
    plt.show()

# Averaging windows for the scatter plots: daily, monthly, yearly.
time_intervals = ['D', 'M', 'Y']

# Every field paired with every index, grouped by field.
pairs_to_compare = [
    (field_name, index_name)
    for field_name in ('Proton Density', 'B,nt', 'Proton Speed')
    for index_name in ('Kp Index', 'Dst-Index')
]

# One figure per pair, with one panel per averaging window.
for pair in pairs_to_compare:
    x_param, y_param = pair
    compare_parameters(df_heat, x_param, y_param, time_intervals)
