In [None]:
#LOAD FILE
# Upload a semicolon-delimited CSV through the Colab widget, read the 13
# feature columns, convert decimal-comma strings to floats and zero-fill
# missing values. Leaves the cleaned feature table in `df`.
import pandas as pd  # imported here so the cell runs on a fresh kernel
from google.colab import files

uploaded_files = files.upload()
if not uploaded_files:
    # Without this guard an empty upload surfaces as an opaque IndexError.
    raise ValueError("No file was uploaded; re-run this cell and select the data file.")
# Only the first uploaded file is used.
uploaded_file_name = next(iter(uploaded_files))

# Define column names to use after skipping rows --> solution to file loading issue
column_names = ['TOTUSJH', 'TOTBSQ', 'TOTPOT', 'TOTUSJZ',
    'ABSNJZH', 'SAVNCPP', 'USFLUX', 'AREA_ACR', 'TOTFZ', 'MEANPOT',
    'R_VALUE', 'EPSZ', 'SHRGT45'
]

# The first two lines of the file are skipped, so the names are supplied
# explicitly; only the columns at positions 4..16 (13 columns) are read.
df = pd.read_csv(uploaded_file_name, delimiter=';', encoding='latin1', skiprows=2, usecols=range(4, 17), names=column_names)

print("Raw Data:")
print(df.head())

# Turn decimal commas into decimal points so the values can be parsed as floats.
df = df.replace({',': '.'}, regex=True)

# Convert every feature column to float (raises if a value is not numeric).
df = df.astype({col: float for col in column_names})

print("\nDataFrame after conversion to float:")
print(df.head())

# Handle missing values in df: replace NaN with 0.
df = df.fillna(0)

# Drop the target variable if present. "Flare Class" is not among the columns
# loaded above, so an unconditional drop raises KeyError; errors="ignore"
# keeps the cell runnable either way.
df = df.drop(columns=["Flare Class"], errors="ignore")

In [None]:
# SCALE DATA (feature variables)
# Standardize every feature column of `df` with StandardScaler and write the
# scaled values back into `df`.

# Not used in this cell; kept so any later cell relying on the bare name
# `combinations` still works.
from itertools import combinations
# The class is spelled `StandardScaler`; `Standardscaler` raises ImportError.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit and transform the data
scaled_data = scaler.fit_transform(df[column_names])
# index=df.index keeps scaled_df row-aligned with df even when df does not
# carry a default 0..n-1 index.
scaled_df = pd.DataFrame(scaled_data, columns=column_names, index=df.index)

# Assign the array positionally so no index alignment (and no accidental NaN)
# can occur when writing the scaled values back.
df[column_names] = scaled_data


print("\nScaled Data:")
print(df.head())


In [None]:
# CREATE INTERACTION FEATURES

import itertools
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    """Return the columns of ``df`` plus all pairwise interaction terms.

    The interaction terms are produced by ``PolynomialFeatures`` with
    ``interaction_only=True`` and ``include_bias=False``. Each interaction
    column is named ``"<a>_<b>"`` after the two source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Original and interaction columns, on the same row index as ``df``,
        with every column whose values are all zero removed.
    """
    # Work on a copy so the caller's column labels are left untouched.
    features = df.copy()
    # Ensure the column names are strings
    features.columns = [str(col) for col in features.columns]

    # Names for the output: the original columns followed by one name per
    # unordered column pair. This assumes PolynomialFeatures emits the
    # pairwise products in itertools.combinations order.
    base_names = list(features.columns)
    pair_names = ['_'.join(pair) for pair in itertools.combinations(base_names, 2)]

    # Create PolynomialFeatures object (default degree: originals + pairwise products)
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    transformed = poly.fit_transform(features)

    # Create a DataFrame with the original and interaction columns, keeping
    # the input's row index so the result stays aligned with ``df``.
    df_interactions = pd.DataFrame(
        transformed, columns=base_names + pair_names, index=features.index
    )

    # Remove columns where all values are zero (no interaction)
    all_zero = (df_interactions == 0).all()
    df_interactions = df_interactions.drop(columns=all_zero[all_zero].index)

    return df_interactions

# Build the interaction-augmented feature table from df and preview its first rows
df_interactions = add_interactions(df)
print("Data with Interaction Features:", df_interactions.head(), sep="\n")



In [None]:
# COVARIANCE HEATMAP

# Pairwise covariance between the columns of df
cov_matrix = df.cov()

# Draw the annotated heatmap on an explicitly created 10x8 figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cov_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Covariance Matrix Heatmap")
plt.show()

In [None]:
# DETERMINE CUMULATIVE EXPLAINED VARIANCE (= total amount of variance captured by a certain number of principal components)

# Ensure your data (X) is standardized before applying PCA
# NOTE(review): df_interactions is built from the scaled feature columns, but
# the interaction (product) columns are not re-standardized afterwards;
# confirm this is intended.
from sklearn.decomposition import PCA

pca = PCA()
# Fit PCA on the data
pca.fit(df_interactions)

# Running total of the explained-variance ratios; entry i covers i + 1 components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# 1-based component counts, so the x axis really shows "number of components"
# (plotting the array alone would start the axis at 0).
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Plot the cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance by Principal Components')
plt.grid()
plt.show()

# Determine the number of components to explain desired variance (e.g., 95%)
variance_threshold = 0.95
reached = cumulative_variance >= variance_threshold
# argmax returns the first True position; on an all-False mask it would
# return 0 and wrongly report one component, so fall back to all components.
n_components = np.argmax(reached) + 1 if reached.any() else len(cumulative_variance)

print(f"Number of components to explain {variance_threshold*100}% variance: {n_components}")


In [None]:
# TARGET VARIABLE (Solar flare class) ENCODING FROM GOES CLASS TO NUMERIC (hierarchical)

def encode_flare_class(flare_class):
    """Encode a flare class label as a single number.

    The first character selects a base offset (A=0, B=100, C=200, M=300,
    X=400) and the remainder of the label is added to it as the sub-class
    magnitude, e.g. ``"C5"`` -> 205 and ``"M1.5"`` -> 301.5.

    Parameters
    ----------
    flare_class : str
        Class letter followed by a numeric magnitude.

    Returns
    -------
    int or float
        ``int`` when the magnitude is written as an integer, ``float`` when
        it has a decimal part.

    Raises
    ------
    IndexError
        If ``flare_class`` is empty.
    ValueError
        If the magnitude part is not a number.
    KeyError
        If the class letter is not one of A, B, C, M, X.
    """
    mapping = {'A': 0, 'B': 100, 'C': 200, 'M': 300, 'X': 400}
    main_class = flare_class[0]
    magnitude = flare_class[1:]
    try:
        # Integer magnitudes keep returning an int, as before.
        sub_class = int(magnitude)
    except ValueError:
        # Decimal magnitudes such as "1.5" used to raise here; parse them
        # as floats instead (still ValueError if not numeric at all).
        sub_class = float(magnitude)
    return mapping[main_class] + sub_class

# The LOAD FILE cell reads only the feature columns and drops "Flare Class",
# so the target is normally absent from df at this point. Fail with an
# explanatory message instead of a bare KeyError.
if 'Flare Class' not in df.columns:
    raise KeyError(
        "'Flare Class' column not found in df. The LOAD FILE cell reads only "
        "the feature columns and drops the target; load the 'Flare Class' "
        "column into df before running this cell."
    )
# Encode each flare class label numerically and store the result in a new column.
df['flare_class_encoded'] = df['Flare Class'].apply(encode_flare_class)

