In [13]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.feature_selection import VarianceThreshold

In [14]:
# Load the cleaned dataset produced by the previous pipeline stage.
input_path = Path("../data/processed/cleaned_gravity_speed_data.csv")
df = pd.read_csv(input_path)

# Report the frame dimensions, then preview the first rows as the cell output.
n_rows, n_cols = df.shape
print(f"Data loaded: {n_rows} rows and {n_cols} columns.")
df.head()

Data loaded: 1000 rows and 9 columns.


Unnamed: 0,mass_kg,radius_m,velocity_m_s,gravitational_dilation,velocity_dilation,combined_dilation,time_far_s,time_near_s,time_difference_s
0,6.620184e+29,5356206000.0,81283750.0,1.0,1.038859,0.962595,1.0,0.962595,0.037405
1,8.862898e+29,8749384000.0,120781200.0,1.0,1.092449,0.915374,1.0,0.915374,0.084626
2,2.994138e+29,2869731000.0,62450750.0,1.0,1.022398,0.978093,1.0,0.978093,0.021907
3,4.3020859999999995e+29,6169629000.0,199736600.0,1.0,1.340231,0.74614,1.0,0.74614,0.25386
4,3.1167459999999997e+29,2609683000.0,157900600.0,1.0,1.176087,0.850277,1.0,0.850277,0.149723


In [15]:
# Append a log1p-transformed copy of every strictly positive numeric column.
#
# The positivity mask is computed on the numeric sub-frame itself so that it
# always has one entry per candidate column. Building the mask from the full
# frame only works while *every* column of `df` is numeric: a string column
# makes `df > 0` raise, and any other non-selected column makes the mask longer
# than the column index it is supposed to filter.
numeric_df = df.select_dtypes(include=["float64", "int64"])
log_cols = numeric_df.columns[(numeric_df > 0).all()].tolist()

# validate=True makes scikit-learn check the input array before applying
# np.log1p.
log_transformer = FunctionTransformer(np.log1p, validate=True)

# The transformer returns a plain array, so the column labels are rebuilt here
# with a "log1p_" prefix to keep them distinguishable from the raw columns.
log_df = pd.DataFrame(
    log_transformer.fit_transform(df[log_cols]),
    columns=[f"log1p_{col}" for col in log_cols],
)

# Both frames get a fresh positional index so the column-wise concat lines the
# rows up one-to-one instead of aligning on whatever index `df` carried.
df = pd.concat([df.reset_index(drop=True), log_df.reset_index(drop=True)], axis=1)

print(f"🔢 Applied log1p transform to {len(log_cols)} positive numeric columns.")

🔢 Applied log1p transform to 9 positive numeric columns.


In [16]:
# Expand the numeric columns into degree-2 polynomial terms and append only the
# newly created terms to `df`.
poly_features = list(df.select_dtypes(include=["float64", "int64"]).columns)
poly_input = df[poly_features]

# include_bias=False leaves out the constant column of ones.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_array = poly.fit_transform(poly_input)
poly_feature_names = poly.get_feature_names_out(poly_features)

# The expansion also contains the unmodified input columns; they are removed
# here because `df` already holds them.
poly_df = pd.DataFrame(poly_array, columns=poly_feature_names).drop(
    columns=poly_features
)

df = pd.concat(
    [df.reset_index(drop=True), poly_df.reset_index(drop=True)],
    axis=1,
)
print(f"🔧 Added {poly_df.shape[1]} polynomial features (degree=2, without original features).")

🔧 Added 171 polynomial features (degree=2, without original features).


In [17]:
# Prune near-duplicate features: a column is dropped when its absolute
# correlation with any column to its left exceeds the threshold.
CORR_THRESHOLD = 0.97

corr_matrix = df.corr(numeric_only=True).abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair
# is inspected once and no column is compared against itself. Cells outside the
# triangle become NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# NaN entries compare as False, so they never trigger a drop. The column-wise
# .any() replaces a Python-level any() over each Series.
to_drop = upper.columns[(upper > CORR_THRESHOLD).any()].tolist()

# Rebind rather than mutate in place, so any other reference to the previous
# frame is left untouched.
df = df.drop(columns=to_drop)
print(f"Dropped {len(to_drop)} highly correlated features.")

Dropped 158 highly correlated features.


In [18]:
# Filter the numeric columns through a zero-threshold variance selector and
# keep only the columns it retains.
numeric_view = df.select_dtypes(include=[np.number])

selector = VarianceThreshold(threshold=0.0)
reduced_data = selector.fit_transform(numeric_view)

# get_support() yields a boolean mask over the fitted columns; it is used to
# recover the surviving column labels, which the returned array does not carry.
keep_mask = selector.get_support()
selected_cols = numeric_view.columns[keep_mask]
df = df[selected_cols]

print(f"Remaining features after variance threshold: {df.shape[1]}")

Remaining features after variance threshold: 26


In [19]:
# Write the engineered feature table to the processed-data directory.
output_path = Path("../data/processed/gravity_data_engineered.csv")

# Create the destination directory (and any missing parents) if needed.
target_dir = output_path.parent
target_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file.
df.to_csv(path_or_buf=output_path, index=False)
print(f"Feature-engineered dataset saved to: {output_path}")


Feature-engineered dataset saved to: ..\data\processed\gravity_data_engineered.csv
