In [11]:
import pandas as pd

In [12]:
data_path = "../data/combined/"
data_file = "amplitude_csi_dataframe.pkl"

DISCRETE_VARIABLES = ["person"]
TARGET_VARIABLE = "position"
STATE = 42

data_df: pd.DataFrame = pd.read_pickle(data_path + data_file)

# Convert all column names to strings
data_df.columns = data_df.columns.astype(str)

print(data_df.head())

   person  position            6            7            8            9  \
0       1        17   795.910156   849.388000   890.166809   912.882263   
1       1        17   798.279419   843.614258   868.484314   895.013977   
2       1        17  1064.543091  1086.945312  1105.320312  1135.975342   
3       1        17  1060.771362  1092.156128  1112.137573  1130.086670   
4       1        17  1329.939087  1409.457397  1416.469604  1432.482056   

            10           11           12           13  ...        241  \
0   946.926086   979.547363  1059.871704  1146.253052  ...  52.201534   
1   921.195984   935.745667  1021.312866  1114.450928  ...  33.526108   
2  1170.029907  1180.800171  1267.976318  1369.742310  ...  47.539455   
3  1138.223999  1126.414673  1193.649902  1277.542969  ...  26.870058   
4  1445.448364  1401.856567  1508.621948  1599.805298  ...  51.156624   

         242        243        244        245         246         247  \
0  41.773197  24.515301  16.552946  3

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

numerical_columns = [
    col
    for col in data_df.columns
    if col not in DISCRETE_VARIABLES and col != TARGET_VARIABLE
]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", "passthrough", DISCRETE_VARIABLES),
        ("num", numeric_transformer, numerical_columns),
    ]
)

X = data_df.drop(columns=[TARGET_VARIABLE])
X = preprocessor.fit_transform(X)
y = data_df[TARGET_VARIABLE]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)

print(X_train[:5])
print(y_train[:5])
print(X_test[:5])
print(y_test[:5])

[[44.          0.38642046  0.29976055 ...  2.169638    2.7544522
  -0.07609867]
 [26.         -0.69760364 -0.6823467  ... -0.5210067  -0.4048676
  -0.7114199 ]
 [52.         -0.6339469  -0.64925706 ... -0.5595189  -0.6062821
   0.19943404]
 [ 8.         -0.88174653 -0.9246833  ... -1.0051556  -0.37353423
  -0.69077873]
 [49.          1.1035314   1.0955558  ... -0.37058067 -0.4393156
   0.4879667 ]]
1559868    17
903800     12
1865988     8
277030      7
1732226    13
Name: position, dtype: uint8
[[46.         -0.21375568 -0.2730182  ... -0.98993266 -0.86679655
  -0.8598902 ]
 [10.         -1.2045823  -1.1377039  ... -0.75811535  0.9736769
  -0.10447231]
 [33.         -1.7443719  -1.7738211  ... -0.71422815 -0.5750575
  -0.17337221]
 [56.         -1.4034363  -1.4370869  ... -0.5831633  -0.74591374
  -0.72425234]
 [32.         -1.1189996  -1.1041296  ... -0.22216593 -0.5311101
  -0.33968678]]
1625434    13
323710     10
1151379     0
1993553     1
1135407     3
Name: position, dtype: uin

In [14]:
save_path = "../data/train_test_split/"

def save_pkl(obj: object, path: str) -> None:
    with open(path, "wb") as f:
        pd.to_pickle(obj, f)

save_pkl(X_train, save_path + "X_train.pkl")
save_pkl(y_train, save_path + "y_train.pkl")
save_pkl(X_test, save_path + "X_test.pkl")
save_pkl(y_test, save_path + "y_test.pkl")

print("Data saved to", save_path)

Data saved to ../data/train_test_split/


In [15]:
import torch

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

torch.save(X_train_tensor, save_path + "X_train.pt")
torch.save(y_train_tensor, save_path + "y_train.pt")
torch.save(X_test_tensor, save_path + "X_test.pt")
torch.save(y_test_tensor, save_path + "y_test.pt")

print("Data saved to", save_path)

Data saved to ../data/train_test_split/


In [16]:
train_X = torch.load(save_path + "X_train.pt")
train_y = torch.load(save_path + "y_train.pt")
test_X = torch.load(save_path + "X_test.pt")
test_y = torch.load(save_path + "y_test.pt")

print(train_X[:5])
print(train_y[:5])
print(test_X[:5])
print(test_y[:5])

  train_X = torch.load(save_path + "X_train.pt")
  train_y = torch.load(save_path + "y_train.pt")
  test_X = torch.load(save_path + "X_test.pt")


tensor([[44.0000,  0.3864,  0.2998,  ...,  2.1696,  2.7545, -0.0761],
        [26.0000, -0.6976, -0.6823,  ..., -0.5210, -0.4049, -0.7114],
        [52.0000, -0.6339, -0.6493,  ..., -0.5595, -0.6063,  0.1994],
        [ 8.0000, -0.8817, -0.9247,  ..., -1.0052, -0.3735, -0.6908],
        [49.0000,  1.1035,  1.0956,  ..., -0.3706, -0.4393,  0.4880]])
tensor([17, 12,  8,  7, 13])
tensor([[46.0000, -0.2138, -0.2730,  ..., -0.9899, -0.8668, -0.8599],
        [10.0000, -1.2046, -1.1377,  ..., -0.7581,  0.9737, -0.1045],
        [33.0000, -1.7444, -1.7738,  ..., -0.7142, -0.5751, -0.1734],
        [56.0000, -1.4034, -1.4371,  ..., -0.5832, -0.7459, -0.7243],
        [32.0000, -1.1190, -1.1041,  ..., -0.2222, -0.5311, -0.3397]])
tensor([13, 10,  0,  1,  3])


  test_y = torch.load(save_path + "y_test.pt")
