1 - import data

In [4]:
import pandas as pd
import re
from collections import Counter  # (optional)

# -------- CONFIG --------
# Absolute location of the CSV file to load.
path = r"C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam\Sam_EPOC_179525_2023.03.15T15.13.57Z.md.mc.pm.fe.bp.csv"
# Minimum number of columns the parsed table is expected to have.
expected_min_cols = 170
# Encodings attempted, in this order, when decoding the raw bytes.
encodings_to_try = ["utf-8-sig", "utf-8", "latin-1"]

# Candidate separators, each mapped to a flag telling whether the key is a
# regular-expression pattern (True) or a literal string (False).
# Insertion order is kept because it decides which separator wins a tie.
candidates = {
    separator: {"is_regex": treat_as_regex}
    for separator, treat_as_regex in (
        (",", False),
        (";", False),
        ("\t", False),
        ("|", False),
        (r"\s+", True),
    )
}

# -------- STEP 1: decode a text sample for analysis --------
# Read the bytes exactly once. A missing or unreadable file now surfaces as
# the real OSError instead of being misreported as a decoding failure, and the
# file is no longer re-read from disk for every encoding attempt.
with open(path, "rb") as f:
    raw = f.read()

text = None
used_encoding = None

# Try each encoding in order and keep the first one that decodes cleanly.
for enc in encodings_to_try:
    try:
        text = raw.decode(enc, errors="strict")
    except UnicodeDecodeError:
        # Only a genuine decoding failure moves on to the next encoding.
        continue
    used_encoding = enc
    break

if text is None:
    raise RuntimeError("Could not decode file with utf-8-sig, utf-8, or latin-1.")

# Keep at most the first 200 lines as the sample inspected for header detection.
lines = text.splitlines()
sample_n = min(200, len(lines))
sample_lines = lines[:sample_n]

# -------- STEP 2: detect the header line and the best delimiter --------
def field_count(line, sep, is_regex):
    """Return how many fields `line` splits into when divided by `sep`.

    Parameters
    ----------
    line : str
        One line of text.
    sep : str
        The separator: a regular-expression pattern when `is_regex` is true,
        otherwise a literal string.
    is_regex : bool
        Whether `sep` must be applied with `re.split` instead of `str.split`.

    Returns
    -------
    int
        0 for an empty or whitespace-only line, otherwise the number of parts
        produced by the split.
    """
    if not line.strip():
        return 0
    if is_regex:
        # Surrounding whitespace is dropped first so a whitespace pattern does
        # not yield empty leading/trailing parts.
        return len(re.split(sep, line.strip()))
    # For a literal separator only the line terminator is removed. A full
    # strip() would also eat leading/trailing tabs and therefore undercount
    # tab-separated lines whose first or last fields are empty.
    return len(line.rstrip("\r\n").split(sep))

# Score every (non-blank sample line, candidate separator) pair and keep the
# one with the highest field count. max() returns the first of several equal
# maxima, so the earliest line and the first-listed separator win ties.
best = max(
    (
        (field_count(row, delim, opts["is_regex"]), row_no, delim, opts["is_regex"])
        for row_no, row in enumerate(sample_lines)
        if row.strip()
        for delim, opts in candidates.items()
    ),
    key=lambda scored: scored[0],
    default=None,
)  # (max_fields, line_idx, sep, is_regex)

if best is None:
    raise RuntimeError("Could not identify any plausible header line.")

best_count, header_idx, best_sep, best_is_regex = best

# A low field count is only reported, not treated as an error.
if not best_count >= 50:
    print(f"⚠️ Detected only {best_count} fields on line {header_idx}. Continuing anyway.")

# -------- STEP 3: read the file from the detected header line --------
# Every cell is kept as text (dtype=str) and NA detection is switched off.
# The python engine is selected for a regex separator, the C engine otherwise.
read_kwargs = dict(
    encoding=used_encoding,
    header=0,
    skiprows=header_idx,
    dtype=str,
    quotechar='"',
    doublequote=True,
    na_filter=False,
    sep=best_sep,
    engine="python" if best_is_regex else "c",
)

df = pd.read_csv(path, **read_kwargs)

# -------- STEP 4: diagnostics and hard check --------
# Summarise what was detected and the size of the parsed table.
print(
    f"Detected encoding: {used_encoding}",
    f"Detected header line index: {header_idx}",
    f"Detected separator: {'REGEX ' if best_is_regex else ''}{best_sep}",
    f"Shape: {df.shape}",
    sep="\n",
)

# Show the start of the detected header line (tabs made visible) before the
# hard check below fails, to help diagnose a wrong separator or header row.
if not df.shape[1] >= expected_min_cols:
    hdr_preview = sample_lines[header_idx][:200].replace("\t", "\\t")
    print(f"⚠️ Columns < {expected_min_cols}. Header preview: {hdr_preview}")

assert df.shape[1] >= expected_min_cols, (
    f"Only {df.shape[1]} columns parsed; expected ≥ {expected_min_cols} (A..FN)."
)

# First twelve column labels, then the first three rows.
print(df.columns[:12].tolist())
print(df.head(3))


Detected encoding: utf-8-sig
Detected header line index: 1
Detected separator: ,
Shape: (202780, 170)
['Timestamp', 'OriginalTimestamp', 'EEG.Counter', 'EEG.Interpolated', 'EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7', 'EEG.O1', 'EEG.O2']
           Timestamp  OriginalTimestamp EEG.Counter EEG.Interpolated  \
0  1678893237.188589  1678893237.189102   29.000000         0.000000   
1  1678893237.196409  1678893237.196821   30.000000         0.000000   
2  1678893237.204229  1678893237.204641   31.000000         0.000000   

       EEG.AF3       EEG.F7       EEG.F3      EEG.FC5       EEG.T7  \
0  3710.769287  4365.641113  4182.051270  4408.205078  4428.717773   
1  3707.692383  4362.051270  4177.948730  4408.205078  4428.205078   
2  3705.128174  4357.948730  4175.384766  4410.256348  4428.717773   

        EEG.P7  ... POW.F8.Theta POW.F8.Alpha POW.F8.BetaL POW.F8.BetaH  \
0  4344.615234  ...                                                       
1  4341.025879  ...       

2 - prepare dataset

In [5]:
# Work on an independent (deep) copy so the loaded table `df` stays untouched.
data = df.copy(deep=True)

In [6]:
# Keep only columns E..R (Excel letters).
# A=0,B=1,C=2,D=3,E=4,...,R=17 (0-based). iloc stop is exclusive, so use 18.
start_idx = 4           # E
stop_idx_exclusive = 18 # R + 1

# Clamp the stop index in case the file has fewer columns
stop_idx_exclusive = min(stop_idx_exclusive, df.shape[1])

# Slice from the full table `df` rather than from `data` itself: the positions
# above refer to the full table, and slicing `data` in place made this cell
# drop further columns every time it was re-run.
data = df.iloc[:, start_idx:stop_idx_exclusive].copy()

# (Optional) quick check
print(data.shape)

(202780, 14)


In [7]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4
0,3710.769287,4365.641113,4182.05127,4408.205078,4428.717773,4344.615234,4461.538574,4046.153809,4265.128418,3738.974365,3883.589844,4459.487305,4316.410156,4405.128418
1,3707.692383,4362.05127,4177.94873,4408.205078,4428.205078,4341.025879,4465.641113,4048.718018,4263.077148,3739.487061,3886.666748,4461.538574,4319.487305,4401.538574
2,3705.128174,4357.94873,4175.384766,4410.256348,4428.717773,4342.05127,4466.153809,4048.205078,4256.922852,3738.974365,3884.615479,4462.05127,4327.692383,4400.512695
3,3707.692383,4365.128418,4179.487305,4412.307617,4428.205078,4343.077148,4464.615234,4045.128174,4260.0,3738.974365,3881.025635,4460.512695,4335.384766,4401.538574
4,3710.256348,4376.922852,4182.05127,4417.94873,4426.666504,4342.563965,4463.077148,4048.205078,4264.102539,3738.974365,3881.538574,4460.0,4336.410156,4398.461426


In [8]:
# Keep the middle 38400 rows, discarding an equal share from the beginning
# and the end (when the surplus is odd, the extra row is dropped from the end).
n_keep = 38400
n_total = data.shape[0]

if n_keep > n_total:
    raise ValueError(f"Data has only {n_total} rows, cannot select {n_keep}.")

# Number of leading rows to skip, and the exclusive end of the kept window.
start = (n_total - n_keep) // 2
end = n_keep + start

# Renumber the kept rows from 0.
data = data.iloc[slice(start, end)].reset_index(drop=True)


In [9]:
# Display the (rows, columns) size of `data`.
data.shape

(38400, 14)

In [10]:
# get the column names as an array
import numpy as np
# NumPy array holding the column labels of `data`.
cols = data.columns.to_numpy()


In [11]:
# Display the array of column labels.
cols

array(['EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7',
       'EEG.O1', 'EEG.O2', 'EEG.P8', 'EEG.T8', 'EEG.FC6', 'EEG.F4',
       'EEG.F8', 'EEG.AF4'], dtype=object)

In [12]:
# save channel names as an array

import numpy as np
import os

out_dir = r"C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam"
out_path = os.path.join(out_dir, "channel_names.npy")

# Store the labels as a fixed-width unicode array. An object-dtype array is
# written by np.save via pickle and can then only be read back with
# np.load(..., allow_pickle=True); a unicode array loads with the defaults.
channel_names = np.array(data.columns.tolist(), dtype=str)
np.save(out_path, channel_names)

print(f"Saved {len(channel_names)} channel names to {out_path}")


Saved 14 channel names to C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam\channel_names.npy


In [13]:
# create 2D array instead of the dataset

import numpy as np

# The table was read with dtype=str, so every cell of `data` is text. Convert
# to float64 here; otherwise the result is an object array of strings, which
# is what would later be segmented and written to disk. A cell that is not a
# valid number makes the conversion raise instead of being saved as text.
data_2d = data.astype(np.float64).to_numpy()


In [14]:
# Report the array's (rows, columns) size, then its Python type
# (<class 'numpy.ndarray'>), one per line.
print(data_2d.shape, type(data_2d), sep="\n")

(38400, 14)
<class 'numpy.ndarray'>


In [15]:
# Swap the two axes so that the former columns become rows.
data_2d_T = data_2d.transpose()

In [16]:
# Display the size of the transposed array.
data_2d_T.shape

(14, 38400)

In [17]:
# segment the data into 128 size segments

import numpy as np

seg_len = 128
channels, total_samples = data_2d_T.shape

# Number of complete segments, and the sample count they cover
n_segments = total_samples // seg_len
usable = seg_len * n_segments

# Drop the trailing samples that do not fill a whole segment
data_2d_T_trim = data_2d_T[:, :usable]

# Cut each row into consecutive chunks:
#   (channels, total_samples) -> (channels, segments, 128)
# then exchange the first two axes:
#   (channels, segments, 128) -> (segments, channels, 128)
segments = np.swapaxes(
    data_2d_T_trim.reshape((channels, n_segments, seg_len)),
    0,
    1,
)

print("segments.shape:", segments.shape)


segments.shape: (300, 14, 128)


In [18]:
import numpy as np
import os

# Destination folder and file for the segmented array.
out_dir = r"C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam"
out_path = os.path.join(out_dir, "Sam.npy")

# Write the segments array to disk in NumPy's .npy format.
np.save(file=out_path, arr=segments)

# Confirm where the file went and the shape that was written.
print(f"Saved segments to {out_path}")
print("Saved shape:", segments.shape)


Saved segments to C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam\Sam.npy
Saved shape: (300, 14, 128)


In [19]:
import numpy as np
import os

out_dir = r"C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam"
out_path = os.path.join(out_dir, "Sam_target.npy")

# One label (value 1) per segment. The length is taken from `segments` rather
# than hard-coded, so labels and segments cannot drift apart if the number of
# segments changes.
# NOTE(review): the meaning of label 1 is not stated in this notebook - confirm.
y = np.ones(len(segments), dtype=np.int64)

np.save(out_path, y)

print(f"Saved target array to {out_path}")
print("Saved shape:", y.shape)


Saved target array to C:\Users\HP\Desktop\Jupyter Notebooks\UOW Projects\Distinguishing Amateur Players and Professional Players\Data\Sam\Sam_target.npy
Saved shape: (300,)
