1. Import Libraries

In [44]:
import pandas as pd
import csv

2. Load the EEG Data

In [45]:
import pandas as pd
import re
from collections import Counter  # (optional)

# -------- CONFIG --------
# Raw EEG export to load (absolute, machine-specific location).
path = r"C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Mina 3\Mina 3_EPOC_186247_2023.05.25T00.53.36+01.00.md.mc.pm.fe.bp.csv"
# Minimum number of columns the parsed frame must have (checked after loading).
expected_min_cols = 170
# Encodings attempted, in this order, when decoding the file.
encodings_to_try = ["utf-8-sig", "utf-8", "latin-1"]

# Separators to try. "is_regex" marks entries that are regular-expression
# patterns rather than literal strings.
candidates = {literal_sep: {"is_regex": False} for literal_sep in (",", ";", "\t", "|")}
candidates[r"\s+"] = {"is_regex": True}

# -------- STEP 1: decode a text sample for analysis --------
# Read the bytes exactly once. I/O failures (missing file, permissions, ...)
# propagate as OSError instead of being reported as a decoding problem.
with open(path, "rb") as f:
    raw = f.read()

text = None
used_encoding = None

# Keep the first encoding that decodes the whole file without errors.
for enc in encodings_to_try:
    try:
        text = raw.decode(enc, errors="strict")
    except UnicodeDecodeError:
        continue
    used_encoding = enc
    break

if text is None:
    raise RuntimeError("Could not decode file with utf-8-sig, utf-8, or latin-1.")

# Split on \r\n, \r and \n only. str.splitlines() also breaks on characters such
# as \x0b, \x0c, \x85 and \u2028, which would shift the line indices used later
# as `skiprows` relative to how the CSV reader counts rows.
lines = re.split(r"\r\n|\r|\n", text)
sample_n = min(200, len(lines))
sample_lines = lines[:sample_n]

# -------- STEP 2: detect the header line and the best delimiter --------
def field_count(line, sep, is_regex):
    """Return how many fields `line` has when split on `sep`.

    Parameters
    ----------
    line : str
        One line of text; a trailing line terminator is ignored.
    sep : str
        Literal separator, or a regular-expression pattern when `is_regex` is true.
    is_regex : bool
        Whether `sep` must be applied with `re.split` instead of `str.split`.

    Returns
    -------
    int
        Number of fields, or 0 for an empty / whitespace-only line.
    """
    if not line.strip():
        return 0
    if is_regex:
        # Surrounding whitespace would create spurious empty fields for
        # whitespace-style patterns, so it is trimmed first.
        return len(re.split(sep, line.strip()))
    # For literal separators drop only the line terminator: a full strip() would
    # eat leading/trailing tabs and undercount empty edge fields in TSV data.
    return len(line.rstrip("\r\n").split(sep))

# Score every (non-blank line, separator) pair in the sample.
scored = (
    (field_count(text_line, delim, opts["is_regex"]), line_no, delim, opts["is_regex"])
    for line_no, text_line in enumerate(sample_lines)
    if text_line.strip()
    for delim, opts in candidates.items()
)

# max() keeps the first of several equal maxima, so ties resolve to the earliest
# line and, within a line, to the earliest candidate separator.
best = max(scored, key=lambda entry: entry[0], default=None)  # (max_fields, line_idx, sep, is_regex)

if best is None:
    raise RuntimeError("Could not identify any plausible header line.")

best_count, header_idx, best_sep, best_is_regex = best

# A low field count is only reported; loading still goes ahead.
if best_count < 50:
    print(f"⚠️ Detected only {best_count} fields on line {header_idx}. Continuing anyway.")

# -------- STEP 3: read the file from the detected header line --------
# Regex separators are read with the "python" engine; literal ones use the "c" engine.
read_kwargs = dict(
    encoding=used_encoding,
    header=0,                 # the first row after skiprows is the header
    skiprows=header_idx,      # skip everything before the detected header
    dtype=str,                # every column is read as text
    quotechar='"',
    doublequote=True,
    na_filter=False,          # empty cells stay as empty strings
    # on_bad_lines="skip",    # uncomment if you hit malformed rows
    sep=best_sep,
    engine="python" if best_is_regex else "c",
)

df = pd.read_csv(path, **read_kwargs)

# -------- STEP 4: diagnostics and hard check --------
sep_label = ("REGEX " if best_is_regex else "") + best_sep
for report_line in (
    f"Detected encoding: {used_encoding}",
    f"Detected header line index: {header_idx}",
    f"Detected separator: {sep_label}",
    f"Shape: {df.shape}",
):
    print(report_line)

n_cols = df.shape[1]

# Show the start of the detected header (tabs made visible) before the hard check fails.
if n_cols < expected_min_cols:
    hdr_preview = sample_lines[header_idx][:200].replace("\t", "\\t")
    print(f"⚠️ Columns < {expected_min_cols}. Header preview: {hdr_preview}")

assert n_cols >= expected_min_cols, (
    f"Only {n_cols} columns parsed; expected ≥ {expected_min_cols} (A..FN)."
)

# Quick look at the first column names and rows.
print(df.columns.tolist()[:12])
print(df.head(3))


Detected encoding: utf-8-sig
Detected header line index: 1
Detected separator: ,
Shape: (54257, 170)
['Timestamp', 'OriginalTimestamp', 'EEG.Counter', 'EEG.Interpolated', 'EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7', 'EEG.O1', 'EEG.O2']
           Timestamp  OriginalTimestamp EEG.Counter EEG.Interpolated  \
0  1684972416.505165  1684972416.507844  110.000000         0.000000   
1  1684972416.512985  1684972416.515764  111.000000         0.000000   
2  1684972416.520805  1684972416.523584  112.000000         0.000000   

       EEG.AF3       EEG.F7       EEG.F3      EEG.FC5       EEG.T7  \
0  3617.948730  4062.051270  3960.512939  4249.230957  4428.205078   
1  3619.487061  4063.589844  3955.384521  4246.666504  4429.230957   
2  3615.897461  4060.512939  3957.435791  4246.153809  4427.692383   

        EEG.P7       EEG.O1       EEG.O2       EEG.P8       EEG.T8  \
0  4336.410156  4371.794922  3978.974365  4367.179688  3738.974365   
1  4338.974121  4366.153809  3964.102

In [46]:
# Work on a copy so the freshly loaded `df` is not modified through `data`.
data = df.copy()

3. Data Preprocessing

3.1. Extract the information of the dataset

In [47]:
# Summarise the frame: index range, column count, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54257 entries, 0 to 54256
Columns: 170 entries, Timestamp to POW.AF4.Gamma
dtypes: object(170)
memory usage: 70.4+ MB


In [48]:
# Show the column labels of the loaded frame.
data.columns

Index(['Timestamp', 'OriginalTimestamp', 'EEG.Counter', 'EEG.Interpolated',
       'EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7',
       ...
       'POW.F8.Theta', 'POW.F8.Alpha', 'POW.F8.BetaL', 'POW.F8.BetaH',
       'POW.F8.Gamma', 'POW.AF4.Theta', 'POW.AF4.Alpha', 'POW.AF4.BetaL',
       'POW.AF4.BetaH', 'POW.AF4.Gamma'],
      dtype='object', length=170)

In [49]:
# Display settings for the preview below; set_option changes them for the whole session.
display_options = {
    'display.max_columns': None,  # show every column (no truncation)
    'display.width': 0,           # NOTE(review): meant to let pandas auto-size the table — confirm 0 has that effect
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
data.head()

Unnamed: 0,Timestamp,OriginalTimestamp,EEG.Counter,EEG.Interpolated,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4,EEG.RawCq,EEG.Battery,EEG.BatteryPercent,MarkerIndex,MarkerType,MarkerValueInt,EEG.MarkerHardware,CQ.AF3,CQ.F7,CQ.F3,CQ.FC5,CQ.T7,CQ.P7,CQ.O1,CQ.O2,CQ.P8,CQ.T8,CQ.FC6,CQ.F4,CQ.F8,CQ.AF4,CQ.Overall,EQ.SampleRateQuality,EQ.OVERALL,EQ.AF3,EQ.F7,EQ.F3,EQ.FC5,EQ.T7,EQ.P7,EQ.O1,EQ.O2,EQ.P8,EQ.T8,EQ.FC6,EQ.F4,EQ.F8,EQ.AF4,MOT.CounterMems,MOT.InterpolatedMems,MOT.GyroX,MOT.GyroY,MC.Action,MC.ActionPower,MC.IsActive,PM.Engagement.IsActive,PM.Engagement.Scaled,PM.Engagement.Raw,PM.Engagement.Min,PM.Engagement.Max,PM.Excitement.IsActive,PM.Excitement.Scaled,PM.Excitement.Raw,PM.Excitement.Min,PM.Excitement.Max,PM.LongTermExcitement,PM.Stress.IsActive,PM.Stress.Scaled,PM.Stress.Raw,PM.Stress.Min,PM.Stress.Max,PM.Relaxation.IsActive,PM.Relaxation.Scaled,PM.Relaxation.Raw,PM.Relaxation.Min,PM.Relaxation.Max,PM.Interest.IsActive,PM.Interest.Scaled,PM.Interest.Raw,PM.Interest.Min,PM.Interest.Max,PM.Focus.IsActive,PM.Focus.Scaled,PM.Focus.Raw,PM.Focus.Min,PM.Focus.Max,FE.BlinkWink,FE.HorizontalEyesDirection,FE.UpperFaceAction,FE.UpperFaceActionPower,FE.LowerFaceAction,FE.LowerFaceActionPower,POW.AF3.Theta,POW.AF3.Alpha,POW.AF3.BetaL,POW.AF3.BetaH,POW.AF3.Gamma,POW.F7.Theta,POW.F7.Alpha,POW.F7.BetaL,POW.F7.BetaH,POW.F7.Gamma,POW.F3.Theta,POW.F3.Alpha,POW.F3.BetaL,POW.F3.BetaH,POW.F3.Gamma,POW.FC5.Theta,POW.FC5.Alpha,POW.FC5.BetaL,POW.FC5.BetaH,POW.FC5.Gamma,POW.T7.Theta,POW.T7.Alpha,POW.T7.BetaL,POW.T7.BetaH,POW.T7.Gamma,POW.P7.Theta,POW.P7.Alpha,POW.P7.BetaL,POW.P7.BetaH,POW.P7.Gamma,POW.O1.Theta,POW.O1.Alpha,POW.O1.BetaL,POW.O1.BetaH,POW.O1.Gamma,POW.O2.Theta,POW.O2.Alpha,POW.O2.BetaL,POW.O2.BetaH,POW.O2.Gamma,POW.P8.Theta,POW.P8.Alpha,POW.P8.BetaL,POW.P8.BetaH,POW.P8.Gamma,POW.T8.Theta,POW.T8.Alpha,POW.T8.BetaL,POW.T8.BetaH,POW.T8.Gamma,POW.FC6.Theta,POW.FC6.Alpha,POW.FC6.BetaL,POW.FC6.BetaH,POW.FC6.Gamma,POW.F4.T
heta,POW.F4.Alpha,POW.F4.BetaL,POW.F4.BetaH,POW.F4.Gamma,POW.F8.Theta,POW.F8.Alpha,POW.F8.BetaL,POW.F8.BetaH,POW.F8.Gamma,POW.AF4.Theta,POW.AF4.Alpha,POW.AF4.BetaL,POW.AF4.BetaH,POW.AF4.Gamma
0,1684972416.505165,1684972416.507844,110.0,0.0,3617.94873,4062.05127,3960.512939,4249.230957,4428.205078,4336.410156,4371.794922,3978.974365,4367.179688,3738.974365,3912.307617,4455.384766,4152.820313,4198.461426,506.0,4.0,82.0,,,,0.0,4.0,2.0,4.0,2.0,0.0,4.0,2.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,16.666666,,,,,,,,,,,,,,,,,110.0,0.0,2199.0,1475.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1684972416.512985,1684972416.515764,111.0,0.0,3619.487061,4063.589844,3955.384521,4246.666504,4429.230957,4338.974121,4366.153809,3964.102539,4371.282227,3738.461426,3912.820557,4455.384766,4152.307617,4197.94873,506.0,4.0,82.0,,,,0.0,4.0,2.0,4.0,2.0,0.0,4.0,2.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,16.666666,,,,,,,,,,,,,,,,,111.0,0.0,2201.0,1475.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1684972416.520805,1684972416.523584,112.0,0.0,3615.897461,4060.512939,3957.435791,4246.153809,4427.692383,4345.128418,4372.307617,3963.589844,4369.743652,3738.974365,3910.256348,4451.794922,4147.179688,4187.692383,475.0,4.0,82.0,,,,0.0,4.0,2.0,4.0,2.0,0.0,4.0,2.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,16.666666,,,,,,,,,,,,,,,,,112.0,0.0,2200.0,1474.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1684972416.528625,1684972416.531503,113.0,0.0,3617.94873,4060.512939,3961.025635,4247.692383,4427.692383,4348.717773,4383.077148,3979.487061,4364.615234,3738.974365,3917.94873,4453.846191,4149.230957,4189.230957,466.0,4.0,82.0,,,,0.0,4.0,2.0,4.0,2.0,0.0,4.0,2.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,16.666666,,,,,,,,,,,,,,,,,113.0,0.0,2201.0,1475.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1684972416.536446,1684972416.539323,114.0,0.0,3621.538574,4062.564209,3957.94873,4246.153809,4429.230957,4344.615234,4374.358887,3980.0,4363.589844,3738.461426,3924.615479,4461.025879,4154.871582,4201.025879,506.0,4.0,82.0,,,,0.0,4.0,2.0,4.0,2.0,0.0,4.0,2.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,16.666666,,,,,,,,,,,,,,,,,114.0,0.0,2200.0,1474.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


3.2. Keep only the columns from E-R

In [50]:
# Keep only columns E..R (Excel letters).
# A=0,B=1,C=2,D=3,E=4,...,R=17 (0-based). iloc stop is exclusive, so use 18.
start_idx = 4           # E
stop_idx_exclusive = 18 # R + 1

# Fail loudly if the frame is too narrow; silently clamping the stop index would
# yield fewer columns than requested and only surface as a shape error later.
assert df.shape[1] >= stop_idx_exclusive, (
    f"Need at least {stop_idx_exclusive} columns to select E..R; got {df.shape[1]}."
)

# Slice from the untouched `df` rather than from `data`, so re-running this cell
# always gives the same columns instead of slicing an already-sliced frame.
data = df.iloc[:, start_idx:stop_idx_exclusive].copy()

# (Optional) quick check
print(data.shape)

(54257, 14)


In [51]:
# Preview the first five rows of `data`.
data.head()

Unnamed: 0,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4
0,3617.94873,4062.05127,3960.512939,4249.230957,4428.205078,4336.410156,4371.794922,3978.974365,4367.179688,3738.974365,3912.307617,4455.384766,4152.820313,4198.461426
1,3619.487061,4063.589844,3955.384521,4246.666504,4429.230957,4338.974121,4366.153809,3964.102539,4371.282227,3738.461426,3912.820557,4455.384766,4152.307617,4197.94873
2,3615.897461,4060.512939,3957.435791,4246.153809,4427.692383,4345.128418,4372.307617,3963.589844,4369.743652,3738.974365,3910.256348,4451.794922,4147.179688,4187.692383
3,3617.94873,4060.512939,3961.025635,4247.692383,4427.692383,4348.717773,4383.077148,3979.487061,4364.615234,3738.974365,3917.94873,4453.846191,4149.230957,4189.230957
4,3621.538574,4062.564209,3957.94873,4246.153809,4429.230957,4344.615234,4374.358887,3980.0,4363.589844,3738.461426,3924.615479,4461.025879,4154.871582,4201.025879


* The first row is the header row, so it will not be used in the simulation.

3.3. Extract last 53k rows

In [52]:
# Keep the final 53,000 rows and renumber them from 0.
data_last_53k = data.iloc[-53000:].reset_index(drop=True)

print(f"data_last_53k shape: {data_last_53k.shape}")

data_last_53k shape: (53000, 14)


In [53]:
# Preview the first five rows of the 53,000-row tail.
data_last_53k.head()

Unnamed: 0,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4
0,3641.025635,4327.692383,3980.0,4260.512695,4427.179688,4348.717773,4377.436035,3983.589844,4390.769043,3747.179443,3940.0,4461.025879,4164.615234,4194.871582
1,3630.769287,4334.358887,3973.846191,4257.436035,4426.666504,4346.153809,4382.05127,3985.641113,4378.461426,3739.487061,3926.153809,4456.410156,4154.358887,4187.692383
2,3626.666748,4340.512695,3969.743652,4256.410156,4427.692383,4351.282227,4381.538574,3988.718018,4372.820313,3735.384521,3923.589844,4456.410156,4151.794922,4181.538574
3,3623.076904,4334.871582,3964.102539,4252.307617,4428.205078,4353.333496,4377.436035,3979.487061,4377.436035,3738.461426,3921.025635,4449.230957,4147.179688,4174.871582
4,3622.05127,4331.794922,3962.05127,4249.230957,4427.179688,4355.897461,4381.538574,3976.410156,4375.897461,3737.435791,3917.435791,4445.128418,4141.025879,4175.384766


3.4. Create csv file from filtered data

In [54]:
import os
import csv  # already imported in the first cell

# Needs `data_last_53k` from the previous step and `path` pointing at the
# original CSV; the filtered file is written into the same folder.
output_dir = os.path.dirname(path)
output_path = os.path.join(output_dir, "mina-3_eeg_modified.csv")

csv_options = {
    "index": False,
    "encoding": "utf-8-sig",        # writes a UTF-8 BOM, which keeps Excel happy
    "quoting": csv.QUOTE_MINIMAL,
    # "lineterminator": "\n",       # optional: normalize newlines (named `line_terminator` before pandas 1.5)
}
data_last_53k.to_csv(output_path, **csv_options)

print(f"✅ Saved: {output_path} | shape: {data_last_53k.shape}")


✅ Saved: C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Mina 3\mina-3_eeg_modified.csv | shape: (53000, 14)


4. Create 2D array

4.1. Convert to an array

In [55]:
import numpy as np

# The CSV was read with dtype=str, so a plain to_numpy() would give an object
# array of strings; np.save would then have to pickle it and the result could not
# be used as numeric input. Convert to float64 here. A non-numeric or empty cell
# raises ValueError instead of slipping through as text.
arr = data_last_53k.astype(np.float64).to_numpy(copy=True)
assert arr.shape == (53000, 14)
assert arr.dtype == np.float64

In [56]:
# Confirm the array dimensions as (rows, columns).
arr.shape

(53000, 14)

4.2. Transpose the array

In [57]:
# Transpose (returns a view when possible)
arr_T = arr.T
print(arr_T.shape)   # (14, 53000)

(14, 53000)


5. Segment to (14, 200) shape

In [58]:
import numpy as np

seg_len = 200
channels, total_samples = arr_T.shape
n_segments, _leftover = divmod(total_samples, seg_len)  # 53000 / 200 -> 265 segments, 0 left over

# Drop any trailing samples that do not fill a whole segment (none for 53000).
usable = total_samples - _leftover
arr_T_trim = arr_T[:, :usable]

# Cut each channel into consecutive windows -> (channels, segments, seg_len),
# then swap the first two axes so segments come first -> (segments, channels, seg_len).
per_channel_windows = arr_T_trim.reshape(channels, n_segments, seg_len)
segments = np.swapaxes(per_channel_windows, 0, 1)

print(f"segments.shape: {segments.shape}")  # (265, 14, 200)


segments.shape: (265, 14, 200)


6. Add target variable

In [59]:
import numpy as np

# One label per segment, all set to 0. The length is taken from `segments`, so
# the labels cannot fall out of step with the data if the segment count changes.
y = np.zeros(segments.shape[0], dtype=np.int64)      # shape (n_segments,)
# or as a column vector:
# y = np.zeros((segments.shape[0], 1), dtype=np.int64)


In [60]:
# Confirm the label vector's dimensions.
y.shape

(265,)

In [61]:
# Peek at the first ten labels.
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

7. Save the arrays

7.1. Save the segments

In [62]:
import os
import numpy as np
import csv  # not used in this cell; only needed if CSVs are written as well

# Needs `path` (the original CSV) and `segments`; output goes into the folder
# that contains the original CSV.
out_dir, _source_name = os.path.split(path)

# ---- Save segments as a NumPy array (.npy) ----
# NOTE(review): the variable says "adam" while the file is named "mina-3";
# this looks like a copy-paste leftover. Consider renaming.
segments_filename = "mina-3_eeg.npy"
adam_eeg_path = os.path.join(out_dir, segments_filename)
np.save(adam_eeg_path, segments)
print(f"✅ Saved segments → {adam_eeg_path} | shape: {segments.shape}")



✅ Saved segments → C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Mina 3\mina-3_eeg.npy | shape: (265, 14, 200)


7.2. Save the target array

In [63]:
# Alternative labelling kept for reference (all ones instead of the current `y`):
# y = np.ones(segments.shape[0], dtype=np.int64)

# `out_dir` comes from the segment-saving cell, so the labels are written to that same folder.
labels_filename = "mina-3_eeg_labels.npy"
y_path = os.path.join(out_dir, labels_filename)
np.save(y_path, y)
print(f"✅ Saved labels   → {y_path} | shape: {y.shape}")

✅ Saved labels   → C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Mina 3\mina-3_eeg_labels.npy | shape: (265,)
