1. Import Libraries

In [33]:
import pandas as pd
import csv

2. Load the EEG Data

In [34]:
import pandas as pd                 # Pandas for data handling
import re                           # Regex for whitespace-delimited detection
from collections import Counter     # (Optional) simple stats if you want to extend detection

# -------- CONFIG --------
# Absolute location of the raw EEG CSV export that this notebook loads.
path = r"C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Amin\Amin_EPOC_186248_2023.05.25T01.07.13+01.00.md.mc.pm.fe.bp.csv"
# Spreadsheet columns A..FN inclusive are 170 columns; fewer means a bad parse.
expected_min_cols = 170
# Encodings are attempted in this order when decoding the file.
encodings_to_try = ["utf-8-sig", "utf-8", "latin-1"]

# Separator table: key = separator, value = flags describing how to apply it.
# Insertion order matters because ties in the detection step go to the first
# candidate tried. The literal separators come first, then the regex r"\s+"
# which matches runs of variable-width whitespace.
candidates = {literal: {"is_regex": False} for literal in (",", ";", "\t", "|")}
candidates[r"\s+"] = {"is_regex": True}

# -------- STEP 1: decode a text sample for analysis --------
# Read the bytes exactly once. A missing or unreadable file now raises its own
# OSError (e.g. FileNotFoundError) here instead of being swallowed and later
# reported as a decoding failure.
with open(path, "rb") as f:
    raw = f.read()                                    # Entire file as bytes

text = None                                           # Will hold decoded text
used_encoding = None                                  # Remember which encoding worked

for enc in encodings_to_try:                          # Try each encoding in order
    try:
        text = raw.decode(enc, errors="strict")       # Strict: fail rather than guess
    except UnicodeDecodeError:
        continue                                      # Only decoding errors move on to the next codec
    used_encoding = enc                               # Keep the working encoding
    break                                             # Stop on first success

# Note: "latin-1" maps every byte value, so while it is in the list this
# branch is only reachable if the list is changed.
if text is None:                                      # If all decodes failed, bail out
    raise RuntimeError("Could not decode file with utf-8-sig, utf-8, or latin-1.")

lines = text.splitlines()                             # Split decoded text into lines
sample_n = min(200, len(lines))                       # Cap sample to first 200 lines
sample_lines = lines[:sample_n]                       # Take the sample slice

# -------- STEP 2: detect the header line and the best delimiter --------
def field_count(line, sep, is_regex):
    """Return the number of fields obtained by splitting ``line`` on ``sep``.

    Parameters
    ----------
    line : str
        One line of text (a trailing line terminator is tolerated).
    sep : str
        Literal separator, or a regular-expression pattern when ``is_regex``.
    is_regex : bool
        True to split with ``re.split``; False to split on the literal ``sep``.

    Returns
    -------
    int
        0 for a blank or whitespace-only line, otherwise the field count.
        Quoting is not interpreted; this is a plain split used for detection.
    """
    if not line.strip():                               # Blank/whitespace-only lines have no fields
        return 0
    if is_regex:
        # For whitespace-style patterns the surrounding whitespace is padding,
        # so strip it to avoid counting empty edge fields.
        return len(re.split(sep, line.strip()))
    # For a literal separator remove only the line terminator: a full strip()
    # would also eat leading/trailing tabs and undercount tab-separated rows
    # whose first or last fields are empty.
    return len(line.strip("\r\n").split(sep))

# Score every (non-blank sample line, candidate separator) pair by how many
# fields the split yields. Each entry is (field_count, line_idx, sep, is_regex).
scored_splits = (
    (field_count(row, delim, info["is_regex"]), row_no, delim, info["is_regex"])
    for row_no, row in enumerate(sample_lines)
    if row.strip()                                     # Blank lines are never header candidates
    for delim, info in candidates.items()
)

# max() returns the first maximal entry, so on ties the earliest line and the
# earliest candidate separator win.
best = max(scored_splits, key=lambda entry: entry[0], default=None)

if best is None:                                       # No non-blank line in the sample
    raise RuntimeError("Could not identify any plausible header line.")

best_count, header_idx, best_sep, best_is_regex = best

if best_count < 50:                                    # Heuristic only: warn, do not stop
    print(f"⚠️ Detected only {best_count} fields on line {header_idx}. Continuing anyway.")

# -------- STEP 3: read the file from the detected header line --------
# A regex separator needs the Python parsing engine; literal separators can
# use the C engine.
read_kwargs = dict(
    sep=best_sep,
    engine="python" if best_is_regex else "c",
    encoding=used_encoding,      # The encoding that decoded the file successfully
    header=0,                    # First row that is read is the header row...
    skiprows=header_idx,         # ...after skipping everything before the detected header
    dtype=str,                   # Every column stays text, exactly as written in the file
    quotechar='"',               # Standard quoting
    doublequote=True,            # "" inside a quoted field is a literal quote
    na_filter=False,             # Empty cells stay "" instead of becoming NaN
)

# The loaded frame is named after the participant whose file this is.
Amin = pd.read_csv(path, **read_kwargs)

# -------- STEP 4: diagnostics and hard check --------
separator_label = f"{'REGEX ' if best_is_regex else ''}{best_sep}"
for report_line in (
    f"Detected encoding: {used_encoding}",
    f"Detected header line index: {header_idx}",      # 0-based index into the sample
    f"Detected separator: {separator_label}",
    f"Shape: {Amin.shape}",                            # (rows, columns)
):
    print(report_line)

parsed_cols = Amin.shape[1]

# When the parse came up short, show the start of the detected header line
# (tabs made visible) before the assertion stops the cell.
if parsed_cols < expected_min_cols:
    hdr_preview = sample_lines[header_idx][:200].replace("\t", "\\t")
    print(f"⚠️ Columns < {expected_min_cols}. Header preview: {hdr_preview}")

# Hard requirement: at least A..FN (170) columns must have been parsed.
assert parsed_cols >= expected_min_cols, (
    f"Only {parsed_cols} columns parsed; expected ≥ {expected_min_cols} (A..FN)."
)

# Quick peek: first 12 column names, then the first 3 rows.
print(Amin.columns.tolist()[:12])
print(Amin.head(3))


Detected encoding: utf-8-sig
Detected header line index: 1
Detected separator: ,
Shape: (58439, 170)
['Timestamp', 'OriginalTimestamp', 'EEG.Counter', 'EEG.Interpolated', 'EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7', 'EEG.O1', 'EEG.O2']
           Timestamp  OriginalTimestamp EEG.Counter EEG.Interpolated  \
0  1684973233.901024  1684973233.901378   96.000000         0.000000   
1  1684973233.908844  1684973233.909298   97.000000         0.000000   
2  1684973233.916663  1684973233.917117   98.000000         0.000000   

       EEG.AF3       EEG.F7       EEG.F3      EEG.FC5       EEG.T7  \
0  3738.461426  4280.000000  3955.384521  4273.846191  4424.615234   
1  3732.820557  4277.436035  3947.692383  4276.922852  4423.589844   
2  3730.256348  4277.948730  3949.743652  4280.000000  4425.128418   

        EEG.P7       EEG.O1       EEG.O2       EEG.P8       EEG.T8  \
0  4413.333496  4372.820313  4087.692383  4388.205078  3737.948730   
1  4406.666504  4382.563965  4076.410

In [35]:
# Work on an independent (deep) copy so the loaded frame `Amin` stays as read.
data = Amin.copy(deep=True)

3. Data Preprocessing

3.1. Extract the information of the dataset

In [36]:
# Print the frame summary: index range, number of columns, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58439 entries, 0 to 58438
Columns: 170 entries, Timestamp to POW.AF4.Gamma
dtypes: object(170)
memory usage: 75.8+ MB


In [37]:
# Display the column labels of the working frame.
data.columns

Index(['Timestamp', 'OriginalTimestamp', 'EEG.Counter', 'EEG.Interpolated',
       'EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7',
       ...
       'POW.F8.Theta', 'POW.F8.Alpha', 'POW.F8.BetaL', 'POW.F8.BetaH',
       'POW.F8.Gamma', 'POW.AF4.Theta', 'POW.AF4.Alpha', 'POW.AF4.BetaL',
       'POW.AF4.BetaH', 'POW.AF4.Gamma'],
      dtype='object', length=170)

In [38]:
# Notebook-wide display settings (they stay in effect for later cells too).
pd.options.display.max_columns = None   # No cap: render every column
pd.options.display.width = 0            # Presumably meant to let pandas choose the width - TODO confirm

# Preview the first rows of the full-width frame.
data.head()

Unnamed: 0,Timestamp,OriginalTimestamp,EEG.Counter,EEG.Interpolated,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4,EEG.RawCq,EEG.Battery,EEG.BatteryPercent,MarkerIndex,MarkerType,MarkerValueInt,EEG.MarkerHardware,CQ.AF3,CQ.F7,CQ.F3,CQ.FC5,CQ.T7,CQ.P7,CQ.O1,CQ.O2,CQ.P8,CQ.T8,CQ.FC6,CQ.F4,CQ.F8,CQ.AF4,CQ.Overall,EQ.SampleRateQuality,EQ.OVERALL,EQ.AF3,EQ.F7,EQ.F3,EQ.FC5,EQ.T7,EQ.P7,EQ.O1,EQ.O2,EQ.P8,EQ.T8,EQ.FC6,EQ.F4,EQ.F8,EQ.AF4,MOT.CounterMems,MOT.InterpolatedMems,MOT.GyroX,MOT.GyroY,MC.Action,MC.ActionPower,MC.IsActive,PM.Engagement.IsActive,PM.Engagement.Scaled,PM.Engagement.Raw,PM.Engagement.Min,PM.Engagement.Max,PM.Excitement.IsActive,PM.Excitement.Scaled,PM.Excitement.Raw,PM.Excitement.Min,PM.Excitement.Max,PM.LongTermExcitement,PM.Stress.IsActive,PM.Stress.Scaled,PM.Stress.Raw,PM.Stress.Min,PM.Stress.Max,PM.Relaxation.IsActive,PM.Relaxation.Scaled,PM.Relaxation.Raw,PM.Relaxation.Min,PM.Relaxation.Max,PM.Interest.IsActive,PM.Interest.Scaled,PM.Interest.Raw,PM.Interest.Min,PM.Interest.Max,PM.Focus.IsActive,PM.Focus.Scaled,PM.Focus.Raw,PM.Focus.Min,PM.Focus.Max,FE.BlinkWink,FE.HorizontalEyesDirection,FE.UpperFaceAction,FE.UpperFaceActionPower,FE.LowerFaceAction,FE.LowerFaceActionPower,POW.AF3.Theta,POW.AF3.Alpha,POW.AF3.BetaL,POW.AF3.BetaH,POW.AF3.Gamma,POW.F7.Theta,POW.F7.Alpha,POW.F7.BetaL,POW.F7.BetaH,POW.F7.Gamma,POW.F3.Theta,POW.F3.Alpha,POW.F3.BetaL,POW.F3.BetaH,POW.F3.Gamma,POW.FC5.Theta,POW.FC5.Alpha,POW.FC5.BetaL,POW.FC5.BetaH,POW.FC5.Gamma,POW.T7.Theta,POW.T7.Alpha,POW.T7.BetaL,POW.T7.BetaH,POW.T7.Gamma,POW.P7.Theta,POW.P7.Alpha,POW.P7.BetaL,POW.P7.BetaH,POW.P7.Gamma,POW.O1.Theta,POW.O1.Alpha,POW.O1.BetaL,POW.O1.BetaH,POW.O1.Gamma,POW.O2.Theta,POW.O2.Alpha,POW.O2.BetaL,POW.O2.BetaH,POW.O2.Gamma,POW.P8.Theta,POW.P8.Alpha,POW.P8.BetaL,POW.P8.BetaH,POW.P8.Gamma,POW.T8.Theta,POW.T8.Alpha,POW.T8.BetaL,POW.T8.BetaH,POW.T8.Gamma,POW.FC6.Theta,POW.FC6.Alpha,POW.FC6.BetaL,POW.FC6.BetaH,POW.FC6.Gamma,POW.F4.T
heta,POW.F4.Alpha,POW.F4.BetaL,POW.F4.BetaH,POW.F4.Gamma,POW.F8.Theta,POW.F8.Alpha,POW.F8.BetaL,POW.F8.BetaH,POW.F8.Gamma,POW.AF4.Theta,POW.AF4.Alpha,POW.AF4.BetaL,POW.AF4.BetaH,POW.AF4.Gamma
0,1684973233.901024,1684973233.901378,96.0,0.0,3738.461426,4280.0,3955.384521,4273.846191,4424.615234,4413.333496,4372.820313,4087.692383,4388.205078,3737.94873,3930.769287,4492.820313,4367.179688,4301.025879,513.0,4.0,82.0,,,,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,33.333332,,,,,,,,,,,,,,,,,96.0,0.0,2194.0,1470.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,1.0,0.0,1.253787,0.311837,0.45141,0.533792,0.138227,0.97574,0.50807,0.551606,0.327693,0.170843,1.53788,0.631264,0.618583,0.626029,0.15653,1.074917,0.37998,0.680785,0.693483,0.143058,0.028307,0.027829,0.011013,0.016435,0.020652,1.972713,0.458112,0.210488,0.202173,0.153115,6.518922,2.366011,4.887473,1.740104,1.238684,5.129272,2.346351,4.312412,2.366726,1.033189,4.073516,3.045308,2.607798,0.943011,0.297024,0.010182,0.011636,0.006908,0.00454,0.004822,2.05253,1.649055,1.080276,0.640947,0.484667,1.858591,0.884606,0.928668,0.455507,0.247873,2.345176,1.169377,1.031834,0.800178,0.25618,1.737899,0.712493,0.761558,0.549859,0.188794
1,1684973233.908844,1684973233.909298,97.0,0.0,3732.820557,4277.436035,3947.692383,4276.922852,4423.589844,4406.666504,4382.563965,4076.410156,4352.820313,3736.410156,3921.025635,4475.897461,4361.538574,4291.794922,476.0,4.0,82.0,,,,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,33.333332,,,,,,,,,,,,,,,,,97.0,0.0,2192.0,1468.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1684973233.916663,1684973233.917117,98.0,0.0,3730.256348,4277.94873,3949.743652,4280.0,4425.128418,4405.128418,4380.512695,4054.358887,4305.128418,3736.410156,3904.102539,4454.871582,4355.384766,4282.563965,514.0,4.0,82.0,,,,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,33.333332,,,,,,,,,,,,,,,,,98.0,0.0,2193.0,1469.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1684973233.924483,1684973233.925037,99.0,0.0,3727.692383,4266.153809,3953.846191,4269.743652,4426.666504,4402.563965,4370.769043,4052.307617,4313.333496,3736.923096,3904.615479,4453.846191,4361.538574,4279.487305,530.0,4.0,82.0,,,,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,33.333332,,,,,,,,,,,,,,,,,99.0,0.0,2193.0,1468.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1684973233.932303,1684973233.932857,100.0,0.0,3723.589844,4261.025879,3945.641113,4260.0,4424.102539,4397.94873,4369.743652,4065.128174,4341.025879,3736.923096,3913.333252,4467.692383,4367.692383,4280.0,513.0,4.0,82.0,,,,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,33.333332,,,,,,,,,,,,,,,,,100.0,0.0,2193.0,1469.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


3.2. Keep only the columns from E-R

In [39]:
# Keep only columns E..R (Excel letters).
# A=0,B=1,C=2,D=3,E=4,...,R=17 (0-based). iloc stop is exclusive, so use 18.
start_idx = 4           # E
stop_idx_exclusive = 18 # R + 1

# Clamp the stop index in case the file has fewer columns
stop_idx_exclusive = min(stop_idx_exclusive, Amin.shape[1])

# Slice from the loaded frame `Amin` rather than from `data` itself, so that
# re-running this cell is idempotent. Slicing `data` in place would cut the
# already-reduced frame a second time and silently drop channels.
# .copy() keeps `data` independent of `Amin`, as it was before.
data = Amin.iloc[:, start_idx:stop_idx_exclusive].copy()

# (Optional) quick check
print(data.shape)

(58439, 14)


In [40]:
# Preview the first rows of the column-reduced frame.
data.head()

Unnamed: 0,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4
0,3738.461426,4280.0,3955.384521,4273.846191,4424.615234,4413.333496,4372.820313,4087.692383,4388.205078,3737.94873,3930.769287,4492.820313,4367.179688,4301.025879
1,3732.820557,4277.436035,3947.692383,4276.922852,4423.589844,4406.666504,4382.563965,4076.410156,4352.820313,3736.410156,3921.025635,4475.897461,4361.538574,4291.794922
2,3730.256348,4277.94873,3949.743652,4280.0,4425.128418,4405.128418,4380.512695,4054.358887,4305.128418,3736.410156,3904.102539,4454.871582,4355.384766,4282.563965
3,3727.692383,4266.153809,3953.846191,4269.743652,4426.666504,4402.563965,4370.769043,4052.307617,4313.333496,3736.923096,3904.615479,4453.846191,4361.538574,4279.487305
4,3723.589844,4261.025879,3945.641113,4260.0,4424.102539,4397.94873,4369.743652,4065.128174,4341.025879,3736.923096,3913.333252,4467.692383,4367.692383,4280.0


* The first row is the header row, so it will not be used in the simulation.

3.3. Extract last 53k rows

In [41]:
# Keep the final 53,000 rows and renumber them from 0.
# If the frame is shorter than that, every row is kept.
rows_to_keep = 53000
data_last_53k = data.iloc[-rows_to_keep:].reset_index(drop=True)

print("data_last_53k shape:", data_last_53k.shape)

data_last_53k shape: (53000, 14)


In [42]:
# Preview the first rows of the 53k-row tail selection.
data_last_53k.head()

Unnamed: 0,EEG.AF3,EEG.F7,EEG.F3,EEG.FC5,EEG.T7,EEG.P7,EEG.O1,EEG.O2,EEG.P8,EEG.T8,EEG.FC6,EEG.F4,EEG.F8,EEG.AF4
0,3682.05127,4269.743652,3976.410156,4320.512695,4426.153809,4305.128418,4354.358887,3944.615479,4223.077148,3736.923096,3911.281982,4465.641113,4141.025879,4236.410156
1,3682.05127,4265.641113,3973.333252,4320.512695,4426.153809,4306.666504,4362.563965,3947.692383,4221.538574,3736.923096,3911.794922,4471.794922,4140.512695,4233.846191
2,3681.538574,4262.563965,3971.281982,4315.384766,4426.666504,4306.153809,4353.333496,3944.615479,4220.0,3736.923096,3910.256348,4474.358887,4141.538574,4236.410156
3,3683.076904,4264.102539,3975.384521,4313.333496,4427.692383,4306.153809,4343.077148,3944.615479,4223.589844,3736.923096,3918.461426,4477.436035,4144.102539,4249.230957
4,3675.384521,4258.461426,3970.256348,4308.205078,4427.179688,4309.230957,4341.538574,3941.538574,4219.487305,3736.923096,3921.025635,4482.05127,4140.512695,4247.692383


3.4. Create csv file from filtered data

In [43]:
import os
import csv  # you already imported this above

# Requires `data_last_53k` (from the tail-selection step) and `path`
# (the location of the original CSV) to be defined.

# The filtered file is written next to the original CSV.
source_dir = os.path.split(path)[0]
output_path = os.path.join(source_dir, "amin_eeg_modified.csv")

csv_options = {
    "index": False,                  # Do not write the row index as a column
    "encoding": "utf-8-sig",         # UTF-8 with BOM, which keeps Excel happy
    "quoting": csv.QUOTE_MINIMAL,    # Quote only fields that need it
    # Newlines are left at the pandas default; pass "lineterminator" (pandas >= 1.5)
    # here to normalise them.
}
data_last_53k.to_csv(output_path, **csv_options)

print(f"✅ Saved: {output_path} | shape: {data_last_53k.shape}")


✅ Saved: C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Amin\amin_eeg_modified.csv | shape: (53000, 14)


4. Create 2D array

4.1. Convert to an array

In [44]:
import numpy as np

# The frame was read with dtype=str, so a plain to_numpy() would produce an
# object array of Python strings. Everything built from it (the transposed
# array, the segments, the saved .npy) would then be text, and np.save would
# have to pickle it. Convert to float64 here so the signal is numeric from
# this point on.
# astype raises ValueError if any cell is not a valid number (e.g. an empty
# string), which surfaces bad rows instead of hiding them.
arr = data_last_53k.astype(np.float64).to_numpy(copy=True)
assert arr.shape == (53000, 14)
assert arr.dtype == np.float64

In [45]:
# Show the (rows, columns) shape of the array.
arr.shape

(53000, 14)

4.2. Transpose the array

In [46]:
# Swap the two axes so that rows become columns: (53000, 14) -> (14, 53000).
# NumPy returns a view here rather than copying the data.
arr_T = arr.transpose()
print(arr_T.shape)   # (14, 53000)

(14, 53000)


5. Segment to (14,200) shape

In [47]:
import numpy as np

seg_len = 200
channels, total_samples = arr_T.shape

# Whole segments that fit, plus the samples left over at the end.
# 53000 samples / 200 -> 265 segments, 0 left over.
n_segments, leftover = divmod(total_samples, seg_len)

# Drop the leftover tail so the length is an exact multiple of seg_len
# (a no-op when it already divides evenly).
usable = total_samples - leftover
arr_T_trim = arr_T[:, :usable]

# Cut each row into consecutive windows: (channels, segments, seg_len) ...
per_channel_windows = arr_T_trim.reshape(channels, n_segments, seg_len)
# ... then move the segment axis to the front: (segments, channels, seg_len).
segments = np.transpose(per_channel_windows, (1, 0, 2))

print("segments.shape:", segments.shape)  # (265, 14, 200)


segments.shape: (265, 14, 200)


6. Add target variable

In [48]:
import numpy as np

# One integer label per segment, all set to 0.
# NOTE(review): the meaning of class 0 is not stated in this notebook - confirm.
# The length comes from `segments` instead of a hard-coded 265, so the labels
# cannot drift out of sync if the segment length or row count changes.
y = np.zeros(segments.shape[0], dtype=np.int64)      # shape (n_segments,)
# or as a column vector:
# y = np.zeros((segments.shape[0], 1), dtype=np.int64)


In [49]:
# Show the shape of the label vector.
y.shape

(265,)

In [50]:
# Peek at the first ten labels.
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

7. Save the arrays

7.1. Save the segments

In [51]:
import os
import numpy as np
import csv  # if you're also writing CSVs

# Requires `path` (location of the original CSV) and `segments`
# (the segmented array, printed above as (265, 14, 200)).

# Outputs are written to the folder that holds the original CSV.
out_dir = os.path.dirname(path)

# ---- Save segments as a NumPy array (.npy) ----
# The variable is named after the participant whose data it holds; it used to
# be called `adam_eeg_path`, which did not match the "amin_eeg.npy" file.
amin_eeg_path = os.path.join(out_dir, "amin_eeg.npy")
adam_eeg_path = amin_eeg_path  # Legacy alias so any cell still using the old name keeps working
np.save(amin_eeg_path, segments)
print(f"✅ Saved segments → {amin_eeg_path} | shape: {segments.shape}")

# The label vector is saved separately in section 7.2.


✅ Saved segments → C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Amin\amin_eeg.npy | shape: (265, 14, 200)


7.2. Save the target array

In [52]:
# Store the label vector in the same output folder as the segments file.
labels_filename = "amin_eeg_labels.npy"
y_path = os.path.join(out_dir, labels_filename)
np.save(file=y_path, arr=y)
print(f"✅ Saved labels   → {y_path} | shape: {y.shape}")

✅ Saved labels   → C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Amin\amin_eeg_labels.npy | shape: (265,)
