# üì¶ Import Core Libraries and üìÇ Setup and Check Data File   
Essential libraries for data loading, exploration, and basic manipulation. 
Set the project path and check if the raw SLAM training data file exists.  
Preview the first few lines if found.


In [7]:
# -------------------- CELL 1: Import Core Libraries and Setup and Check Data File --------------------
# At the top with other imports
from pathlib import Path
import pickle
from typing import List
import sys  # Add this import

# Go up one level from notebooks/ to the project root
project_root = Path.cwd().parent
base_path = project_root / "data" / "raw" / "data_en_es"
train_file = base_path / "en_es.slam.20190204.train"

# Check if file exists
if train_file.exists():
    print("‚úÖ Train file found.")
    print("Path:", train_file)
    with open(train_file, "r", encoding="utf-8") as file:
        print("\nSample lines:")
        for _ in range(5):
            print(file.readline().strip())
else:
    raise FileNotFoundError(f"‚ùå Training file not found at {train_file}")


‚úÖ Train file found.
Path: f:\Bachleros Research\Rsearch thesis\Predicting-Churn-using-ML-and-DL\data\raw\data_en_es\en_es.slam.20190204.train

Sample lines:
# prompt:Yo soy un ni√±o.
# user:XEinXf5+  countries:CO  days:0.003  client:web  session:lesson  format:reverse_translate  time:9
DRihrVmh0101  I             PRON    Case=Nom|Number=Sing|Person=1|PronType=Prs|fPOS=PRON++PRP               nsubj        4  0
DRihrVmh0102  am            VERB    Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|fPOS=VERB++VBP    cop          4  0
DRihrVmh0103  a             DET     Definite=Ind|PronType=Art|fPOS=DET++DT                                  det          4  0


# üîÑ Parse SLAM Sessions and Save to Pickle  
This section parses the raw SLAM training file into structured sessions. Each session is a block of lines separated by an empty line.

**Steps**:
- Parse each session as a list of lines.
- Append all sessions to a main list (`slam_sessions`).
- Preview one example session.
- Save the parsed data into a `.pkl` file for reuse in later notebooks.


In [8]:
# -------------------- CELL 3: Parse SLAM Sessions -------------------
def parse_slam_sessions(filepath):
    sessions = []
    current_session = []
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line == "":
                if current_session:
                    sessions.append(current_session)
                    current_session = []
            else:
                current_session.append(line)
        if current_session:
            sessions.append(current_session)
    return sessions

# Parse and preview
slam_sessions = parse_slam_sessions(train_file)
print(f"‚úÖ Parsed sessions: {len(slam_sessions)}")

if slam_sessions:
    print("\n Sample Session:")
    for line in slam_sessions[0]:
        print(line)
else:
    print("‚ö†Ô∏è No sessions found.")

# Save parsed sessions to disk
parsed_sessions_path = project_root / "data" / "interim" / "slam_sessions.pkl"
parsed_sessions_path.parent.mkdir(parents=True, exist_ok=True)

with open(parsed_sessions_path, "wb") as f:
    pickle.dump(slam_sessions, f)

print(f"‚úÖ slam_sessions saved to: {parsed_sessions_path}")

# Validate Sessions Structure 

def validate_sessions(sessions: List[List[str]]) -> bool:
    """Validate basic session structure."""
    if not sessions:
        return False
    
    for session in sessions:
        if not session or not isinstance(session, list):
            return False
            
    return True

# Validate after parsing
if not validate_sessions(slam_sessions):
    raise ValueError("‚ùå Invalid session structure detected")
print("‚úÖ Session validation passed")

# Session Stats
def print_session_stats():
    """Print basic statistics about the parsed sessions."""
    total_sessions = len(slam_sessions)
    avg_lines = sum(len(s) for s in slam_sessions) / total_sessions
    
    print("\nüìä Session Statistics:")
    print(f"Total Sessions: {total_sessions:,}")
    print(f"Average Lines per Session: {avg_lines:.2f}")
    print(f"Memory Usage: {sys.getsizeof(slam_sessions) / (1024*1024):.2f} MB")

# Call the function
print_session_stats()


‚úÖ Parsed sessions: 824012

 Sample Session:
# prompt:Yo soy un ni√±o.
# user:XEinXf5+  countries:CO  days:0.003  client:web  session:lesson  format:reverse_translate  time:9
DRihrVmh0101  I             PRON    Case=Nom|Number=Sing|Person=1|PronType=Prs|fPOS=PRON++PRP               nsubj        4  0
DRihrVmh0102  am            VERB    Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|fPOS=VERB++VBP    cop          4  0
DRihrVmh0103  a             DET     Definite=Ind|PronType=Art|fPOS=DET++DT                                  det          4  0
DRihrVmh0104  boy           NOUN    Number=Sing|fPOS=NOUN++NN                                               ROOT         0  0
‚úÖ slam_sessions saved to: f:\Bachleros Research\Rsearch thesis\Predicting-Churn-using-ML-and-DL\data\interim\slam_sessions.pkl
‚úÖ Session validation passed

üìä Session Statistics:
Total Sessions: 824,012
Average Lines per Session: 4.91
Memory Usage: 6.37 MB


# üìã Notebook Summary

This notebook accomplishes:
1. **Data Loading**: Loads raw SLAM session data
2. **Data Parsing**: Converts raw text into structured sessions
3. **Validation**: Ensures data integrity and structure
4. **Statistics**: Provides session count and memory usage
5. **Storage**: Saves processed data for next steps

Next notebook: `02_preprocessing_feature_eng.ipynb`