# Results

In [None]:
import csv
import re

def parse_model_name(model_name):
    """
    Split a hyphen-delimited model identifier into its five components.

    For example, 'b25-CBOW-64-5-10' yields ('b25', 'CBOW', '64', '5', '10'),
    i.e. (name, algo, vector_size, epochs, window). Every component is
    returned as a string, and whitespace surrounding the identifier is
    ignored. Segments beyond the fifth are discarded; an identifier with
    fewer than five segments raises IndexError.
    """
    fields = model_name.strip().split('-')
    # Index each position explicitly (instead of slicing) so that a short
    # identifier fails loudly with IndexError rather than yielding a
    # truncated tuple.
    return tuple(fields[position] for position in range(5))

def txt_to_csv(input_txt_path, output_csv_path):
    """
    Convert a plain-text evaluation log into a CSV file.

    The log at input_txt_path is scanned for blocks that open with a
    "Testing <model_name>" line and contain "Precision:", "Recall:" and
    "F1:" lines. One row per block is written to output_csv_path with the
    columns: model_name, algo, vector_size, epochs, window, precision,
    recall, f1. A block that lacks any of the three metrics is skipped, and
    lines matching none of the patterns are ignored.
    """
    # "Testing b25-CBOW-64-5-10" opens a new block.
    header_re = re.compile(r"^Testing\s+(.+)$")

    # Metric lines such as "Precision: 0.0417", tried in this order.
    metric_res = {
        "precision": re.compile(r"^\s*Precision:\s*([0-9]*\.?[0-9]+)\s*$"),
        "recall": re.compile(r"^\s*Recall:\s*([0-9]*\.?[0-9]+)\s*$"),
        "f1": re.compile(r"^\s*F1:\s*([0-9]*\.?[0-9]+)\s*$"),
    }

    records = []

    def flush(model, scores):
        """Append a row for the block if it has a name and every metric."""
        if model is None:
            return
        if any(value is None for value in scores.values()):
            return
        # The leading name component is not written; the full model name is.
        _, algo, vector_size, epochs, window = parse_model_name(model)
        records.append([
            model, algo, vector_size, epochs, window,
            scores["precision"], scores["recall"], scores["f1"],
        ])

    model = None
    scores = dict.fromkeys(metric_res)

    with open(input_txt_path, "r", encoding="utf-8") as log:
        for raw in log:
            text = raw.strip()
            if not text:
                continue

            header = header_re.match(text)
            if header:
                # A new block begins: emit the one collected so far, then reset.
                flush(model, scores)
                model = header.group(1)
                scores = dict.fromkeys(metric_res)
                continue

            for key, pattern in metric_res.items():
                found = pattern.match(text)
                if found:
                    scores[key] = found.group(1)
                    break

    # The final block has no following "Testing" line to trigger its flush.
    flush(model, scores)

    with open(output_csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["model_name", "algo", "vector_size", "epochs", "window",
                         "precision", "recall", "f1"])
        writer.writerows(records)

# Example usage:
# Convert the raw evaluation log into output.csv in the current working directory.
# NOTE(review): the loading cell further down reads
# "../data/csv/model_results/output.csv", which is not the path written here;
# confirm whether the file is moved by hand or one of the two paths should change.
if __name__ == "__main__":
    txt_to_csv("../data/unstructured/txt_results.txt", "output.csv")
    print("Done! Created output.csv.")


## Loading the data into df

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
import plotly.express as px

# Load the per-model metrics CSV into a DataFrame used by the plots below.
df = pd.read_csv("../data/csv/model_results/output.csv")

In [2]:
# Interactive scatter of F1 against window size; hovering over a point
# also shows the model_name it belongs to.
fig_window = px.scatter(
    df,
    x="window",
    y="f1",
    hover_data=["model_name"],
    title="F1 vs Window Size",
)
fig_window.show()

# Same view with vector size on the x-axis.
fig_vector = px.scatter(
    df,
    x="vector_size",
    y="f1",
    hover_data=["model_name"],
    title="F1 vs Vector Size",
)
fig_vector.show()