In [None]:
import pandas as pd
import re

# Read the raw dataset from disk
csv_file = "train (1).csv"  # Update with your file name
df = pd.read_csv(csv_file, encoding="utf-8")

# Abort early unless both input columns used by the rest of the script exist
if not ("evaluation" in df.columns and "essay" in df.columns):
    raise ValueError("The required columns ('evaluation', 'essay') are missing from the dataset!")

# Strip the characters *, #, [ and ] from a piece of text
def clean_text(text):
    """Return ``text`` without ``*``, ``#``, ``[``, ``]`` and without outer whitespace.

    Every occurrence of the four characters is deleted, then leading and
    trailing whitespace is trimmed (interior whitespace is left as is).
    Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text

    # A translation table whose third argument lists the characters to delete.
    deletion_table = str.maketrans("", "", "*#[]")
    return text.translate(deletion_table).strip()

# Pull the final reported score for every band category out of an evaluation
def extract_band_scores(text):
    """Return the last score found for each band category in ``text``.

    For each category, every case-insensitive occurrence of one of its
    heading aliases is matched together with the first number that follows
    it (any run of non-digit characters may sit in between).  The number
    from the last such match is stored as a float.  Categories without a
    match stay ``None``; non-string or blank input yields all ``None``.
    """
    # (category, regex) pairs: heading alias, non-digits, then the captured number.
    category_patterns = (
        ("Task Achievement", r"(?:Task Achievement|Task Response)[^\d]*(\d+(?:\.\d+)?)"),
        ("Coherence", r"(?:Coherence and Cohesion|Coherence)[^\d]*(\d+(?:\.\d+)?)"),
        ("Lexical Resource", r"(?:Lexical Resource|Vocabulary)[^\d]*(\d+(?:\.\d+)?)"),
        ("Grammar", r"(?:Grammatical Range and Accuracy|Grammar)[^\d]*(\d+(?:\.\d+)?)"),
        (
            "Overall Band Score",
            r"(?:Overall Band Score|Final Band Score|Band Score \(Overall\))[^\d]*(\d+(?:\.\d+)?)",
        ),
    )
    found_scores = {category: None for category, _ in category_patterns}

    # Nothing to parse for missing, non-text, or whitespace-only evaluations.
    if not (isinstance(text, str) and text.strip()):
        return found_scores

    for category, regex in category_patterns:
        hits = re.findall(regex, text, flags=re.IGNORECASE)
        if not hits:
            continue
        try:
            # Only the final occurrence counts.
            found_scores[category] = float(hits[-1])
        except ValueError:
            # Leave the category as None if the captured text cannot be converted.
            continue

    return found_scores

# Sentence boundaries for calculate_text_stats: "!" or "?", or a "." that is
# not a decimal point.  A period with a digit on BOTH sides (as in "3.5") is
# part of a number, so it must not end a sentence.
_SENTENCE_BREAK_RE = re.compile(r"(?<!\d)\.|\.(?!\d)|[!?]")


# Function to calculate text statistics **without nltk**
def calculate_text_stats(essay):
    """Calculate word count, sentence count, and average sentence length.

    Args:
        essay: The essay text. Non-string or blank values are treated as
            an empty essay.

    Returns:
        A dict with integer values under the keys ``"word_count"``
        (whitespace-separated tokens), ``"sentence_count"`` (non-blank
        fragments between sentence terminators) and
        ``"avg_sentence_length"`` (words per sentence, rounded).
        All three are 0 for an empty essay.
    """
    if not isinstance(essay, str) or not essay.strip():
        return {"word_count": 0, "sentence_count": 0, "avg_sentence_length": 0}

    # Word count using .split() (splits on any run of whitespace)
    word_count = len(essay.split())

    # Split on sentence terminators; decimal numbers such as "3.5" stay whole.
    # Blank fragments (e.g. between the dots of "...") are not counted.
    fragments = _SENTENCE_BREAK_RE.split(essay)
    sentence_count = sum(1 for fragment in fragments if fragment.strip())

    # max(..., 1) avoids division by zero when the text holds only punctuation.
    # Note: round() rounds exact halves to the nearest even integer (4.5 -> 4).
    avg_sentence_length = round(word_count / max(sentence_count, 1))

    return {
        "word_count": word_count,
        "sentence_count": sentence_count,
        "avg_sentence_length": avg_sentence_length,
    }

# Pipeline: clean evaluations -> extract scores -> compute essay stats ->
# merge -> drop incomplete rows -> write CSV. The steps depend on each other
# and must run in this order.

# Remove the characters *, #, [ and ] from each evaluation so they do not sit
# between a category heading and its number when the scores are extracted.
df["cleaned_evaluation"] = df["evaluation"].apply(clean_text)  # Stored as a new column; "evaluation" is kept as is

# extract_band_scores returns one dict per row; .apply(pd.Series) expands
# those dicts into one column per score category.
score_df = df["cleaned_evaluation"].apply(extract_band_scores).apply(pd.Series)

# Same expansion for the per-essay statistics
# (word_count, sentence_count, avg_sentence_length).
text_stats_df = df["essay"].apply(calculate_text_stats).apply(pd.Series)

# Append the score and statistics columns to the original columns (axis=1 = column-wise)
df = pd.concat([df, score_df, text_stats_df], axis=1)

# Keep only rows in which every one of the five scores was found
score_columns = ["Task Achievement", "Coherence", "Lexical Resource", "Grammar", "Overall Band Score"]
df_filtered = df.dropna(subset=score_columns, how="any")  # A single missing score removes the row

# Write the filtered rows to disk; index=False leaves the DataFrame index out of the file
output_csv = "processed_train_data.csv"
df_filtered.to_csv(output_csv, index=False, encoding="utf-8")

print(f"✅ Processed dataset saved as {output_csv}")
