In [1]:
!pip install pandas nltk python-Levenshtein
!python -m nltk.downloader wordnet punkt

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

### Imports

In [2]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import Levenshtein
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

Load Data and Format

In [3]:
def load_data(notes_path, labels_path):
    encodings = ['utf-8', 'cp1252', 'latin1', 'iso-8859-1']

    for encoding in encodings:
        try:
            notes_df = pd.read_csv(notes_path, encoding=encoding)
            labels_df = pd.read_csv(labels_path, encoding=encoding)
            break
        except UnicodeDecodeError:
            continue

    # Merge notes with labels using ID only
    merged = labels_df.merge(
        notes_df,
        how='left',
        on=['ID'],
        suffixes=('', '_note')
    )

    # Extract the correct segment notes
    merged['Note'] = merged.apply(
        lambda x: x[f'Segment{x.Segment}_Notes'] if pd.notna(x[f'Segment{x.Segment}_Notes']) else "",
        axis=1
    )

    return merged[['ID', 'Segment', 'IdeaUnit', 'Note', 'label']]

### Preprocess all text:

In [4]:
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    """Clean and tokenize text."""
    if not isinstance(text, str):
        return []

    # Standardize text
    text = (text.lower()
            .replace("ex)", "example")
            .replace("-", " ")
            .replace(":", " "))

    # Tokenize and remove punctuation
    words = [w for w in word_tokenize(text) if w.isalnum()]

    # Lemmatize (reduce to base form)
    return [lemmatizer.lemmatize(w) for w in words]

## Rules for determining 1 or 0

In [5]:
def get_synonyms(word):
    """Get synonyms using WordNet."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms

def keyword_overlap(idea_unit, note, threshold=0.5):
    """Check if sufficient keywords match between IdeaUnit and Note."""
    idea_words = preprocess(idea_unit)
    note_words = preprocess(note)

    if not idea_words:
        return False

    # Expand with synonyms
    synonym_expanded = set(idea_words)
    for word in idea_words:
        synonym_expanded.update(get_synonyms(word))

    # Count overlaps
    overlap = len([w for w in note_words if w in synonym_expanded])
    return (overlap / len(idea_words)) >= threshold

#Verify proper nouns and technical terms are correctly spelled.

def strict_typo_check(idea_unit, note, max_distance=2):
    idea_proper_nouns = [w for w in preprocess(idea_unit) if w[0].isupper()]

    for pn in idea_proper_nouns:
        # Find closest match in note
        note_words = preprocess(note)
        closest_match = min(
            [Levenshtein.distance(pn, nw) for nw in note_words],
            default=max_distance+1
        )

        if closest_match > max_distance:
            return False
    return True

In [15]:
def predict_label(idea_unit, note, threshold=0.5):
    """Rule-based classifier."""
    # Negative rules
    if len(preprocess(note)) < 5:
        return 0

    if not strict_typo_check(idea_unit, note):
        return 0

    # Positive rules
    if keyword_overlap(idea_unit, note, threshold):
        return 1

    return 0

In [13]:
def print_metrics(y_true, y_pred):
    """Print classification metrics."""
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)
    print(f"\nAccuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")

In [7]:
# Load data (example paths)
train_data = load_data("/content/drive/MyDrive/FinalProj/Notes.csv", "/content/drive/MyDrive/FinalProj/train.csv")
test_data = load_data("/content/drive/MyDrive/FinalProj/Notes.csv", "/content/drive/MyDrive/FinalProj/test.csv")
# Train threshold optimization
best_threshold = 0.5
best_f1 = 0

In [16]:
for threshold in np.arange(0.3, 0.71, 0.02):
    train_data['pred'] = train_data.apply(
        lambda x: predict_label(x['IdeaUnit'], x['Note'], threshold),
        axis=1
    )

    f1 = f1_score(train_data['label'], train_data['pred'])
    print(f"Threshold {threshold}: F1 = {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"\nBest threshold: {best_threshold} (F1={best_f1:.4f})")

# Evaluate on training data
print("\nTraining Set Performance:")
print_metrics(train_data['label'], train_data['pred'])

# Test evaluation
print("\nEvaluating on test data...")
# Drop NaN labels (recommended)
test_data_clean = test_data.dropna(subset=['label'])

# Generate predictions
test_data_clean['pred'] = test_data_clean.apply(
    lambda x: predict_label(x['IdeaUnit'], x['Note'], best_threshold),
    axis=1
)


print("\nTest Set Performance:")
print_metrics(test_data_clean['label'],test_data_clean['pred'])

# Save predictions
test_data_clean[['ID', 'Segment', 'IdeaUnit', 'pred']].to_csv("predictions.csv", index=False)
print("\nPredictions saved to 'predictions2.csv'")

Threshold 0.3: F1 = 0.6630
Threshold 0.32: F1 = 0.6685
Threshold 0.34: F1 = 0.6686
Threshold 0.36000000000000004: F1 = 0.6648
Threshold 0.38000000000000006: F1 = 0.6648
Threshold 0.4000000000000001: F1 = 0.6764
Threshold 0.4200000000000001: F1 = 0.6764
Threshold 0.4400000000000001: F1 = 0.6667
Threshold 0.46000000000000013: F1 = 0.6667
Threshold 0.48000000000000015: F1 = 0.6727
Threshold 0.5000000000000002: F1 = 0.6667
Threshold 0.5200000000000002: F1 = 0.6667
Threshold 0.5400000000000003: F1 = 0.6709
Threshold 0.5600000000000003: F1 = 0.6645
Threshold 0.5800000000000003: F1 = 0.6623
Threshold 0.6000000000000003: F1 = 0.6579
Threshold 0.6200000000000003: F1 = 0.6645
Threshold 0.6400000000000003: F1 = 0.6554
Threshold 0.6600000000000004: F1 = 0.6554
Threshold 0.6800000000000004: F1 = 0.6621
Threshold 0.7000000000000004: F1 = 0.6644

Best threshold: 0.4000000000000001 (F1=0.6764)

Training Set Performance:
Confusion Matrix:
[[60 71]
 [27 97]]

Accuracy: 0.6157
Precision: 0.5774
Recall: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_clean['pred'] = test_data_clean.apply(


In [None]:
print(f"Missing labels in test: {test_data['label'].isna().sum()}")

Missing labels in test: 4


In [11]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True