<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/Ridge_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##FIRST

In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IO.xlsx")  # Adjust path if needed
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           I       0.94      0.94      0.94       548
           O       1.00      1.00      1.00     11201

    accuracy                           0.99     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9945
Precision: 0.9945
Recall: 0.9945
F1 Score: 0.9945


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (BI tagging scheme)
sentences, tags = load_excel_data("/content/BI.xlsx")  # ← updated path to BI.xlsx
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       251
          BO       0.70      0.69      0.69       197
           I       0.91      0.90      0.91       294
          IO       0.99      0.99      0.99     11007

    accuracy                           0.98     11749
   macro avg       0.89      0.89      0.89     11749
weighted avg       0.98      0.98      0.98     11749

📊 Evaluation Results:
Accuracy: 0.9841
Precision: 0.9840
Recall: 0.9841
F1 Score: 0.9840


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (BIES tagging scheme)
sentences, tags = load_excel_data("/content/BIES.xlsx")  # ← Path to BIES tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       247
          BO       0.74      0.62      0.67       190
           E       0.88      0.89      0.89       248
          EO       0.95      0.94      0.94       257
           I       0.92      0.78      0.85        46
          IO       0.99      0.99      0.99     10750
           S       1.00      1.00      1.00         4
          SO       1.00      0.29      0.44         7

    accuracy                           0.98     11749
   macro avg       0.93      0.81      0.84     11749
weighted avg       0.98      0.98      0.98     11749

📊 Evaluation Results:
Accuracy: 0.9814
Precision: 0.9808
Recall: 0.9814
F1 Score: 0.9809


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IE tagging scheme)
sentences, tags = load_excel_data("/content/IE.xlsx")  # ← Path to IE tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           E       0.88      0.89      0.89       252
          EO       0.96      0.94      0.95       264
           I       0.96      0.95      0.95       293
          IO       1.00      1.00      1.00     10940

    accuracy                           0.99     11749
   macro avg       0.95      0.94      0.95     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9914
Precision: 0.9914
Recall: 0.9914
F1 Score: 0.9914


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IOB tagging scheme)
sentences, tags = load_excel_data("/content/IOB.xlsx")  # ← Path to IOB tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       252
           I       0.92      0.91      0.91       296
           O       1.00      1.00      1.00     11201

    accuracy                           0.99     11749
   macro avg       0.96      0.96      0.96     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9945
Precision: 0.9944
Recall: 0.9945
F1 Score: 0.9945


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IOBES tagging scheme)
sentences, tags = load_excel_data("/content/IOBES.xlsx")  # ← Path to IOBES tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       247
           E       0.88      0.90      0.89       248
           I       0.92      0.78      0.85        46
           O       1.00      1.00      1.00     11204
           S       1.00      1.00      1.00         4

    accuracy                           0.99     11749
   macro avg       0.95      0.93      0.94     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9936
Precision: 0.9936
Recall: 0.9936
F1 Score: 0.9936


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IOE tagging scheme)
sentences, tags = load_excel_data("/content/IOE.xlsx")  # ← Path to IOE tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           E       0.88      0.90      0.89       252
           I       0.96      0.95      0.95       293
           O       1.00      1.00      1.00     11204

    accuracy                           0.99     11749
   macro avg       0.95      0.95      0.95     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9936
Precision: 0.9936
Recall: 0.9936
F1 Score: 0.9936


##SECOND

In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IO2.xlsx")  # Adjust path if needed
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           I       0.95      0.93      0.94       591
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.97      0.96      0.97     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9940
Precision: 0.9940
Recall: 0.9940
F1 Score: 0.9940


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (BI tagging scheme)
sentences, tags = load_excel_data("/content/BI2.xlsx")  # ← updated path to BI.xlsx
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       266
          BO       0.74      0.73      0.73       210
           I       0.93      0.89      0.91       325
          IO       0.99      0.99      0.99     10948

    accuracy                           0.98     11749
   macro avg       0.91      0.90      0.90     11749
weighted avg       0.98      0.98      0.98     11749

📊 Evaluation Results:
Accuracy: 0.9845
Precision: 0.9844
Recall: 0.9845
F1 Score: 0.9844


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (BIES tagging scheme)
sentences, tags = load_excel_data("/content/BIES2.xlsx")  # ← Path to BIES tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       262
          BO       0.75      0.70      0.72       202
           E       0.91      0.88      0.90       271
          EO       0.95      0.93      0.94       266
           I       0.94      0.81      0.87        54
          IO       0.99      0.99      0.99     10682
           S       1.00      1.00      1.00         4
          SO       1.00      0.25      0.40         8

    accuracy                           0.98     11749
   macro avg       0.94      0.82      0.85     11749
weighted avg       0.98      0.98      0.98     11749

📊 Evaluation Results:
Accuracy: 0.9815
Precision: 0.9812
Recall: 0.9815
F1 Score: 0.9812


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IE tagging scheme)
sentences, tags = load_excel_data("/content/IE2.xlsx")  # ← Path to IE tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           E       0.91      0.89      0.90       275
          EO       0.96      0.93      0.94       274
           I       0.97      0.95      0.96       316
          IO       0.99      1.00      1.00     10884

    accuracy                           0.99     11749
   macro avg       0.96      0.94      0.95     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9911
Precision: 0.9911
Recall: 0.9911
F1 Score: 0.9911


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IOB tagging scheme)
sentences, tags = load_excel_data("/content/IOB2.xlsx")  # ← Path to IOB tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       266
           I       0.93      0.90      0.91       325
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.97      0.96      0.96     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9941
Precision: 0.9941
Recall: 0.9941
F1 Score: 0.9941


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IOBES tagging scheme)
sentences, tags = load_excel_data("/content/IOBES2.xlsx")  # ← Path to IOBES tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.97      0.97       262
           E       0.91      0.89      0.90       271
           I       0.94      0.81      0.87        54
           O       1.00      1.00      1.00     11158
           S       1.00      1.00      1.00         4

    accuracy                           0.99     11749
   macro avg       0.96      0.93      0.95     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9937
Precision: 0.9936
Recall: 0.9937
F1 Score: 0.9936


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data (IOE tagging scheme)
sentences, tags = load_excel_data("/content/IOE.xlsx")  # ← Path to IOE tagging file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train RidgeClassifier model
vec = DictVectorizer(sparse=True)
clf = RidgeClassifier()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


🔍 Classification Report:
              precision    recall  f1-score   support

           E       0.88      0.90      0.89       252
           I       0.96      0.95      0.95       293
           O       1.00      1.00      1.00     11204

    accuracy                           0.99     11749
   macro avg       0.95      0.95      0.95     11749
weighted avg       0.99      0.99      0.99     11749

📊 Evaluation Results:
Accuracy: 0.9936
Precision: 0.9936
Recall: 0.9936
F1 Score: 0.9936
