<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##FIRST


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IO.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train SVM model
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           I       0.95      0.95      0.95       548
           O       1.00      1.00      1.00     11201

    accuracy                           1.00     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9954
Precision: 0.9954
Recall: 0.9954
F1 Score: 0.9954


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IOB Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare token-level data
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IOB.xlsx")  # Make sure this is the IOB-tagged file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Vectorize and Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       252
           I       0.93      0.93      0.93       296
           O       1.00      1.00      1.00     11201

    accuracy                           1.00     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9957
Precision: 0.9956
Recall: 0.9957
F1 Score: 0.9957


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IOE Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare data for SVM
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load IOE-tagged data
sentences, tags = load_excel_data("/content/IOE.xlsx")  # Make sure this file uses IOE tags

# 5. Extract features and labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.91      0.92      0.92       252
           I       0.98      0.97      0.97       293
           O       1.00      1.00      1.00     11204

    accuracy                           1.00     11749
   macro avg       0.96      0.96      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9952
Precision: 0.9953
Recall: 0.9952
F1 Score: 0.9952


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IOBES Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare data for SVM
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load IOBES-tagged data
sentences, tags = load_excel_data("/content/IOBES.xlsx")  # Make sure this file uses IOBES tags

# 5. Extract features and labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.98      0.99       247
           E       0.90      0.92      0.91       248
           I       0.95      0.85      0.90        46
           O       1.00      1.00      1.00     11204
           S       1.00      1.00      1.00         4

    accuracy                           1.00     11749
   macro avg       0.97      0.95      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9952
Precision: 0.9953
Recall: 0.9952
F1 Score: 0.9952


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IE Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence end
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare data
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load IE-tagged data
sentences, tags = load_excel_data("/content/IE.xlsx")  # Make sure this file uses IE tagging scheme

# 5. Extract features and labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.90      0.92      0.91       252
          EO       0.96      0.93      0.95       264
           I       0.98      0.97      0.97       293
          IO       1.00      1.00      1.00     10940

    accuracy                           0.99     11749
   macro avg       0.96      0.95      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9928
Precision: 0.9928
Recall: 0.9928
F1 Score: 0.9928


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with BI Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence end
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature Extraction for Each Token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare Features and Labels
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load BI-tagged data
sentences, tags = load_excel_data("/content/BI.xlsx")  # Ensure this is BI tagged

# 5. Extract Features and Labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM Model
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       251
          BO       0.76      0.80      0.78       197
           I       0.92      0.93      0.93       294
          IO       0.99      0.99      0.99     11007

    accuracy                           0.99     11749
   macro avg       0.91      0.92      0.92     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9878
Precision: 0.9880
Recall: 0.9878
F1 Score: 0.9879


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel with BIES Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence end
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract Features for Each Token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Format Features and Labels
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load BIES Data
sentences, tags = load_excel_data("/content/BIES.xlsx")  # Update with your BIES-labeled Excel path

# 5. Prepare Feature Vectors and Labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM Model
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       247
          BO       0.77      0.79      0.78       190
           E       0.89      0.92      0.91       248
          EO       0.96      0.93      0.94       257
           I       0.95      0.85      0.90        46
          IO       0.99      0.99      0.99     10750
           S       1.00      1.00      1.00         4
          SO       0.67      0.57      0.62         7

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9851
Precision: 0.9852
Recall: 0.9851
F1 Score: 0.9851


##SECOND



In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IO.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Build and train SVM model
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           I       0.95      0.95      0.95       591
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9950
Precision: 0.9950
Recall: 0.9950
F1 Score: 0.9950


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IOB Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare token-level data
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IOB.xlsx")  # Make sure this is the IOB-tagged file
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Vectorize and Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 6. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       266
           I       0.93      0.92      0.93       325
           O       1.00      1.00      1.00     11158

    accuracy                           1.00     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9956
Precision: 0.9956
Recall: 0.9956
F1 Score: 0.9956


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IOE Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare data for SVM
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load IOE-tagged data
sentences, tags = load_excel_data("/content/IOE.xlsx")  # Make sure this file uses IOE tags

# 5. Extract features and labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.93      0.92      0.92       275
           I       0.97      0.98      0.98       316
           O       1.00      1.00      1.00     11158

    accuracy                           1.00     11749
   macro avg       0.97      0.96      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9952
Precision: 0.9952
Recall: 0.9952
F1 Score: 0.9952


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IOBES Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare data for SVM
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load IOBES-tagged data
sentences, tags = load_excel_data("/content/IOBES.xlsx")  # Make sure this file uses IOBES tags

# 5. Extract features and labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       262
           E       0.93      0.92      0.92       271
           I       0.86      0.93      0.89        54
           O       1.00      1.00      1.00     11158
           S       1.00      1.00      1.00         4

    accuracy                           1.00     11749
   macro avg       0.96      0.97      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9951
Precision: 0.9952
Recall: 0.9951
F1 Score: 0.9952


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with IE Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence end
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

# 3. Prepare data
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load IE-tagged data
sentences, tags = load_excel_data("/content/IE.xlsx")  # Make sure this file uses IE tagging scheme

# 5. Extract features and labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.93      0.91      0.92       275
          EO       0.97      0.92      0.94       274
           I       0.97      0.98      0.98       316
          IO       1.00      1.00      1.00     10884

    accuracy                           0.99     11749
   macro avg       0.97      0.95      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9928
Precision: 0.9927
Recall: 0.9928
F1 Score: 0.9927


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel Data with BI Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence end
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature Extraction for Each Token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare Features and Labels
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load BI-tagged data
sentences, tags = load_excel_data("/content/BI.xlsx")  # Ensure this is BI tagged

# 5. Extract Features and Labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM Model
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Predict and Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       266
          BO       0.77      0.82      0.79       210
           I       0.93      0.92      0.93       325
          IO       0.99      0.99      0.99     10948

    accuracy                           0.99     11749
   macro avg       0.92      0.93      0.93     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9879
Precision: 0.9881
Recall: 0.9879
F1 Score: 0.9880


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel with BIES Tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence end
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract Features for Each Token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Format Features and Labels
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# 4. Load BIES Data
sentences, tags = load_excel_data("/content/BIES.xlsx")  # Update with your BIES-labeled Excel path

# 5. Prepare Feature Vectors and Labels
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 6. Train SVM Model
vec = DictVectorizer(sparse=True)
clf = LinearSVC()
pipeline = make_pipeline(vec, clf)
pipeline.fit(X_train, y_train)

# 7. Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       262
          BO       0.78      0.82      0.80       202
           E       0.93      0.92      0.92       271
          EO       0.96      0.93      0.94       266
           I       0.86      0.93      0.89        54
          IO       0.99      0.99      0.99     10682
           S       1.00      1.00      1.00         4
          SO       0.67      0.50      0.57         8

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9854
Precision: 0.9856
Recall: 0.9854
F1 Score: 0.9855
