<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FIRST

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data (same)
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:
            # Boundary token: flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for a word
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

# 3. Convert sentence to features
def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 4. Read the IO-tagged corpus, build one feature sequence per sentence, and
#    hold out 20% of the sentences for testing (fixed seed keeps the split repeatable)
sentences, tags = load_excel_data("/content/IO.xlsx")
X = list(map(sent2features, sentences))
y = tags

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,  # L2 regularization
    c1=0.1,  # L1 regularization
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           I       0.99      0.94      0.96       595
           O       1.00      1.00      1.00     11167

    accuracy                           1.00     11762
   macro avg       0.99      0.97      0.98     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9964
Precision: 0.9964
Recall: 0.9964
F1 Score: 0.9964


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IE tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:
            # Boundary token: flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the IE-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/IE.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.96      0.94      0.95       264
          EO       0.98      0.97      0.98       256
           I       0.98      0.94      0.96       327
          IO       1.00      1.00      1.00     10915

    accuracy                           1.00     11762
   macro avg       0.98      0.96      0.97     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9953
Precision: 0.9953
Recall: 0.9953
F1 Score: 0.9953


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IOB tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            # Flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the IOB-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/IOB.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       1.00      0.97      0.98       266
           I       1.00      0.93      0.96       329
           O       1.00      1.00      1.00     11167

    accuracy                           1.00     11762
   macro avg       1.00      0.96      0.98     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9970
Precision: 0.9970
Recall: 0.9970
F1 Score: 0.9970


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IOBES tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            # Flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the IOBES-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/IOBES.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.97      0.98       261
           E       0.96      0.94      0.95       261
           I       1.00      0.82      0.90        66
           O       1.00      1.00      1.00     11171
           S       1.00      0.67      0.80         3

    accuracy                           1.00     11762
   macro avg       0.99      0.88      0.93     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9963
Precision: 0.9963
Recall: 0.9963
F1 Score: 0.9963


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IOE tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            # Flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the IOE-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/IOE.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.96      0.94      0.95       264
           I       0.99      0.93      0.96       327
           O       1.00      1.00      1.00     11171

    accuracy                           1.00     11762
   macro avg       0.98      0.96      0.97     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9960
Precision: 0.9960
Recall: 0.9960
F1 Score: 0.9960


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with BI tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            # Flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the BI-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/BI.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.97      0.98       264
          BO       0.95      0.94      0.94       201
           I       0.99      0.93      0.96       327
          IO       1.00      1.00      1.00     10970

    accuracy                           1.00     11762
   macro avg       0.98      0.96      0.97     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9954
Precision: 0.9954
Recall: 0.9954
F1 Score: 0.9954


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with BIES tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            # Flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the BIES-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/BIES.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.96      0.98       261
          BO       0.95      0.94      0.94       190
           E       0.97      0.94      0.96       261
          EO       0.98      0.96      0.97       244
           I       1.00      0.83      0.91        66
          IO       1.00      1.00      1.00     10726
           S       1.00      0.67      0.80         3
          SO       1.00      0.73      0.84        11

    accuracy                           0.99     11762
   macro avg       0.99      0.88      0.92     11762
weighted avg       0.99      0.99      0.99     11762

Evaluation Results:
Accuracy: 0.9941
Precision: 0.9941
Recall: 0.9941
F1 Score: 0.9940


## SECOND

In [None]:
!pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [None]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data (same)
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:
            # Boundary token: flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for a word
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

# 3. Convert sentence to features
def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 4. Read the IO-tagged corpus, build one feature sequence per sentence, and
#    hold out 20% of the sentences for testing (fixed seed keeps the split repeatable)
sentences, tags = load_excel_data("/content/IO.xlsx")
X = list(map(sent2features, sentences))
y = tags

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,  # L2 regularization
    c1=0.1,  # L1 regularization
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           I       0.99      0.96      0.97       624
           O       1.00      1.00      1.00     11138

    accuracy                           1.00     11762
   macro avg       0.99      0.98      0.99     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9971
Precision: 0.9971
Recall: 0.9971
F1 Score: 0.9971


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IE tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:
            # Boundary token: flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return the feature dicts for every token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings (must support ``len`` and indexing).

    Returns:
        list with one ``word2features`` result per position of ``sent``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# 3. Read the IE-tagged corpus and build one feature sequence per sentence
sentences, tags = load_excel_data("/content/IE.xlsx")  # Colab upload location; adjust if the file lives elsewhere
X = list(map(sent2features, sentences))
y = tags

# 4. Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit the CRF with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# 6. Tag the held-out sentences and print token-level scores
y_pred = crf.predict(X_test)

print(
    "Classification Report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

# NOTE(review): 'weighted' averages each tag's score by its support, so the most
# frequent tag dominates the three figures below; confirm that is the intended summary.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.99      0.93      0.96       283
          EO       1.00      0.94      0.97       275
           I       0.99      0.93      0.96       341
          IO       0.99      1.00      1.00     10863

    accuracy                           0.99     11762
   macro avg       0.99      0.95      0.97     11762
weighted avg       0.99      0.99      0.99     11762

Evaluation Results:
Accuracy: 0.9946
Precision: 0.9946
Recall: 0.9946
F1 Score: 0.9946


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IOB tags
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    A path ending in ``.csv`` (any letter case) is read with ``pd.read_csv``;
    every other path is handed to ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept, and rows with a missing value in
    either column are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` ends the current sentence;
    that row's word and tag are not added to any sentence.

    Args:
        file_path: Location of the data file, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of stripped word strings of sentence ``k`` and
        ``labels[k]`` holds the stripped tag string of each of those words.
    """
    # str() accepts pathlib.Path as well as str; lower() makes '.CSV' count
    # as CSV instead of being sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            # Flush the sentence collected so far (if any).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    # Keep a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    The dict always holds the token itself (``'word'``) and its
    ``str.isupper`` / ``str.istitle`` / ``str.isdigit`` flags. When ``i > 0``
    the previous token's text and title-case flag are stored under the
    ``'-1:'`` prefix, otherwise ``'BOS'`` is set. When ``len(sent) - 1 > i``
    the next token is stored under the ``'+1:'`` prefix, otherwise ``'EOS'``
    is set.

    Args:
        sent: Indexable sequence of token strings.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to ``str`` or ``bool`` values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left context: the previous token, or a marker for the first position.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context: the next token, or a marker for the last position.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# 3. Load and prepare IOB dataset
# `sentences` and `tags` are parallel lists of per-sentence token / tag lists.
sentences, tags = load_excel_data("/content/IOB.xlsx")  # Hard-coded path; replace with the location of your IOB file
# One list of feature dicts per sentence, aligned token-for-token with `y`.
X = [sent2features(s) for s in sentences]
y = tags

# 4. Train-test split
# The split is made over whole sentences (the elements of X), holding out 20%;
# random_state=42 keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train CRF model
# L-BFGS optimisation; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # Also create transition features for tag pairs absent from the training data.
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = crf.predict(X_test)

# The flat_* metrics flatten the per-sentence sequences and score individual tokens.
print("Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))

# NOTE(review): average='weighted' weights each tag by its support, so the most
# frequent tag dominates these figures; consider also reporting entity tags only.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       1.00      0.95      0.97       283
           I       1.00      0.94      0.97       341
           O       1.00      1.00      1.00     11138

    accuracy                           1.00     11762
   macro avg       1.00      0.97      0.98     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9971
Precision: 0.9971
Recall: 0.9971
F1 Score: 0.9971


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IOBES tags
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label). Rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: Location of the data file, as ``str`` or path-like
            object. A name ending in ``.csv`` (any letter case) is read
            with ``pd.read_csv``; anything else goes to ``pd.read_excel``.
        sentence_end_tokens: Tokens that close the current sentence. They
            act as delimiters only and are never added to a sentence.
            Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[k]``
        holds the tokens of the k-th sentence and ``labels[k]`` their tags.
    """
    # str() lets pathlib.Path objects through; lower() makes ".CSV" count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    boundaries = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in boundaries:
            # Flush only non-empty sentences so that consecutive delimiters
            # never produce empty entries.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a trailing sentence that has no closing delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Build the CRF feature dict for the token at index ``i`` of ``sent``.

    The dict holds the token itself, three casing/digit flags taken from
    the ``str`` predicates, and the same kind of information for the
    immediate left and right neighbours. When a neighbour does not exist,
    a ``'BOS'`` / ``'EOS'`` marker is set instead.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Position of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left-hand context, or a sentence-start marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right-hand context, or a sentence-end marker when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# 3. Load and prepare IOBES dataset
# `sentences` and `tags` are parallel lists of per-sentence token / tag lists.
sentences, tags = load_excel_data("/content/IOBES.xlsx")  # Hard-coded path; replace with the location of your IOBES file
# One list of feature dicts per sentence, aligned token-for-token with `y`.
X = [sent2features(s) for s in sentences]
y = tags

# 4. Train-test split
# The split is made over whole sentences (the elements of X), holding out 20%;
# random_state=42 keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train CRF model
# L-BFGS optimisation; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # Also create transition features for tag pairs absent from the training data.
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = crf.predict(X_test)

# The flat_* metrics flatten the per-sentence sequences and score individual tokens.
print("Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))

# NOTE(review): average='weighted' weights each tag by its support, so the most
# frequent tag dominates these figures; consider also reporting entity tags only.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       1.00      0.96      0.98       278
           E       0.99      0.95      0.97       278
           I       1.00      0.87      0.93        63
           O       1.00      1.00      1.00     11138
           S       1.00      0.60      0.75         5

    accuracy                           1.00     11762
   macro avg       1.00      0.88      0.93     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9968
Precision: 0.9968
Recall: 0.9968
F1 Score: 0.9967


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with IOE tags
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label). Rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: Location of the data file, as ``str`` or path-like
            object. A name ending in ``.csv`` (any letter case) is read
            with ``pd.read_csv``; anything else goes to ``pd.read_excel``.
        sentence_end_tokens: Tokens that close the current sentence. They
            act as delimiters only and are never added to a sentence.
            Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[k]``
        holds the tokens of the k-th sentence and ``labels[k]`` their tags.
    """
    # str() lets pathlib.Path objects through; lower() makes ".CSV" count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    boundaries = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in boundaries:
            # Flush only non-empty sentences so that consecutive delimiters
            # never produce empty entries.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a trailing sentence that has no closing delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Build the CRF feature dict for the token at index ``i`` of ``sent``.

    The dict holds the token itself, three casing/digit flags taken from
    the ``str`` predicates, and the same kind of information for the
    immediate left and right neighbours. When a neighbour does not exist,
    a ``'BOS'`` / ``'EOS'`` marker is set instead.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Position of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left-hand context, or a sentence-start marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right-hand context, or a sentence-end marker when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# 3. Load and prepare IOE dataset
# `sentences` and `tags` are parallel lists of per-sentence token / tag lists.
sentences, tags = load_excel_data("/content/IOE.xlsx")  # Hard-coded path; replace with the location of your IOE file
# One list of feature dicts per sentence, aligned token-for-token with `y`.
X = [sent2features(s) for s in sentences]
y = tags

# 4. Train-test split
# The split is made over whole sentences (the elements of X), holding out 20%;
# random_state=42 keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train CRF model
# L-BFGS optimisation; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # Also create transition features for tag pairs absent from the training data.
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = crf.predict(X_test)

# The flat_* metrics flatten the per-sentence sequences and score individual tokens.
print("Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))

# NOTE(review): average='weighted' weights each tag by its support, so the most
# frequent tag dominates these figures; consider also reporting entity tags only.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           E       0.99      0.95      0.97       283
           I       1.00      0.94      0.97       341
           O       1.00      1.00      1.00     11138

    accuracy                           1.00     11762
   macro avg       0.99      0.96      0.98     11762
weighted avg       1.00      1.00      1.00     11762

Evaluation Results:
Accuracy: 0.9969
Precision: 0.9969
Recall: 0.9969
F1 Score: 0.9969


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with BI tags
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label). Rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: Location of the data file, as ``str`` or path-like
            object. A name ending in ``.csv`` (any letter case) is read
            with ``pd.read_csv``; anything else goes to ``pd.read_excel``.
        sentence_end_tokens: Tokens that close the current sentence. They
            act as delimiters only and are never added to a sentence.
            Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[k]``
        holds the tokens of the k-th sentence and ``labels[k]`` their tags.
    """
    # str() lets pathlib.Path objects through; lower() makes ".CSV" count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    boundaries = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in boundaries:
            # Flush only non-empty sentences so that consecutive delimiters
            # never produce empty entries.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a trailing sentence that has no closing delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Build the CRF feature dict for the token at index ``i`` of ``sent``.

    The dict holds the token itself, three casing/digit flags taken from
    the ``str`` predicates, and the same kind of information for the
    immediate left and right neighbours. When a neighbour does not exist,
    a ``'BOS'`` / ``'EOS'`` marker is set instead.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Position of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left-hand context, or a sentence-start marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right-hand context, or a sentence-end marker when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# 3. Load and prepare BI dataset
# `sentences` and `tags` are parallel lists of per-sentence token / tag lists.
sentences, tags = load_excel_data("/content/BI.xlsx")  # Hard-coded path; replace with the location of your BI file
# One list of feature dicts per sentence, aligned token-for-token with `y`.
X = [sent2features(s) for s in sentences]
y = tags

# 4. Train-test split
# The split is made over whole sentences (the elements of X), holding out 20%;
# random_state=42 keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train CRF model
# L-BFGS optimisation; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # Also create transition features for tag pairs absent from the training data.
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = crf.predict(X_test)

# The flat_* metrics flatten the per-sentence sequences and score individual tokens.
print("Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))

# NOTE(review): average='weighted' weights each tag by its support, so the most
# frequent tag dominates these figures; consider also reporting entity tags only.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       1.00      0.94      0.97       283
          BO       0.99      0.92      0.95       220
           I       1.00      0.93      0.96       341
          IO       0.99      1.00      1.00     10918

    accuracy                           0.99     11762
   macro avg       0.99      0.95      0.97     11762
weighted avg       0.99      0.99      0.99     11762

Evaluation Results:
Accuracy: 0.9948
Precision: 0.9948
Recall: 0.9948
F1 Score: 0.9947


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 1. Load Excel Data with BIES tags
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label). Rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: Location of the data file, as ``str`` or path-like
            object. A name ending in ``.csv`` (any letter case) is read
            with ``pd.read_csv``; anything else goes to ``pd.read_excel``.
        sentence_end_tokens: Tokens that close the current sentence. They
            act as delimiters only and are never added to a sentence.
            Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[k]``
        holds the tokens of the k-th sentence and ``labels[k]`` their tags.
    """
    # str() lets pathlib.Path objects through; lower() makes ".CSV" count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    boundaries = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in boundaries:
            # Flush only non-empty sentences so that consecutive delimiters
            # never produce empty entries.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a trailing sentence that has no closing delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction for CRF
def word2features(sent, i):
    """Build the CRF feature dict for the token at index ``i`` of ``sent``.

    The dict holds the token itself, three casing/digit flags taken from
    the ``str`` predicates, and the same kind of information for the
    immediate left and right neighbours. When a neighbour does not exist,
    a ``'BOS'`` / ``'EOS'`` marker is set instead.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Position of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    token = sent[i]
    feats = {}
    feats['word'] = token
    feats['is_upper'] = token.isupper()
    feats['is_title'] = token.istitle()
    feats['is_digit'] = token.isdigit()

    # Left-hand context, or a sentence-start marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right-hand context, or a sentence-end marker when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# 3. Load and prepare BIES dataset
# `sentences` and `tags` are parallel lists of per-sentence token / tag lists.
sentences, tags = load_excel_data("/content/BIES.xlsx")  # Hard-coded path; replace with the location of your BIES file
# One list of feature dicts per sentence, aligned token-for-token with `y`.
X = [sent2features(s) for s in sentences]
y = tags

# 4. Train-test split
# The split is made over whole sentences (the elements of X), holding out 20%;
# random_state=42 keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train CRF model
# L-BFGS optimisation; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # Also create transition features for tag pairs absent from the training data.
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = crf.predict(X_test)

# The flat_* metrics flatten the per-sentence sequences and score individual tokens.
print("Classification Report:")
print(metrics.flat_classification_report(y_test, y_pred))

# NOTE(review): average='weighted' weights each tag by its support, so the most
# frequent tag dominates these figures; consider also reporting entity tags only.
print("Evaluation Results:")
print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {metrics.flat_precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {metrics.flat_recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {metrics.flat_f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       1.00      0.93      0.96       278
          BO       0.97      0.92      0.94       205
           E       0.98      0.92      0.95       278
          EO       0.99      0.95      0.97       259
           I       1.00      0.83      0.90        63
          IO       0.99      1.00      1.00     10659
           S       1.00      0.60      0.75         5
          SO       1.00      0.53      0.70        15

    accuracy                           0.99     11762
   macro avg       0.99      0.83      0.90     11762
weighted avg       0.99      0.99      0.99     11762

Evaluation Results:
Accuracy: 0.9919
Precision: 0.9919
Recall: 0.9919
F1 Score: 0.9917


## SECOND