<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/hyperparametersSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##FIRST

##SVM

In [None]:
import pandas as pd

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y


In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

sentences, tags = load_excel_data("/content/IO.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           I       0.95      0.95      0.95       548
           O       1.00      1.00      1.00     11201

    accuracy                           1.00     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9954
Precision: 0.9954
Recall: 0.9954
F1 Score: 0.9954


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# (Include your existing functions load_excel_data, word2features, prepare_data here)

# Load and prepare data
sentences, tags = load_excel_data("/content/IOB.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Create pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Define hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],            # Regularization parameter
    'clf__max_iter': [1000, 5000],                # Number of iterations
    'clf__class_weight': [None, 'balanced'],      # Handle class imbalance
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit GridSearch on training data
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters found:", grid_search.best_params_)

# Predict using best estimator
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate results
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       252
           I       0.93      0.93      0.93       296
           O       1.00      1.00      1.00     11201

    accuracy                           1.00     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9957
Precision: 0.9956
Recall: 0.9957
F1 Score: 0.9957


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Your existing functions for IOE tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# Load IOE data
sentences, tags = load_excel_data("/content/IOE.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           E       0.91      0.92      0.92       252
           I       0.98      0.97      0.97       293
           O       1.00      1.00      1.00     11204

    accuracy                           1.00     11749
   macro avg       0.96      0.96      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9952
Precision: 0.9953
Recall: 0.9952
F1 Score: 0.9952


In [None]:
# Load IOBES data
sentences, tags = load_excel_data("/content/IOBES.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid (same as before)
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.98      0.99       247
           E       0.90      0.92      0.91       248
           I       0.95      0.85      0.90        46
           O       1.00      1.00      1.00     11204
           S       1.00      1.00      1.00         4

    accuracy                           1.00     11749
   macro avg       0.97      0.95      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9952
Precision: 0.9953
Recall: 0.9952
F1 Score: 0.9952


In [None]:
# Load IE data
sentences, tags = load_excel_data("/content/IE.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           E       0.90      0.92      0.91       252
          EO       0.96      0.93      0.95       264
           I       0.98      0.97      0.97       293
          IO       1.00      1.00      1.00     10940

    accuracy                           0.99     11749
   macro avg       0.96      0.95      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9928
Precision: 0.9928
Recall: 0.9928
F1 Score: 0.9928


In [None]:
# Load BI data
sentences, tags = load_excel_data("/content/BI.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 10, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       251
          BO       0.76      0.81      0.79       197
           I       0.90      0.93      0.91       294
          IO       0.99      0.99      0.99     11007

    accuracy                           0.99     11749
   macro avg       0.91      0.93      0.92     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9873
Precision: 0.9877
Recall: 0.9873
F1 Score: 0.9875


In [None]:
# Load BIES data
sentences, tags = load_excel_data("/content/BIES.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       247
          BO       0.77      0.79      0.78       190
           E       0.89      0.92      0.91       248
          EO       0.96      0.93      0.94       257
           I       0.95      0.85      0.90        46
          IO       0.99      0.99      0.99     10750
           S       1.00      1.00      1.00         4
          SO       0.67      0.57      0.62         7

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9851
Precision: 0.9852
Recall: 0.9851
F1 Score: 0.9851


##SECOND

In [None]:
import pandas as pd

# 1. Load Excel Data
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# 3. Prepare dataset
def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y


In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

sentences, tags = load_excel_data("/content/IO2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           I       0.95      0.95      0.95       591
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.97      0.97      0.97     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9950
Precision: 0.9950
Recall: 0.9950
F1 Score: 0.9950


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# (Include your existing functions load_excel_data, word2features, prepare_data here)

# Load and prepare data
sentences, tags = load_excel_data("/content/IOB2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Create pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Define hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],            # Regularization parameter
    'clf__max_iter': [1000, 5000],                # Number of iterations
    'clf__class_weight': [None, 'balanced'],      # Handle class imbalance
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit GridSearch on training data
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters found:", grid_search.best_params_)

# Predict using best estimator
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate results
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 10, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       266
           I       0.91      0.92      0.91       325
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.96      0.97      0.97     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9946
Precision: 0.9947
Recall: 0.9946
F1 Score: 0.9947


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Your existing functions for IOE tags
def load_excel_data(file_path):
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        if str(word).strip() in ['.', '؟']:  # Sentence boundary
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(str(word).strip())
            label.append(str(tag).strip())

    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

def word2features(sent, i):
    word = sent[i]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1]
        features.update({
            '-1:word': word1,
            '-1:is_title': word1.istitle(),
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1]
        features.update({
            '+1:word': word1,
            '+1:is_title': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features

def prepare_data(sentences, labels):
    X, y = [], []
    for sent, label_seq in zip(sentences, labels):
        for i in range(len(sent)):
            feats = word2features(sent, i)
            X.append(feats)
            y.append(label_seq[i])
    return X, y

# Load IOE data
sentences, tags = load_excel_data("/content/IOE2.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 10, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           E       0.93      0.92      0.92       275
           I       0.96      0.97      0.97       316
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.96      0.96      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9946
Precision: 0.9946
Recall: 0.9946
F1 Score: 0.9946


In [None]:
# Load IOBES data
sentences, tags = load_excel_data("/content/IOBES2.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid (same as before)
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       262
           E       0.93      0.92      0.92       271
           I       0.86      0.93      0.89        54
           O       1.00      1.00      1.00     11158
           S       1.00      1.00      1.00         4

    accuracy                           1.00     11749
   macro avg       0.96      0.97      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9951
Precision: 0.9952
Recall: 0.9951
F1 Score: 0.9952


In [None]:
# Load IE data
sentences, tags = load_excel_data("/content/IE2.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           E       0.93      0.91      0.92       275
          EO       0.97      0.92      0.94       274
           I       0.97      0.98      0.98       316
          IO       1.00      1.00      1.00     10884

    accuracy                           0.99     11749
   macro avg       0.97      0.95      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9928
Precision: 0.9927
Recall: 0.9928
F1 Score: 0.9927


In [None]:
# Load BI data
sentences, tags = load_excel_data("/content/BI2.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 10, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.98      0.99       266
          BO       0.79      0.81      0.80       210
           I       0.92      0.92      0.92       325
          IO       0.99      0.99      0.99     10948

    accuracy                           0.99     11749
   macro avg       0.92      0.93      0.93     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9878
Precision: 0.9879
Recall: 0.9878
F1 Score: 0.9879


In [None]:
# Load BIES data
sentences, tags = load_excel_data("/content/BIES2.xlsx")

# Prepare data
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('clf', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 5000],
    'clf__class_weight': [None, 'balanced'],
}

# GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Results
print("Best parameters found:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'clf__C': 1, 'clf__class_weight': None, 'clf__max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       262
          BO       0.78      0.82      0.80       202
           E       0.93      0.92      0.92       271
          EO       0.96      0.93      0.94       266
           I       0.86      0.93      0.89        54
          IO       0.99      0.99      0.99     10682
           S       1.00      1.00      1.00         4
          SO       0.67      0.50      0.57         8

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9854
Precision: 0.9856
Recall: 0.9854
F1 Score: 0.9855
