<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/EnsembleSVMandDT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FIRST

In [None]:
# Ensemble of SVM and Decision Tree using VotingClassifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
import numpy as np

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file is read with ``pd.read_csv`` when its name ends in ``.csv``
    (compared case-insensitively) and with ``pd.read_excel`` otherwise. Only
    the ``'Word i'`` and ``'Word i entity tag'`` columns are used, and rows
    with a missing value in either column are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and the stripped tags of each sentence. The boundary
        tokens ``'.'`` and ``'؟'`` close the current sentence and are not
        emitted themselves (their tags are discarded as well).
    """
    # str() + lower() so that pathlib.Path inputs and '.CSV' names are routed
    # to read_csv instead of failing or falling through to read_excel.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            # Consecutive boundary tokens must not create empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict with the token itself, three casing/digit flags, and either
        the previous/next token (plus its title-case flag) or a ``'BOS'`` /
        ``'EOS'`` marker when there is no such neighbour.
    """
    current = sent[i]
    feats = {
        'word': current,
        'is_upper': current.isupper(),
        'is_title': current.istitle(),
        'is_digit': current.isdigit(),
    }

    # Left context, or a beginning-of-sentence marker for the first token.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context, or an end-of-sentence marker for the last token.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

# 3. Prepare dataset
def prepare_data(sentences, labels):
    """Flatten sentences into per-token feature dicts and their labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)`` where ``X[k]`` is the ``word2features`` dict of the k-th
        token overall and ``y[k]`` is the tag found at the same position.
    """
    feature_rows = []
    targets = []
    for tokens, tag_seq in zip(sentences, labels):
        for position, _ in enumerate(tokens):
            feature_rows.append(word2features(tokens, position))
            targets.append(tag_seq[position])
    return feature_rows, targets

# 4. Load and split data
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/IO.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Vectorize features (the vocabulary is learned from the training split only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# 6. Encode labels
# The encoder is fitted on every label so that a tag occurring only in the
# test split cannot make `le.transform(y_test)` raise ValueError.
le = LabelEncoder()
le.fit(y_all)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# 7. Define base classifiers
svm = LinearSVC(random_state=42)
dt = DecisionTreeClassifier(random_state=42)

# 8. Ensemble via hard voting
# NOTE(review): with two voters every disagreement is a 1-1 tie; scikit-learn
# documents that hard-voting ties go to the class that sorts first, so this is
# not a genuine majority vote. A third estimator or weights would be needed.
ensemble = VotingClassifier(estimators=[
    ('svm', svm),
    ('dt', dt)
], voting='hard')

# 9. Fit and predict (predictions are mapped back to the original tag strings)
ensemble.fit(X_train_vec, y_train_enc)
y_pred_enc = ensemble.predict(X_test_vec)
y_pred = le.inverse_transform(y_pred_enc)

# 10. Evaluate
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           I       0.93      0.96      0.95       548
           O       1.00      1.00      1.00     11201

    accuracy                           1.00     11749
   macro avg       0.97      0.98      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9951
Precision: 0.9952
Recall: 0.9951
F1 Score: 0.9951


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder

# 1. Load Excel Data with IOB Tags
def load_excel_data(file_path):
    """Load an IOB-tagged token table and split it into sentences.

    ``pd.read_csv`` is used when the file name ends in ``.csv`` (any letter
    case), ``pd.read_excel`` otherwise. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept; rows with a missing value in
    either of them are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: parallel lists of lists with the stripped
        words and stripped tags per sentence. ``'.'`` and ``'؟'`` end a
        sentence and are left out of the result together with their tags.
    """
    # Normalise to a lower-case string: accepts pathlib.Path and '.CSV'.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            # Repeated boundary tokens must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a final sentence that has no closing boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features per token
def word2features(sent, i):
    """Describe token ``i`` of ``sent`` as a feature dict.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        The token, its upper/title/digit flags, and for each side either the
        neighbouring token with its title flag or a ``'BOS'`` / ``'EOS'``
        marker when that neighbour does not exist.
    """
    token = sent[i]
    own_feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }
    # Previous-token features, or the beginning-of-sentence marker.
    left_feats = (
        {'-1:word': sent[i - 1], '-1:is_title': sent[i - 1].istitle()}
        if i > 0
        else {'BOS': True}
    )
    # Next-token features, or the end-of-sentence marker.
    right_feats = (
        {'+1:word': sent[i + 1], '+1:is_title': sent[i + 1].istitle()}
        if i < len(sent) - 1
        else {'EOS': True}
    )
    return {**own_feats, **left_feats, **right_feats}

# 3. Prepare token-level data
def prepare_data(sentences, labels):
    """Turn sentence-level data into flat token-level features and labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)``: the ``word2features`` dict of every token, in reading
        order, and the tag stored at the same index of its sentence.
    """
    X, y = [], []
    for sent, sent_tags in zip(sentences, labels):
        n_tokens = len(sent)
        for k in range(n_tokens):
            X.append(word2features(sent, k))
            y.append(sent_tags[k])
    return X, y

# Load data
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/IOB.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Encode labels as integers. This step is optional: VotingClassifier also
# accepts string labels (the IOE cell of this notebook fits it on them).
# The encoder is fitted on every label so that a tag occurring only in the
# test split cannot make `transform(y_test)` raise ValueError.
label_encoder = LabelEncoder()
label_encoder.fit(y_all)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Vectorizer (the vocabulary is learned from the training split only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Base classifiers (both seeded, so a re-run reproduces the same model)
svm = LinearSVC(random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# Voting Classifier (Hard voting)
# NOTE(review): with two voters every disagreement is a 1-1 tie; scikit-learn
# documents that hard-voting ties go to the class that sorts first, so this is
# not a genuine majority vote. A third estimator or weights would be needed.
ensemble = VotingClassifier(estimators=[
    ('svm', svm),
    ('tree', tree)
], voting='hard')

# Train
ensemble.fit(X_train_vec, y_train_enc)

# Predict (and map the integer predictions back to tag strings)
y_pred_enc = ensemble.predict(X_test_vec)
y_pred = label_encoder.inverse_transform(y_pred_enc)

# Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       252
           I       0.90      0.94      0.92       296
           O       1.00      1.00      1.00     11201

    accuracy                           1.00     11749
   macro avg       0.96      0.97      0.97     11749
weighted avg       1.00      1.00      1.00     11749

Evaluation Results:
Accuracy: 0.9952
Precision: 0.9953
Recall: 0.9952
F1 Score: 0.9953


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOE-tagged data
def load_excel_data(file_path):
    """Load an IOE-tagged token table and split it into sentences.

    ``pd.read_csv`` is used when the file name ends in ``.csv`` (any letter
    case), ``pd.read_excel`` otherwise. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept; rows with a missing value in
    either of them are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: parallel lists of lists with the stripped
        words and stripped tags per sentence. ``'.'`` and ``'؟'`` end a
        sentence and are left out of the result together with their tags.
    """
    # Case-insensitive check on the string form: handles Path and '.CSV'.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):
            # Only close a sentence that actually contains tokens.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a final sentence that has no closing boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict with the token itself, three casing/digit flags, and either
        the previous/next token (plus its title-case flag) or a ``'BOS'`` /
        ``'EOS'`` marker when there is no such neighbour.
    """
    current = sent[i]
    feats = {
        'word': current,
        'is_upper': current.isupper(),
        'is_title': current.istitle(),
        'is_digit': current.isdigit(),
    }

    # Left context, or a beginning-of-sentence marker for the first token.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context, or an end-of-sentence marker for the last token.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

# 3. Prepare flat X and y
def prepare_data(sentences, labels):
    """Flatten sentences into per-token feature dicts and their labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)`` where ``X[k]`` is the ``word2features`` dict of the k-th
        token overall and ``y[k]`` is the tag found at the same position.
    """
    feature_rows = []
    targets = []
    for tokens, tag_seq in zip(sentences, labels):
        for position, _ in enumerate(tokens):
            feature_rows.append(word2features(tokens, position))
            targets.append(tag_seq[position])
    return feature_rows, targets

# Load and prepare
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/IOE.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Vectorize features (the vocabulary is learned from the training split only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Define base models (both seeded, so a re-run reproduces the same model)
svm = LinearSVC(random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# Ensemble with hard voting
# 'soft' voting is not an option with these estimators: it needs
# predict_proba, which LinearSVC does not provide.
# NOTE(review): with two voters every disagreement is a 1-1 tie; scikit-learn
# documents that hard-voting ties go to the class that sorts first, so this is
# not a genuine majority vote. A third estimator or weights would be needed.
ensemble = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('tree', tree)
    ],
    voting='hard'
)

# Train ensemble (string tags are passed directly)
ensemble.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = ensemble.predict(X_test_vec)

print("Classification Report (Ensemble SVM + Decision Tree):")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report (Ensemble SVM + Decision Tree):
              precision    recall  f1-score   support

           E       0.88      0.94      0.91       252
           I       0.96      0.97      0.96       293
           O       1.00      1.00      1.00     11204

    accuracy                           0.99     11749
   macro avg       0.95      0.97      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9946
Precision: 0.9948
Recall: 0.9946
F1 Score: 0.9947


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter

# 1. Load IOBES-tagged Excel data
def load_excel_data(file_path):
    """Load an IOBES-tagged token table and split it into sentences.

    ``pd.read_csv`` is used when the file name ends in ``.csv`` (any letter
    case), ``pd.read_excel`` otherwise. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept; rows with a missing value in
    either of them are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: parallel lists of lists with the stripped
        words and stripped tags per sentence. ``'.'`` and ``'؟'`` end a
        sentence and are left out of the result together with their tags.
    """
    # Normalise to a lower-case string: accepts pathlib.Path and '.CSV'.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # Sentence boundary
            # Repeated boundary tokens must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())
    # Keep a final sentence that has no closing boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features per word
def word2features(sent, i):
    """Describe token ``i`` of ``sent`` as a feature dict.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        The token, its upper/title/digit flags, and for each side either the
        neighbouring token with its title flag or a ``'BOS'`` / ``'EOS'``
        marker when that neighbour does not exist.
    """
    token = sent[i]
    own_feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }
    # Previous-token features, or the beginning-of-sentence marker.
    left_feats = (
        {'-1:word': sent[i - 1], '-1:is_title': sent[i - 1].istitle()}
        if i > 0
        else {'BOS': True}
    )
    # Next-token features, or the end-of-sentence marker.
    right_feats = (
        {'+1:word': sent[i + 1], '+1:is_title': sent[i + 1].istitle()}
        if i < len(sent) - 1
        else {'EOS': True}
    )
    return {**own_feats, **left_feats, **right_feats}

# 3. Prepare data for ML
def prepare_data(sentences, labels):
    """Turn sentence-level data into flat token-level features and labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)``: the ``word2features`` dict of every token, in reading
        order, and the tag stored at the same index of its sentence.
    """
    X, y = [], []
    for sent, sent_tags in zip(sentences, labels):
        n_tokens = len(sent)
        for k in range(n_tokens):
            X.append(word2features(sent, k))
            y.append(sent_tags[k])
    return X, y

# 4. Load data
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/IOBES.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Initialize vectorizer (the vocabulary is learned from the training split only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# 6. Train classifiers (both seeded, so a re-run reproduces the same models)
svm_clf = LinearSVC(random_state=42)
svm_clf.fit(X_train_vec, y_train)
svm_preds = svm_clf.predict(X_test_vec)

tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_vec, y_train)
tree_preds = tree_clf.predict(X_test_vec)

# 7. Voting ensemble (hard voting)
# NOTE(review): with only two voters a disagreement is a 1-1 tie, and
# Counter.most_common returns equal counts in first-seen order, so the SVM
# label always wins and `final_preds` is identical to `svm_preds`. A third
# model (or a confidence-based tie-break) is needed for a real ensemble.
final_preds = []
for s_pred, t_pred in zip(svm_preds, tree_preds):
    vote = Counter([s_pred, t_pred])
    final_preds.append(vote.most_common(1)[0][0])  # majority vote

# 8. Evaluation
print("=== Ensemble Classification Report ===")
print(classification_report(y_test, final_preds))
print("Accuracy:", accuracy_score(y_test, final_preds))
print("Precision:", precision_score(y_test, final_preds, average='weighted'))
print("Recall:", recall_score(y_test, final_preds, average='weighted'))
print("F1 Score:", f1_score(y_test, final_preds, average='weighted'))


=== Ensemble Classification Report ===
              precision    recall  f1-score   support

           B       0.99      0.98      0.99       247
           E       0.90      0.92      0.91       248
           I       0.95      0.85      0.90        46
           O       1.00      1.00      1.00     11204
           S       1.00      1.00      1.00         4

    accuracy                           1.00     11749
   macro avg       0.97      0.95      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Accuracy: 0.9952336369052686
Precision: 0.9952621034658378
Recall: 0.9952336369052686
F1 Score: 0.9952334009810228


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# 1. Load Excel Data with IE Tags
def load_excel_data(file_path):
    """Load an IE-tagged token table and split it into sentences.

    ``pd.read_csv`` is used when the file name ends in ``.csv`` (any letter
    case), ``pd.read_excel`` otherwise. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept; rows with a missing value in
    either of them are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: parallel lists of lists with the stripped
        words and stripped tags per sentence. ``'.'`` and ``'؟'`` end a
        sentence and are left out of the result together with their tags.
    """
    # Case-insensitive check on the string form: handles Path and '.CSV'.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # Sentence end
            # Only close a sentence that actually contains tokens.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a final sentence that has no closing boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict with the token itself, three casing/digit flags, and either
        the previous/next token (plus its title-case flag) or a ``'BOS'`` /
        ``'EOS'`` marker when there is no such neighbour.
    """
    current = sent[i]
    feats = {
        'word': current,
        'is_upper': current.isupper(),
        'is_title': current.istitle(),
        'is_digit': current.isdigit(),
    }

    # Left context, or a beginning-of-sentence marker for the first token.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context, or an end-of-sentence marker for the last token.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

# 3. Prepare data
def prepare_data(sentences, labels):
    """Flatten sentences into per-token feature dicts and their labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)`` where ``X[k]`` is the ``word2features`` dict of the k-th
        token overall and ``y[k]`` is the tag found at the same position.
    """
    feature_rows = []
    targets = []
    for tokens, tag_seq in zip(sentences, labels):
        for position, _ in enumerate(tokens):
            feature_rows.append(word2features(tokens, position))
            targets.append(tag_seq[position])
    return feature_rows, targets

# 4. Load and prepare dataset
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/IE.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Vectorize the feature dicts once; both base classifiers share the matrix
#    (the vocabulary is learned from the training split only).
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Base classifiers (both seeded, so a re-run reproduces the same model).
# The per-model pipelines that used to be built here were never fitted or
# used, and shared one vectorizer instance, so they have been removed.
svc = LinearSVC(random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# 6. Voting ensemble (hard voting), fitted directly on the vectorized features
# NOTE(review): with two voters every disagreement is a 1-1 tie; scikit-learn
# documents that hard-voting ties go to the class that sorts first, so this is
# not a genuine majority vote. A third estimator or weights would be needed.
voting_clf = VotingClassifier(estimators=[
    ("svc", svc),
    ("tree", tree)
], voting='hard')

voting_clf.fit(X_train_vec, y_train)

# 7. Predict and evaluate
y_pred = voting_clf.predict(X_test_vec)

print("Classification Report (Ensemble):")
print(classification_report(y_test, y_pred))

print("Evaluation Results (Ensemble):")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report (Ensemble):
              precision    recall  f1-score   support

           E       0.88      0.94      0.91       252
          EO       0.94      0.95      0.95       264
           I       0.96      0.97      0.96       293
          IO       1.00      1.00      1.00     10940

    accuracy                           0.99     11749
   macro avg       0.95      0.96      0.95     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results (Ensemble):
Accuracy: 0.9923
Precision: 0.9924
Recall: 0.9923
F1 Score: 0.9923


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from collections import Counter

# NOTE: this cell does not define load_excel_data, word2features or
# prepare_data itself; it relies on the definitions from an earlier cell, so
# one of those cells must be run first.

# Load BI-tagged data
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/BI.xlsx")
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Train SVM pipeline (seeded, so a re-run reproduces the same model)
vec = DictVectorizer(sparse=True)
svm_clf = LinearSVC(random_state=42)
svm_pipeline = make_pipeline(vec, svm_clf)
svm_pipeline.fit(X_train, y_train)

# Train Decision Tree pipeline
# Each pipeline gets its own vectorizer: sharing one instance means fitting
# the second pipeline silently refits a step of the first one.
vec_dt = DictVectorizer(sparse=True)
dt_clf = DecisionTreeClassifier(random_state=42)
dt_pipeline = make_pipeline(vec_dt, dt_clf)
dt_pipeline.fit(X_train, y_train)

# Get predictions from both models
svm_preds = svm_pipeline.predict(X_test)
dt_preds = dt_pipeline.predict(X_test)

# Ensemble by hard voting (majority vote)
# NOTE(review): with only two voters a disagreement is a 1-1 tie, and
# Counter.most_common returns equal counts in first-seen order, so the SVM
# label always wins and `ensemble_preds` is identical to `svm_preds`. A third
# model (or a confidence-based tie-break) is needed for a real ensemble.
ensemble_preds = []
for p1, p2 in zip(svm_preds, dt_preds):
    votes = [p1, p2]
    most_common_label = Counter(votes).most_common(1)[0][0]
    ensemble_preds.append(most_common_label)

# Evaluation
print("Classification Report for Ensemble:")
print(classification_report(y_test, ensemble_preds))

print("Ensemble Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, ensemble_preds):.4f}")
print(f"Precision: {precision_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, ensemble_preds, average='weighted'):.4f}")


Classification Report for Ensemble:
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       251
          BO       0.76      0.80      0.78       197
           I       0.92      0.93      0.93       294
          IO       0.99      0.99      0.99     11007

    accuracy                           0.99     11749
   macro avg       0.91      0.92      0.92     11749
weighted avg       0.99      0.99      0.99     11749

Ensemble Evaluation Results:
Accuracy: 0.9878
Precision: 0.9880
Recall: 0.9878
F1 Score: 0.9879


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel with BIES Tags
def load_excel_data(file_path):
    """Load a BIES-tagged token table and split it into sentences.

    ``pd.read_csv`` is used when the file name ends in ``.csv`` (any letter
    case), ``pd.read_excel`` otherwise. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are kept; rows with a missing value in
    either of them are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: parallel lists of lists with the stripped
        words and stripped tags per sentence. ``'.'`` and ``'؟'`` end a
        sentence and are left out of the result together with their tags.
    """
    # Normalise to a lower-case string: accepts pathlib.Path and '.CSV'.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # Sentence end
            # Repeated boundary tokens must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Keep a final sentence that has no closing boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract Features for Each Token
def word2features(sent, i):
    """Describe token ``i`` of ``sent`` as a feature dict.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        The token, its upper/title/digit flags, and for each side either the
        neighbouring token with its title flag or a ``'BOS'`` / ``'EOS'``
        marker when that neighbour does not exist.
    """
    token = sent[i]
    own_feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }
    # Previous-token features, or the beginning-of-sentence marker.
    left_feats = (
        {'-1:word': sent[i - 1], '-1:is_title': sent[i - 1].istitle()}
        if i > 0
        else {'BOS': True}
    )
    # Next-token features, or the end-of-sentence marker.
    right_feats = (
        {'+1:word': sent[i + 1], '+1:is_title': sent[i + 1].istitle()}
        if i < len(sent) - 1
        else {'EOS': True}
    )
    return {**own_feats, **left_feats, **right_feats}

# 3. Format Features and Labels
def prepare_data(sentences, labels):
    """Turn sentence-level data into flat token-level features and labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)``: the ``word2features`` dict of every token, in reading
        order, and the tag stored at the same index of its sentence.
    """
    X, y = [], []
    for sent, sent_tags in zip(sentences, labels):
        n_tokens = len(sent)
        for k in range(n_tokens):
            X.append(word2features(sent, k))
            y.append(sent_tags[k])
    return X, y

# 4. Majority vote for string labels
def majority_vote(preds_array):
    """Pick the most frequent label per sample across several models.

    Args:
        preds_array: numpy array of shape (n_models, n_samples) holding the
            label each model predicted for each sample.

    Returns:
        numpy array of shape (n_samples,) with the winning label per sample.
        When several labels share the highest count, the one that sorts
        first wins (``np.unique`` returns sorted labels and ``argmax`` takes
        the first maximum).
    """
    def _winner(column):
        # Distinct labels of one sample together with how often each occurs.
        values, freq = np.unique(column, return_counts=True)
        return values[freq.argmax()]

    n_samples = preds_array.shape[1]
    return np.array([_winner(preds_array[:, j]) for j in range(n_samples)])

# 5. Load data
sentences, tags = load_excel_data("/content/BIES.xlsx")  # update to your file path

# 6. Prepare data
X_all, y_all = prepare_data(sentences, tags)

# 7. Train-test split
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 8. Create pipelines
# Each pipeline gets its own vectorizer: sharing one instance means fitting
# the second pipeline silently refits a step of the first one.
vec = DictVectorizer(sparse=True)

svc_clf = LinearSVC(random_state=42)  # seeded, so a re-run reproduces the same model
svc_pipeline = make_pipeline(vec, svc_clf)
svc_pipeline.fit(X_train, y_train)
svc_preds = svc_pipeline.predict(X_test)

vec_dt = DictVectorizer(sparse=True)
dt_clf = DecisionTreeClassifier(random_state=42)
dt_pipeline = make_pipeline(vec_dt, dt_clf)
dt_pipeline.fit(X_train, y_train)
dt_preds = dt_pipeline.predict(X_test)

# 9. Ensemble predictions via majority vote
# NOTE(review): with only two voters a disagreement is a 1-1 tie, which
# majority_vote resolves in favour of the label that sorts first, so this is
# not a genuine majority vote. A third model would be needed for that.
preds = np.vstack([svc_preds, dt_preds])
ensemble_preds = majority_vote(preds)

# 10. Evaluation
print("Classification Report (LinearSVC):")
print(classification_report(y_test, svc_preds))
print(f"Accuracy: {accuracy_score(y_test, svc_preds):.4f}")
print(f"Precision: {precision_score(y_test, svc_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, svc_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, svc_preds, average='weighted'):.4f}")

print("\nClassification Report (Decision Tree):")
print(classification_report(y_test, dt_preds))
print(f"Accuracy: {accuracy_score(y_test, dt_preds):.4f}")
print(f"Precision: {precision_score(y_test, dt_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, dt_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, dt_preds, average='weighted'):.4f}")

print("\nClassification Report (Ensemble):")
print(classification_report(y_test, ensemble_preds))
print(f"Accuracy: {accuracy_score(y_test, ensemble_preds):.4f}")
print(f"Precision: {precision_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, ensemble_preds, average='weighted'):.4f}")


Classification Report (LinearSVC):
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       247
          BO       0.77      0.79      0.78       190
           E       0.89      0.92      0.91       248
          EO       0.96      0.93      0.94       257
           I       0.95      0.85      0.90        46
          IO       0.99      0.99      0.99     10750
           S       1.00      1.00      1.00         4
          SO       0.67      0.57      0.62         7

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Accuracy: 0.9851
Precision: 0.9852
Recall: 0.9851
F1 Score: 0.9851

Classification Report (Decision Tree):
              precision    recall  f1-score   support

           B       0.99      0.98      0.98       247
          BO       0.66      0.66      0.66       190
           E       0.88      0.92      0.90 

In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from collections import Counter
import numpy as np

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file is read with ``pd.read_csv`` when its name ends in ``.csv``
    (compared case-insensitively) and with ``pd.read_excel`` otherwise. Only
    the ``'Word i'`` and ``'Word i entity tag'`` columns are used, and rows
    with a missing value in either column are dropped.

    Args:
        file_path: Location of the file, as a ``str`` or a path-like object.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and the stripped tags of each sentence. The boundary
        tokens ``'.'`` and ``'؟'`` close the current sentence and are not
        emitted themselves (their tags are discarded as well).
    """
    # str() + lower() so that pathlib.Path inputs and '.CSV' names are routed
    # to read_csv instead of failing or falling through to read_excel.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # Sentence end
            # Consecutive boundary tokens must not create empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a boundary token.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict with the token itself, three casing/digit flags, and either
        the previous/next token (plus its title-case flag) or a ``'BOS'`` /
        ``'EOS'`` marker when there is no such neighbour.
    """
    current = sent[i]
    feats = {
        'word': current,
        'is_upper': current.isupper(),
        'is_title': current.istitle(),
        'is_digit': current.isdigit(),
    }

    # Left context, or a beginning-of-sentence marker for the first token.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word'] = left
        feats['-1:is_title'] = left.istitle()

    # Right context, or an end-of-sentence marker for the last token.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word'] = right
        feats['+1:is_title'] = right.istitle()

    return feats

# 3. Prepare dataset
def prepare_data(sentences, labels):
    """Flatten sentences into per-token feature dicts and their labels.

    Args:
        sentences: Iterable of token lists.
        labels: Iterable of tag lists, paired with ``sentences`` by position.

    Returns:
        ``(X, y)`` where ``X[k]`` is the ``word2features`` dict of the k-th
        token overall and ``y[k]`` is the tag found at the same position.
    """
    feature_rows = []
    targets = []
    for tokens, tag_seq in zip(sentences, labels):
        for position, _ in enumerate(tokens):
            feature_rows.append(word2features(tokens, position))
            targets.append(tag_seq[position])
    return feature_rows, targets

# 4. Load and split data
# NOTE(review): the split is made over individual tokens, not sentences, so
# tokens of one sentence (whose features include the neighbouring words) can
# land on both sides of the split. Consider splitting `sentences` instead.
sentences, tags = load_excel_data("/content/BIES.xlsx")  # <-- Update to your file path
X_all, y_all = prepare_data(sentences, tags)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Train LinearSVC model (seeded, so a re-run reproduces the same model)
vec_svm = DictVectorizer(sparse=True)
clf_svm = LinearSVC(random_state=42)
pipeline_svm = make_pipeline(vec_svm, clf_svm)
pipeline_svm.fit(X_train, y_train)
y_pred_svm = pipeline_svm.predict(X_test)

# 6. Train Decision Tree model (with its own vectorizer instance)
vec_dt = DictVectorizer(sparse=True)
clf_dt = DecisionTreeClassifier(random_state=42)
pipeline_dt = make_pipeline(vec_dt, clf_dt)
pipeline_dt.fit(X_train, y_train)
y_pred_dt = pipeline_dt.predict(X_test)

# 7. Ensemble Majority Voting
# NOTE(review): with only two voters a disagreement is a 1-1 tie, and
# Counter.most_common returns equal counts in first-seen order, so the SVM
# label always wins and `ensemble_preds` equals `y_pred_svm`. A third model
# (or a confidence-based tie-break) is needed for a real ensemble.
preds = np.array([y_pred_svm, y_pred_dt])  # shape: (2, n_samples)
preds_T = preds.T  # shape: (n_samples, 2)

ensemble_preds = []
for sample_preds in preds_T:
    most_common = Counter(sample_preds).most_common(1)[0][0]
    ensemble_preds.append(most_common)

ensemble_preds = np.array(ensemble_preds)

# 8. Evaluation Reports

print("Classification Report (LinearSVC):")
print(classification_report(y_test, y_pred_svm))

print("Classification Report (Decision Tree):")
print(classification_report(y_test, y_pred_dt))

print("Classification Report (Ensemble):")
print(classification_report(y_test, ensemble_preds))

print("Ensemble Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, ensemble_preds):.4f}")
print(f"Precision: {precision_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, ensemble_preds, average='weighted'):.4f}")


Classification Report (LinearSVC):
              precision    recall  f1-score   support

           B       0.98      0.98      0.98       247
          BO       0.77      0.79      0.78       190
           E       0.89      0.92      0.91       248
          EO       0.96      0.93      0.94       257
           I       0.95      0.85      0.90        46
          IO       0.99      0.99      0.99     10750
           S       1.00      1.00      1.00         4
          SO       0.67      0.57      0.62         7

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Classification Report (Decision Tree):
              precision    recall  f1-score   support

           B       0.99      0.98      0.98       247
          BO       0.66      0.66      0.66       190
           E       0.88      0.92      0.90       248
          EO       0.95      0.94      0.94       257
    

## SECOND

In [None]:
# Ensemble of SVM and Decision Tree using VotingClassifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
import numpy as np

# 1. Load Excel Data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (its tag). Rows with a missing value in either
    column are dropped, and both values are converted to ``str`` and stripped.

    Args:
        file_path: Path (``str`` or path-like) to a CSV file (extension
            ``.csv``, matched case-insensitively) or, for any other
            extension, a file readable by ``pandas.read_excel``.
        sentence_delimiters: Collection of tokens that close the current
            sentence. A delimiter token and its tag are discarded, not kept
            in the output. Defaults to the full stop and the Arabic
            question mark.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(sentences, labels)``,
        two parallel lists with one inner list per non-empty sentence.
    """
    path = str(file_path)
    # Case-insensitive check so that e.g. "data.CSV" is not handed to read_excel.
    reader = pd.read_csv if path.lower().endswith('.csv') else pd.read_excel
    df = reader(path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_delimiters:  # End of sentence
            # Consecutive delimiters must not emit empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Feature extraction per token
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict holds the token itself and three of its ``str`` shape tests,
    plus the neighbouring token and its title-case flag on each side. A
    missing left neighbour is marked with ``'BOS'``, a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (``str`` or ``bool``).
    """
    token = sent[i]
    feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word'] = prev_token
        feats['-1:is_title'] = prev_token.istitle()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word'] = next_token
        feats['+1:is_title'] = next_token.istitle()

    return feats

# 3. Prepare dataset
def prepare_data(sentences, labels):
    """Flatten sentence-level tokens and tags into per-token rows.

    Args:
        sentences: List of sentences, each a list of token strings.
        labels: List of tag sequences parallel to ``sentences``.

    Returns:
        tuple[list[dict], list[str]]: ``(X, y)`` where ``X[k]`` is the
        ``word2features`` dict of the k-th token overall and ``y[k]`` its tag.
    """
    X, y = [], []
    for tokens, tag_seq in zip(sentences, labels):
        for idx, _ in enumerate(tokens):
            X.append(word2features(tokens, idx))
            y.append(tag_seq[idx])
    return X, y

# 4. Load and split data
sentences, tags = load_excel_data("/content/IO2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Vectorize features
# The vectorizer is fitted on the training rows only and reused for the test rows.
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# 6. Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
# y_test_enc is not used below; evaluation compares the decoded string labels.
y_test_enc = le.transform(y_test)

# 7. Define base classifiers
svm = LinearSVC(random_state=42)
dt = DecisionTreeClassifier(random_state=42)

# 8. Ensemble via hard voting
# NOTE(review): with two estimators every disagreement is a 1-1 tie;
# scikit-learn documents that VotingClassifier resolves ties by the ascending
# sort order of the class labels.
ensemble = VotingClassifier(estimators=[
    ('svm', svm),
    ('dt', dt)
], voting='hard')

# 9. Fit and predict
ensemble.fit(X_train_vec, y_train_enc)
y_pred_enc = ensemble.predict(X_test_vec)
# Map the integer predictions back to the original tag strings.
y_pred = le.inverse_transform(y_pred_enc)

# 10. Evaluate
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Precision/recall/F1 are support-weighted averages over the classes.
print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           I       0.93      0.96      0.94       591
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.96      0.98      0.97     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9941
Precision: 0.9943
Recall: 0.9941
F1 Score: 0.9942


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder

# 1. Load Excel Data with IOB Tags
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (its tag). Rows with a missing value in either
    column are dropped, and both values are converted to ``str`` and stripped.

    Args:
        file_path: Path (``str`` or path-like) to a CSV file (extension
            ``.csv``, matched case-insensitively) or, for any other
            extension, a file readable by ``pandas.read_excel``.
        sentence_delimiters: Collection of tokens that close the current
            sentence. A delimiter token and its tag are discarded, not kept
            in the output. Defaults to the full stop and the Arabic
            question mark.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(sentences, labels)``,
        two parallel lists with one inner list per non-empty sentence.
    """
    path = str(file_path)
    # Case-insensitive check so that e.g. "data.CSV" is not handed to read_excel.
    reader = pd.read_csv if path.lower().endswith('.csv') else pd.read_excel
    df = reader(path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_delimiters:  # End of sentence
            # Consecutive delimiters must not emit empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features per token
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict holds the token itself and three of its ``str`` shape tests,
    plus the neighbouring token and its title-case flag on each side. A
    missing left neighbour is marked with ``'BOS'``, a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (``str`` or ``bool``).
    """
    token = sent[i]
    feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word'] = prev_token
        feats['-1:is_title'] = prev_token.istitle()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word'] = next_token
        feats['+1:is_title'] = next_token.istitle()

    return feats

# 3. Prepare token-level data
def prepare_data(sentences, labels):
    """Flatten sentence-level tokens and tags into per-token rows.

    Args:
        sentences: List of sentences, each a list of token strings.
        labels: List of tag sequences parallel to ``sentences``.

    Returns:
        tuple[list[dict], list[str]]: ``(X, y)`` where ``X[k]`` is the
        ``word2features`` dict of the k-th token overall and ``y[k]`` its tag.
    """
    X, y = [], []
    for tokens, tag_seq in zip(sentences, labels):
        for idx, _ in enumerate(tokens):
            X.append(word2features(tokens, idx))
            y.append(tag_seq[idx])
    return X, y

# Load data
sentences, tags = load_excel_data("/content/IOB2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Encode the string tags as integers for training; predictions are decoded
# with the same encoder below.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
# y_test_enc is not used below; evaluation compares the decoded string labels.
y_test_enc = label_encoder.transform(y_test)

# Vectorizer (fitted on the training rows only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Base classifiers -- both seeded so that a re-run reproduces the same model
svm = LinearSVC(random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# Voting Classifier (Hard voting)
# NOTE(review): with two estimators every disagreement is a 1-1 tie;
# scikit-learn documents that VotingClassifier resolves ties by the ascending
# sort order of the class labels.
ensemble = VotingClassifier(estimators=[
    ('svm', svm),
    ('tree', tree)
], voting='hard')

# Train
ensemble.fit(X_train_vec, y_train_enc)

# Predict
y_pred_enc = ensemble.predict(X_test_vec)
y_pred = label_encoder.inverse_transform(y_pred_enc)

# Evaluation (precision/recall/F1 are support-weighted averages over the classes)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       266
           I       0.89      0.94      0.91       325
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.96      0.97      0.97     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9945
Precision: 0.9946
Recall: 0.9945
F1 Score: 0.9945


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOE-tagged data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (its tag). Rows with a missing value in either
    column are dropped, and both values are converted to ``str`` and stripped.

    Args:
        file_path: Path (``str`` or path-like) to a CSV file (extension
            ``.csv``, matched case-insensitively) or, for any other
            extension, a file readable by ``pandas.read_excel``.
        sentence_delimiters: Collection of tokens that close the current
            sentence. A delimiter token and its tag are discarded, not kept
            in the output. Defaults to the full stop and the Arabic
            question mark.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(sentences, labels)``,
        two parallel lists with one inner list per non-empty sentence.
    """
    path = str(file_path)
    # Case-insensitive check so that e.g. "data.CSV" is not handed to read_excel.
    reader = pd.read_csv if path.lower().endswith('.csv') else pd.read_excel
    df = reader(path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_delimiters:
            # Consecutive delimiters must not emit empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict holds the token itself and three of its ``str`` shape tests,
    plus the neighbouring token and its title-case flag on each side. A
    missing left neighbour is marked with ``'BOS'``, a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (``str`` or ``bool``).
    """
    token = sent[i]
    feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word'] = prev_token
        feats['-1:is_title'] = prev_token.istitle()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word'] = next_token
        feats['+1:is_title'] = next_token.istitle()

    return feats

# 3. Prepare flat X and y
def prepare_data(sentences, labels):
    """Flatten sentence-level tokens and tags into per-token rows.

    Args:
        sentences: List of sentences, each a list of token strings.
        labels: List of tag sequences parallel to ``sentences``.

    Returns:
        tuple[list[dict], list[str]]: ``(X, y)`` where ``X[k]`` is the
        ``word2features`` dict of the k-th token overall and ``y[k]`` its tag.
    """
    X, y = [], []
    for tokens, tag_seq in zip(sentences, labels):
        for idx, _ in enumerate(tokens):
            X.append(word2features(tokens, idx))
            y.append(tag_seq[idx])
    return X, y

# Load and prepare
sentences, tags = load_excel_data("/content/IOE2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Vectorize features (fitted on the training rows only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Define base models -- both seeded so that a re-run reproduces the same model
svm = LinearSVC(random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# Ensemble with hard voting
# NOTE(review): with two estimators every disagreement is a 1-1 tie;
# scikit-learn documents that VotingClassifier resolves ties by the ascending
# sort order of the class labels.
ensemble = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('tree', tree)
    ],
    voting='hard'  # 'soft' is not an option here: LinearSVC has no predict_proba
)

# Train ensemble
ensemble.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = ensemble.predict(X_test_vec)

print("Classification Report (Ensemble SVM + Decision Tree):")
print(classification_report(y_test, y_pred))

# Precision/recall/F1 are support-weighted averages over the classes.
print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report (Ensemble SVM + Decision Tree):
              precision    recall  f1-score   support

           E       0.89      0.92      0.91       275
           I       0.96      0.98      0.97       316
           O       1.00      1.00      1.00     11158

    accuracy                           0.99     11749
   macro avg       0.95      0.97      0.96     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results:
Accuracy: 0.9943
Precision: 0.9944
Recall: 0.9943
F1 Score: 0.9943


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter

# 1. Load IOBES-tagged Excel data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (its tag). Rows with a missing value in either
    column are dropped, and both values are converted to ``str`` and stripped.

    Args:
        file_path: Path (``str`` or path-like) to a CSV file (extension
            ``.csv``, matched case-insensitively) or, for any other
            extension, a file readable by ``pandas.read_excel``.
        sentence_delimiters: Collection of tokens that close the current
            sentence. A delimiter token and its tag are discarded, not kept
            in the output. Defaults to the full stop and the Arabic
            question mark.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(sentences, labels)``,
        two parallel lists with one inner list per non-empty sentence.
    """
    path = str(file_path)
    # Case-insensitive check so that e.g. "data.CSV" is not handed to read_excel.
    reader = pd.read_csv if path.lower().endswith('.csv') else pd.read_excel
    df = reader(path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_delimiters:  # Sentence boundary
            # Consecutive delimiters must not emit empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())
    # Flush a trailing sentence that was not closed by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract features per word
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict holds the token itself and three of its ``str`` shape tests,
    plus the neighbouring token and its title-case flag on each side. A
    missing left neighbour is marked with ``'BOS'``, a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (``str`` or ``bool``).
    """
    token = sent[i]
    feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word'] = prev_token
        feats['-1:is_title'] = prev_token.istitle()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word'] = next_token
        feats['+1:is_title'] = next_token.istitle()

    return feats

# 3. Prepare data for ML
def prepare_data(sentences, labels):
    """Flatten sentence-level tokens and tags into per-token rows.

    Args:
        sentences: List of sentences, each a list of token strings.
        labels: List of tag sequences parallel to ``sentences``.

    Returns:
        tuple[list[dict], list[str]]: ``(X, y)`` where ``X[k]`` is the
        ``word2features`` dict of the k-th token overall and ``y[k]`` its tag.
    """
    X, y = [], []
    for tokens, tag_seq in zip(sentences, labels):
        for idx, _ in enumerate(tokens):
            X.append(word2features(tokens, idx))
            y.append(tag_seq[idx])
    return X, y

# 4. Load data
sentences, tags = load_excel_data("/content/IOBES2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Initialize vectorizer (fitted on the training rows only)
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# 6. Train classifiers -- both seeded so that a re-run reproduces the same model
svm_clf = LinearSVC(random_state=42)
svm_clf.fit(X_train_vec, y_train)
svm_preds = svm_clf.predict(X_test_vec)

tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_vec, y_train)
tree_preds = tree_clf.predict(X_test_vec)

# 7. Voting ensemble (hard voting)
# NOTE(review): with only two voters every disagreement is a 1-1 tie, and
# Counter.most_common() orders tied elements by first occurrence, so the SVM
# label (listed first) is returned whenever the two models differ.
final_preds = []
for s_pred, t_pred in zip(svm_preds, tree_preds):
    vote = Counter([s_pred, t_pred])
    final_preds.append(vote.most_common(1)[0][0])  # majority vote

# 8. Evaluation (precision/recall/F1 are support-weighted averages over the classes)
print("=== Ensemble Classification Report ===")
print(classification_report(y_test, final_preds))
print("Accuracy:", accuracy_score(y_test, final_preds))
print("Precision:", precision_score(y_test, final_preds, average='weighted'))
print("Recall:", recall_score(y_test, final_preds, average='weighted'))
print("F1 Score:", f1_score(y_test, final_preds, average='weighted'))


=== Ensemble Classification Report ===
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       262
           E       0.93      0.92      0.92       271
           I       0.86      0.93      0.89        54
           O       1.00      1.00      1.00     11158
           S       1.00      1.00      1.00         4

    accuracy                           1.00     11749
   macro avg       0.96      0.97      0.96     11749
weighted avg       1.00      1.00      1.00     11749

Accuracy: 0.995148523278577
Precision: 0.995171255421157
Recall: 0.995148523278577
F1 Score: 0.9951534727964327


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# 1. Load Excel Data with IE Tags
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (its tag). Rows with a missing value in either
    column are dropped, and both values are converted to ``str`` and stripped.

    Args:
        file_path: Path (``str`` or path-like) to a CSV file (extension
            ``.csv``, matched case-insensitively) or, for any other
            extension, a file readable by ``pandas.read_excel``.
        sentence_delimiters: Collection of tokens that close the current
            sentence. A delimiter token and its tag are discarded, not kept
            in the output. Defaults to the full stop and the Arabic
            question mark.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(sentences, labels)``,
        two parallel lists with one inner list per non-empty sentence.
    """
    path = str(file_path)
    # Case-insensitive check so that e.g. "data.CSV" is not handed to read_excel.
    reader = pd.read_csv if path.lower().endswith('.csv') else pd.read_excel
    df = reader(path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_delimiters:  # Sentence end
            # Consecutive delimiters must not emit empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract token-level features
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict holds the token itself and three of its ``str`` shape tests,
    plus the neighbouring token and its title-case flag on each side. A
    missing left neighbour is marked with ``'BOS'``, a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (``str`` or ``bool``).
    """
    token = sent[i]
    feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word'] = prev_token
        feats['-1:is_title'] = prev_token.istitle()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word'] = next_token
        feats['+1:is_title'] = next_token.istitle()

    return feats

# 3. Prepare data
def prepare_data(sentences, labels):
    """Flatten sentence-level tokens and tags into per-token rows.

    Args:
        sentences: List of sentences, each a list of token strings.
        labels: List of tag sequences parallel to ``sentences``.

    Returns:
        tuple[list[dict], list[str]]: ``(X, y)`` where ``X[k]`` is the
        ``word2features`` dict of the k-th token overall and ``y[k]`` its tag.
    """
    X, y = [], []
    for tokens, tag_seq in zip(sentences, labels):
        for idx, _ in enumerate(tokens):
            X.append(word2features(tokens, idx))
            y.append(tag_seq[idx])
    return X, y

# 4. Load and prepare dataset
sentences, tags = load_excel_data("/content/IE2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 5. Vectorize the feature dicts once (fitted on the training rows only);
#    both base classifiers are trained on the same matrix.
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# 6. Base classifiers -- both seeded so that a re-run reproduces the same model
svc = LinearSVC(random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# Voting ensemble (hard voting), fitted directly on the vectorized features.
# NOTE(review): with two estimators every disagreement is a 1-1 tie;
# scikit-learn documents that VotingClassifier resolves ties by the ascending
# sort order of the class labels.
voting_clf = VotingClassifier(estimators=[
    ("svc", svc),
    ("tree", tree)
], voting='hard')

voting_clf.fit(X_train_vec, y_train)

# 7. Predict and evaluate
y_pred = voting_clf.predict(X_test_vec)

print("Classification Report (Ensemble):")
print(classification_report(y_test, y_pred))

# Precision/recall/F1 are support-weighted averages over the classes.
print("Evaluation Results (Ensemble):")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")


Classification Report (Ensemble):
              precision    recall  f1-score   support

           E       0.89      0.92      0.91       275
          EO       0.94      0.95      0.94       274
           I       0.96      0.99      0.97       316
          IO       1.00      0.99      1.00     10884

    accuracy                           0.99     11749
   macro avg       0.95      0.96      0.95     11749
weighted avg       0.99      0.99      0.99     11749

Evaluation Results (Ensemble):
Accuracy: 0.9917
Precision: 0.9918
Recall: 0.9917
F1 Score: 0.9917


In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from collections import Counter

# This cell does not define load_excel_data / prepare_data itself: it relies on
# the definitions made by an earlier cell, so one of those cells must be run first.

# Load BI-tagged data
sentences, tags = load_excel_data("/content/BI2.xlsx")
X_all, y_all = prepare_data(sentences, tags)
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Train SVM pipeline (seeded so that a re-run reproduces the same model)
vec = DictVectorizer(sparse=True)
svm_clf = LinearSVC(random_state=42)
svm_pipeline = make_pipeline(vec, svm_clf)
svm_pipeline.fit(X_train, y_train)

# Train Decision Tree pipeline.
# It gets its own DictVectorizer: a pipeline fits its steps in place, so sharing
# `vec` would refit the vectorizer that the already-trained SVM pipeline uses.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_pipeline = make_pipeline(DictVectorizer(sparse=True), dt_clf)
dt_pipeline.fit(X_train, y_train)

# Get predictions from both models
svm_preds = svm_pipeline.predict(X_test)
dt_preds = dt_pipeline.predict(X_test)

# Ensemble by hard voting (majority vote)
# NOTE(review): with only two voters every disagreement is a 1-1 tie, and
# Counter.most_common() orders tied elements by first occurrence, so the SVM
# label (listed first) is returned whenever the two models differ.
ensemble_preds = []
for p1, p2 in zip(svm_preds, dt_preds):
    votes = [p1, p2]
    most_common_label = Counter(votes).most_common(1)[0][0]
    ensemble_preds.append(most_common_label)

# Evaluation (precision/recall/F1 are support-weighted averages over the classes)
print("Classification Report for Ensemble:")
print(classification_report(y_test, ensemble_preds))

print("Ensemble Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_test, ensemble_preds):.4f}")
print(f"Precision: {precision_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, ensemble_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, ensemble_preds, average='weighted'):.4f}")


Classification Report for Ensemble:
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       266
          BO       0.77      0.82      0.79       210
           I       0.93      0.92      0.93       325
          IO       0.99      0.99      0.99     10948

    accuracy                           0.99     11749
   macro avg       0.92      0.93      0.93     11749
weighted avg       0.99      0.99      0.99     11749

Ensemble Evaluation Results:
Accuracy: 0.9879
Precision: 0.9881
Recall: 0.9879
F1 Score: 0.9880


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# 1. Load Excel with BIES Tags
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a token/tag table and group its rows into sentences.

    The file must contain the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (its tag). Rows with a missing value in either
    column are dropped, and both values are converted to ``str`` and stripped.

    Args:
        file_path: Path (``str`` or path-like) to a CSV file (extension
            ``.csv``, matched case-insensitively) or, for any other
            extension, a file readable by ``pandas.read_excel``.
        sentence_delimiters: Collection of tokens that close the current
            sentence. A delimiter token and its tag are discarded, not kept
            in the output. Defaults to the full stop and the Arabic
            question mark.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(sentences, labels)``,
        two parallel lists with one inner list per non-empty sentence.
    """
    path = str(file_path)
    # Case-insensitive check so that e.g. "data.CSV" is not handed to read_excel.
    reader = pd.read_csv if path.lower().endswith('.csv') else pd.read_excel
    df = reader(path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_delimiters:  # Sentence end
            # Consecutive delimiters must not emit empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that was not closed by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Extract Features for Each Token
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict holds the token itself and three of its ``str`` shape tests,
    plus the neighbouring token and its title-case flag on each side. A
    missing left neighbour is marked with ``'BOS'``, a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (``str`` or ``bool``).
    """
    token = sent[i]
    feats = {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word'] = prev_token
        feats['-1:is_title'] = prev_token.istitle()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word'] = next_token
        feats['+1:is_title'] = next_token.istitle()

    return feats

# 3. Format Features and Labels
def prepare_data(sentences, labels):
    """Flatten sentence-level tokens and tags into per-token rows.

    Args:
        sentences: List of sentences, each a list of token strings.
        labels: List of tag sequences parallel to ``sentences``.

    Returns:
        tuple[list[dict], list[str]]: ``(X, y)`` where ``X[k]`` is the
        ``word2features`` dict of the k-th token overall and ``y[k]`` its tag.
    """
    X, y = [], []
    for tokens, tag_seq in zip(sentences, labels):
        for idx, _ in enumerate(tokens):
            X.append(word2features(tokens, idx))
            y.append(tag_seq[idx])
    return X, y

# 4. Majority vote for string labels
def majority_vote(preds_array):
    """Combine per-model label predictions by per-sample plurality vote.

    Args:
        preds_array: Array-like of shape (n_models, n_samples) holding one
            row of predicted labels per model. A 1-D input is treated as the
            predictions of a single model.

    Returns:
        numpy.ndarray of shape (n_samples,): the most frequent label of each
        column.

    Notes:
        Ties are resolved in favour of the label that sorts first, because
        ``np.unique`` returns sorted labels and ``np.argmax`` picks the first
        maximum. With exactly two models every disagreement is such a tie, so
        the result then depends on label order, not on which model voted.
    """
    # Accept nested lists and promote a single model's 1-D predictions to one row.
    preds_array = np.atleast_2d(np.asarray(preds_array))
    n_samples = preds_array.shape[1]
    if n_samples == 0:
        # Keep the label dtype instead of returning a default float array.
        return np.empty(0, dtype=preds_array.dtype)

    ensemble_preds = []
    for i in range(n_samples):
        labels, counts = np.unique(preds_array[:, i], return_counts=True)
        ensemble_preds.append(labels[np.argmax(counts)])
    return np.array(ensemble_preds)

# 5. Load data
sentences, tags = load_excel_data("/content/BIES2.xlsx")  # update to your file path

# 6. Prepare data
X_all, y_all = prepare_data(sentences, tags)

# 7. Train-test split
# NOTE(review): the split is made over individual token rows, so tokens of the
# same sentence (whose features include neighbouring words) can land on both
# the train and the test side.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# 8. Create pipelines
vec = DictVectorizer(sparse=True)

# Seeded so that a re-run reproduces the same model.
svc_clf = LinearSVC(random_state=42)
svc_pipeline = make_pipeline(vec, svc_clf)
svc_pipeline.fit(X_train, y_train)
svc_preds = svc_pipeline.predict(X_test)

# The tree pipeline gets its own DictVectorizer: a pipeline fits its steps in
# place, so sharing `vec` would refit the vectorizer of the trained SVM pipeline.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_pipeline = make_pipeline(DictVectorizer(sparse=True), dt_clf)
dt_pipeline.fit(X_train, y_train)
dt_preds = dt_pipeline.predict(X_test)

# 9. Ensemble predictions via majority vote
# NOTE(review): with only two models every disagreement is a tie, which
# majority_vote resolves in favour of the label that sorts first.
preds = np.vstack([svc_preds, dt_preds])
ensemble_preds = majority_vote(preds)

# 10. Evaluation: the same report for each base model and for the ensemble
#     (precision/recall/F1 are support-weighted averages over the classes).
for report_header, model_preds in (
    ("Classification Report (LinearSVC):", svc_preds),
    ("\nClassification Report (Decision Tree):", dt_preds),
    ("\nClassification Report (Ensemble):", ensemble_preds),
):
    print(report_header)
    print(classification_report(y_test, model_preds))
    print(f"Accuracy: {accuracy_score(y_test, model_preds):.4f}")
    print(f"Precision: {precision_score(y_test, model_preds, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, model_preds, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_test, model_preds, average='weighted'):.4f}")


Classification Report (LinearSVC):
              precision    recall  f1-score   support

           B       0.99      0.99      0.99       262
          BO       0.78      0.82      0.80       202
           E       0.93      0.92      0.92       271
          EO       0.96      0.93      0.94       266
           I       0.86      0.93      0.89        54
          IO       0.99      0.99      0.99     10682
           S       1.00      1.00      1.00         4
          SO       0.67      0.50      0.57         8

    accuracy                           0.99     11749
   macro avg       0.90      0.88      0.89     11749
weighted avg       0.99      0.99      0.99     11749

Accuracy: 0.9854
Precision: 0.9856
Recall: 0.9854
F1 Score: 0.9855

Classification Report (Decision Tree):
              precision    recall  f1-score   support

           B       0.99      0.98      0.99       262
          BO       0.72      0.68      0.70       202
           E       0.90      0.91      0.90 