In [52]:
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pickle
import nltk
from nltk.tokenize import word_tokenize

In [53]:
# Fetch the 'punkt_tab' NLTK resource up front so tokenization later in the
# notebook does not fail on a missing data package.
# NOTE(review): presumably this is the resource word_tokenize needs — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [54]:
# Read the train/test CSV files (one token per row).
# keep_default_na=False stops pandas from turning literal tokens such as
# "NA", "null", "nan" or "None" into float NaN; such words are legitimate
# tokens in running text, and a NaN token would break the string methods
# (.lower(), .isupper(), ...) used during feature extraction.
csv_options = {'encoding': "ISO-8859-1", 'keep_default_na': False}
train_data = pd.read_csv('data/data_train.csv', **csv_options)
test_data = pd.read_csv('data/data_test.csv', **csv_options)

In [55]:
# Preview the first rows of each split, then display the training frame's shape
for frame in (train_data, test_data):
    print(frame.head())
train_data.shape

     token label  token_length
0       In     O             2
1     this     O             4
2  article     O             7
3       we     O             2
4  discuss     O             7
         token label  token_length
0      Systems     O             7
1        based     O             5
2           on     O             2
3  synchronous     B            11
4     grammars     I             8


(26742, 3)

In [56]:
# Summary statistics of the training frame (describe() defaults)
train_data.describe()

Unnamed: 0,token_length
count,26742.0
mean,5.180353
std,3.36148
min,1.0
25%,2.0
50%,4.0
75%,8.0
max,44.0


In [57]:
# List the distinct tag values that occur in the training labels
train_data['label'].unique()

array(['O', 'B', 'I'], dtype=object)

In [58]:
# Count missing values per column in the training frame
train_data.isnull().sum()

token           0
label           0
token_length    0
dtype: int64

In [59]:
# Give each split a 'sentence_id' grouping key unless it already has one.
# The id is a running count of '.' tokens seen so far.
# NOTE(review): the count is incremented on the '.' row itself, so every
# period carries the id of the group that FOLLOWS it (it opens the next
# group instead of closing the current one). predict_raw_text derives ids
# the same way, so change both together if this is ever revised.
for frame in (train_data, test_data):
    if 'sentence_id' in frame.columns:
        continue
    is_period = frame['token'] == '.'
    frame['sentence_id'] = is_period.cumsum()

In [60]:
# Function to extract features for a token
def extract_features(df, i):
    """
    Build the CRF feature dict for the token at position ``i`` of ``df``.

    :param df: DataFrame with 'token' and 'token_length' columns. Neighbours
        are looked up in this same frame, so 'BOS'/'EOS' mark the first and
        last row of whatever frame is passed in.
    :param i: Positional (iloc) index of the token inside ``df``.
    :return: dict with the token's own features, the previous token's
        features under a '-1:' prefix (or 'BOS'), and the next token's
        features under a '+1:' prefix (or 'EOS').
    """
    def describe_word(word, prefix=''):
        # Lower-cased form plus casing/digit flags, keyed with the given prefix
        return {
            prefix + 'token': word.lower(),
            prefix + 'is_upper': word.isupper(),
            prefix + 'is_title': word.istitle(),
            prefix + 'is_digit': word.isdigit(),
        }

    current_row = df.iloc[i]
    features = describe_word(current_row['token'])
    features['token_length'] = current_row['token_length']

    # Left context: nothing precedes the first row of the frame
    if i <= 0:
        features['BOS'] = True  # Beginning of Sentence
    else:
        features.update(describe_word(df.iloc[i - 1]['token'], '-1:'))

    # Right context: nothing follows the last row of the frame
    last_position = len(df) - 1
    if i >= last_position:
        features['EOS'] = True  # End of Sentence
    else:
        features.update(describe_word(df.iloc[i + 1]['token'], '+1:'))

    return features

# Function to prepare data for CRF
def create_dataset(df):
    """
    Turn a token-level frame into per-sentence sequences for the CRF.

    :param df: DataFrame with 'sentence_id', 'label' and the columns
        extract_features reads ('token', 'token_length').
    :return: (X, y) where X is a list of sentences, each a list of feature
        dicts, and y is the parallel list of label lists.
    """
    # One sub-frame per sentence, in groupby's (sorted) sentence_id order
    sentences = [sentence for _, sentence in df.groupby('sentence_id')]

    X = [
        [extract_features(sentence, pos) for pos in range(len(sentence))]
        for sentence in sentences
    ]
    y = [sentence['label'].tolist() for sentence in sentences]

    return X, y


In [61]:
# Build per-sentence feature sequences (X_*) and label sequences (y_*)
# for the train and test splits with create_dataset
X_train, y_train = create_dataset(train_data)
X_test, y_test = create_dataset(test_data)

In [62]:
# Train a CRF model
# Hyperparameters are collected in one place so they are easy to tune.
# Per the sklearn-crfsuite documentation, c1 and c2 are the L1 and L2
# regularization coefficients, and all_possible_transitions also generates
# transition features for label pairs that never occur in the training data.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**crf_params)
crf_model = crf.fit(X_train, y_train)

In [63]:
# Evaluate the model: predict a tag sequence for every test sentence
y_pred = crf_model.predict(X_test)

# Score only the non-'O' tags. A filter is used instead of list.remove('O')
# so this cell does not raise ValueError when the fitted model happens to
# have no 'O' class.
labels = [label for label in crf.classes_ if label != 'O']

In [65]:
# Weighted F1 restricted to the tags in `labels` ('O' was dropped from that list above)
print("F1 Score:", metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
# Per-tag report for the same tag subset
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

F1 Score: 0.6628394619979354
              precision    recall  f1-score   support

           B       0.67      0.53      0.59       445
           I       0.75      0.71      0.73       482

   micro avg       0.71      0.62      0.67       927
   macro avg       0.71      0.62      0.66       927
weighted avg       0.71      0.62      0.66       927



In [66]:
# Save the trained model to crf_model.pkl (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code, so only load this file back
# from a location you trust.
with open("crf_model.pkl", "wb") as f:
    pickle.dump(crf, f)

In [67]:
# Function for inference on raw unseen text
def preprocess_raw_text(raw_text):
    """
    Tokenize raw text into the token-level frame used for CRF prediction.

    :param raw_text: A string of raw text.
    :return: A DataFrame with one row per token and two columns:
        'token' (the token string) and 'token_length' (its character count).
    """
    tokens = word_tokenize(raw_text)  # NLTK word-level tokenization
    columns = {
        'token': tokens,
        'token_length': list(map(len, tokens)),
    }
    return pd.DataFrame(columns)


In [68]:
def predict_raw_text(raw_text, model):
    """
    Tag every token of a raw text string with the given CRF model.

    :param raw_text: A string of raw text.
    :param model: A trained CRF model exposing ``predict``.
    :return: The token DataFrame from preprocess_raw_text with added
        'sentence_id' and 'predicted_label' columns.
    """
    data = preprocess_raw_text(raw_text)

    # Sentence ids are a running count of '.' tokens, the same rule the
    # training frames use, so features are built over comparable groups.
    data['sentence_id'] = (data['token'] == '.').cumsum()

    # One list of feature dicts per sentence group
    X_unseen = []
    for _, sentence in data.groupby('sentence_id'):
        X_unseen.append(
            [extract_features(sentence, pos) for pos in range(len(sentence))]
        )

    predictions = model.predict(X_unseen)

    # Flatten the per-sentence predictions back into one label per token row
    flat_labels = []
    for sentence_labels in predictions:
        flat_labels.extend(sentence_labels)
    data['predicted_label'] = flat_labels

    return data

In [71]:
# Example raw unseen text (a single sentence)
raw_text = "This is an example sentence for named entity recognition testing."

# Tokenize the text and predict a tag for every token with the fitted CRF
result = predict_raw_text(raw_text, crf_model)

# Show each token next to its predicted tag
print(result[['token', 'predicted_label']])

          token predicted_label
0          This               O
1            is               O
2            an               O
3       example               O
4      sentence               B
5           for               O
6         named               B
7        entity               I
8   recognition               I
9       testing               O
10            .               O


In [72]:
# Second raw-text example: a sentence with parentheses and comma-separated phrases
raw_text = "Natural language processing (NLP) combines computational linguistics, machine learning, and deep learning models."

# Tokenize the text and predict a tag for every token with the fitted CRF
result = predict_raw_text(raw_text, crf_model)

# Show each token next to its predicted tag
print(result[['token', 'predicted_label']])

            token predicted_label
0         Natural               B
1        language               I
2      processing               I
3               (               O
4             NLP               B
5               )               O
6        combines               O
7   computational               B
8     linguistics               I
9               ,               O
10        machine               B
11       learning               I
12              ,               O
13            and               O
14           deep               B
15       learning               I
16         models               I
17              .               O
