In [5]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

# Custom defaultdict implementation
class MyDefaultDict(dict):
    def __init__(self, default_factory, *args, **kwargs):
        self.default_factory = default_factory
        super().__init__(*args, **kwargs)

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        else:
            self[key] = self.default_factory()
            return self[key]

# Read training and test data from CSV files
train_data = pd.read_csv('train.csv')

# Preprocess data
train_sentences = train_data['untagged_sentence'].tolist()
train_tags = train_data['tagged_sentence'].tolist()

# Function to convert tagged sentences to NLTK format
def convert_to_nltk_format(tagged_sentence):
    # Extracting the tagged words and tags from the input
    tagged_words = eval(tagged_sentence)
    words = [word for word, tag in tagged_words]
    tags = [tag for word, tag in tagged_words]
    # Combining words and tags into a list of tuples
    nltk_format = list(zip(words, tags))
    return nltk_format

# Convert all tagged sentences to NLTK format
nltk_train_tags = [convert_to_nltk_format(tagged_sentence) for tagged_sentence in train_tags]

random.seed(1234)
train_set = nltk_train_tags

# Prepare the features for the classifier
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

# Prepare the dataset
X_train = []
y_train = []

for tagged_sentence in nltk_train_tags:
    untagged_sentence = [w for w, t in tagged_sentence]
    for index in range(len(untagged_sentence)):
        X_train.append(features(untagged_sentence, index))
        y_train.append(tagged_sentence[index][1])

# Convert the dictionary of features to a feature vector
vectorizer = DictVectorizer(sparse=True)
X_train = vectorizer.fit_transform(X_train)

# Train the MEMM using logistic regression
clf = LogisticRegression(solver='saga', random_state=1234)

clf.fit(X_train, y_train)

# Use the MEMM for prediction
def predict(sentence):
    tagged_sentence = []
    tags = clf.classes_
    for index in range(len(sentence)):
        features_transformed = vectorizer.transform([features(sentence, index)])
        probabilities = clf.predict_proba(features_transformed)[0]
        max_index = np.argmax(probabilities)
        tagged_sentence.append((sentence[index], tags[max_index]))
    return tagged_sentence




In [8]:
import pandas as pd
import ast
from tqdm import tqdm


# Load your data
test_df = pd.read_csv('test_small.csv')

# Initialize a list to store tagged sentences
tagged_sentences = []

# Iterate over each row in the DataFrame
for index, row in tqdm(test_df.iterrows()):
    # Extract the untagged sentence and its corresponding id
    untagged_sentence = ast.literal_eval(row['untagged_sentence'])
    sentence_id = row['id']
    
    # Use the Viterbi algorithm to tag each word in the sentence
    tagged_sentence = predict(untagged_sentence)
    
    # Combine the id and tagged sentence into a string and append to the list
    tagged_sentences.append({'id': sentence_id, 'tagged_sentence': tagged_sentence})
    

# Convert the list of tagged sentences to a DataFrame
tagged_df = pd.DataFrame(tagged_sentences)

# Save the DataFrame to a CSV file
tagged_df.to_csv('samples.csv', index=False)


4000it [46:16,  1.44it/s]


In [6]:
from nltk.tokenize import word_tokenize
import time
## Testing
sentence_test = 'Twitter is the best networking social site. Man is a social animal. Data science is an emerging field. Data science jobs are high in demand.'
words = word_tokenize(sentence_test)

start = time.time()
tagged_seq = predict(words)
end = time.time()
difference = end-start

print(tagged_seq)
print(difference)

[('Twitter', 'NN'), ('is', 'BE'), ('the', 'AT'), ('best', 'JJ'), ('networking', 'VB'), ('social', 'JJ'), ('site', 'NN'), ('.', '.'), ('Man', 'NN'), ('is', 'BE'), ('a', 'AT'), ('social', 'JJ'), ('animal', 'NN'), ('.', '.'), ('Data', 'NN'), ('science', 'NN'), ('is', 'BE'), ('an', 'AT'), ('emerging', 'VB'), ('field', 'NN'), ('.', '.'), ('Data', 'NN'), ('science', 'NN'), ('jobs', 'NN'), ('are', 'BE'), ('high', 'JJ'), ('in', 'IN'), ('demand', 'NN'), ('.', '.')]
0.879969596862793
