# In [16]:
import pandas as pd
import Levenshtein
import nltk
import itertools

def run_sliding_window_through_text(words, window_size):
    """
    Yield successive tuples of consecutive words from ``words``.

    The first tuple holds up to ``window_size`` leading words (fewer if the
    input is shorter). Each later tuple drops the oldest word and appends the
    next one from the input.
    """
    remaining = iter(words)
    # Prime the window with the leading words; islice stops early on short input.
    current = tuple(itertools.islice(remaining, window_size))
    yield current
    for next_word in remaining:
        # Slide by one: discard the first element, append the new word.
        current = (*current[1:], next_word)
        yield current

def match_dict_similarity(text, expressions, threshold=0.75):
    """
    Find the span of ``text`` that is most similar to any given expression.

    ``text`` is tokenized with ``nltk.word_tokenize``. For every expression, a
    window with as many tokens as the expression has whitespace-separated
    words is slid over the tokens; each window is joined with single spaces
    and scored against the expression with ``Levenshtein.ratio``.

    Args:
        text: The text to search.
        expressions: Iterable of expression strings to look for.
        threshold: Minimum ratio a window must reach to count as a match.
            Defaults to 0.75.

    Returns:
        A ``(best_match, max_similarity)`` tuple holding the best-scoring
        window string and its ratio, or ``('', -1)`` when no window reaches
        ``threshold``.
    """
    max_similarity_obtained = -1
    best_match = ''
    # Tokenize once: the tokens do not depend on the expression being tried.
    tokenized_text = nltk.word_tokenize(text)
    for exp in expressions:
        size_of_window = len(exp.split())
        if size_of_window == 0:
            # A blank expression has no words to match; a zero-width window
            # would only produce meaningless comparisons.
            continue
        for window in run_sliding_window_through_text(tokenized_text, size_of_window):
            window_string = ' '.join(window)
            similarity_score = Levenshtein.ratio(window_string, exp)

            # Strict '>' keeps the earliest window when scores tie.
            if similarity_score >= threshold and similarity_score > max_similarity_obtained:
                max_similarity_obtained = similarity_score
                best_match = window_string
    return best_match, max_similarity_obtained

# Read the symptom lexicon: each line is stripped and split on tabs, and every
# line with at least two fields contributes its last field as an expression.
lexicon_file_path = 'COVID-Twitter-Symptom-Lexicon.txt'
expressions = []
with open(lexicon_file_path) as infile:
    split_lines = (raw_line.strip().split('\t') for raw_line in infile)
    expressions.extend(
        fields[-1].strip() for fields in split_lines if len(fields) > 1
    )

# Load the Excel sheet
excel_file_path = 'UnlabeledSet (2).xlsx'
df = pd.read_excel(excel_file_path)

# Match every non-empty text cell against the lexicon. Results are collected
# in plain lists and assigned as whole columns afterwards, so each column is
# created once with a fixed dtype instead of being grown cell by cell.
best_matches = []
max_similarities = []
for raw_text in df['TEXT']:  # Assuming 'TEXT' is the column name
    # Check for missing values BEFORE converting to str: str(NaN) is the
    # non-empty string 'nan', which would otherwise be matched as real text.
    if pd.isna(raw_text) or str(raw_text).strip() == '':
        best_matches.append(None)
        max_similarities.append(float('nan'))
        continue
    best_match, max_similarity = match_dict_similarity(str(raw_text), expressions)
    best_matches.append(best_match)
    max_similarities.append(max_similarity)

df['Best Match'] = pd.Series(best_matches, index=df.index, dtype=object)
df['Max Similarity'] = pd.Series(max_similarities, index=df.index, dtype=float)

# Save the modified DataFrame to a new Excel file
output_excel_path = 'UNLABELED_RESULT.xlsx'
df.to_excel(output_excel_path, index=False)
print('Output saved to Excel file:', output_excel_path)


# Cell output recorded from the notebook run (warning echo, then stdout):
#   df.at[index, 'Best Match'] = best_match
#
# Output saved to Excel file: UNLABELED_RESULT.xlsx
