In [1]:
import pandas as pd
import spacy
import re
import emoji
import csv

from spacy.pipeline.lemmatizer import lemmatizer_score

# Load the raw reviews and print summary statistics as a quick sanity check.
dataset = pd.read_csv("./IMDB Dataset.csv")
print(dataset.describe())

# Open the output CSV with newline="" as the csv module requires: the writer
# emits its own row terminators, and the cleaned reviews contain embedded "\n"
# characters inside quoted fields that the text layer must not translate.
# The encoding is pinned so the output does not depend on the platform locale.
# The handle stays open on purpose: later cells write to it and then close it.
file_to_write = open("./IMDB Dataset_cleaned.csv", "w", newline="", encoding="utf-8")
new_csv = csv.writer(file_to_write)
new_csv.writerow(["review", "sentiment"])

# Load the spaCy English pipeline used to parse the reviews.
nlp = spacy.load("en_core_web_md")

In [2]:
def clean_token(doc):
    """Return the normalised lemmas of the tokens in ``doc`` worth keeping.

    A token is dropped when it is flagged as punctuation, whitespace, a stop
    word, a quote, number-like, or when it is not pure ASCII. Because
    punctuation is dropped, emoticons flagged as punctuation disappear too.

    Every surviving token is replaced by its lemma, from which each "--" is
    removed, then one trailing and one leading "-" are stripped, and the
    result is lower-cased.

    Args:
        doc: An iterable of spaCy-style tokens (a ``Doc`` or a ``Span``).

    Returns:
        A list of cleaned lemma strings, in the original token order.
    """

    def _is_rejected(tok):
        # Same checks, in the same order, as the original keep-condition,
        # expressed as its negation.
        return (tok.is_punct or tok.is_space or tok.is_stop or tok.is_quote
                or not tok.is_ascii or tok.like_num)

    def _normalise(lemma):
        without_dashes = lemma.replace("--", "")
        trimmed = without_dashes.removesuffix("-").removeprefix("-")
        return trimmed.lower()

    return [_normalise(tok.lemma_) for tok in doc if not _is_rejected(tok)]

In [3]:
# Matches HTML-like tags such as "<br />"; compiled once instead of per call.
_HTML_TAG_RE = re.compile(r'<.*?>')


def process_text(text):
    """Clean one raw review and return its sentences as newline-separated text.

    HTML-like tags are replaced, unicode emojis are turned into their textual
    names with ``emoji.demojize``, the text is parsed with the module-level
    ``nlp`` pipeline, and every sentence is reduced to its cleaned tokens by
    ``clean_token``.

    Args:
        text: The raw review string.

    Returns:
        A string with one line per sentence in ``doc.sents``. Each line is the
        space-joined cleaned tokens of that sentence, and is empty when every
        token of the sentence was filtered out.
    """
    # Replace each tag with a space rather than nothing, so the words on either
    # side of a tag (e.g. "great<br /><br />The") are not glued into one token.
    # The extra whitespace is harmless: clean_token drops whitespace tokens.
    text = _HTML_TAG_RE.sub(' ', text)
    # Convert unicode emojis to words.
    text = emoji.demojize(text)
    doc = nlp(text)
    return "\n".join(" ".join(clean_token(sent)) for sent in doc.sents)


# Clean every review and write it next to its sentiment label.
# Walking both columns in lockstep pairs the values by position, so the result
# does not depend on the DataFrame having a default 0..n-1 index, and the
# sentiment column is not looked up again on every row. Indexing with
# dataset['sentiment'] also fails with a clear KeyError if the column is absent.
for review, sentiment in zip(dataset['review'], dataset['sentiment']):
    new_csv.writerow([process_text(review), sentiment])

In [4]:
# Close the output file opened for "./IMDB Dataset_cleaned.csv" so that any
# buffered rows are flushed to disk.
file_to_write.close()