In [6]:
#Cleaning of Texts/Preparation for Machine Learning 
import ast, os, time
from gensim.parsing.preprocessing import remove_stopwords
import nltk 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from langdetect import detect
from langdetect import LangDetectException

# Translation table used by replaceNonsense: newline -> space, and every
# character in the third argument is deleted outright.
_NONSENSE_TABLE = str.maketrans("\n", " ", "@.*!?,()0123456789")


def replaceNonsense(text):
    """Strip structural noise (markup symbols, punctuation, digits) from text.

    Every occurrence of the characters ``@ . * ! ? , ( )`` and of the digits
    0-9 is deleted. A newline is replaced by a single space instead of being
    deleted, so the last word of one line and the first word of the next are
    not fused into a single token.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Existing spaces are left untouched, so runs of
        spaces can remain where removed characters stood between words.
    """
    # A single translate() pass is equivalent to the chain of replace() calls
    # it supersedes: every multi-character pattern that used to be listed
    # ("(@)", "(...)", "()") is made up solely of characters deleted here.
    return text.translate(_NONSENSE_TABLE)

def removeTooSmall(text):
    """Drop every space-separated word that is two characters long or shorter.

    Intended to run after stopword removal to catch short leftovers. Empty
    strings produced by consecutive spaces count as too small and are dropped
    as well.

    Args:
        text: A string of words separated by single spaces.

    Returns:
        The surviving words (length of three or more) joined by single spaces,
        in their original order.
    """
    # Build a new sequence instead of calling list.remove() while iterating:
    # removing during iteration shifts the remaining items left and makes the
    # loop skip the element that follows each removed word.
    return " ".join(word for word in text.split(" ") if len(word) > 2)

# Lower-case words that cleantexts filters out of a review. The trailing empty
# string lets the same filter discard empty tokens left behind by cleaning.
# NOTE(review): third-person singular pronouns (he/him/his/she/her/...) are not
# in this list -- presumably intentional; confirm before "completing" it.
stop_words = [
    # pronouns and possessives
    "i", "me", "my", "myself",
    "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # forms of "be", "have" and "do"
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs of sequence, place and manner
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    # quantifiers and qualifiers
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # contraction fragments, modals and fillers
    "s", "t", "can", "will", "just", "don", "should", "now",
    # empty token
    "",
]
    
def cleantexts(text):
    """Turn a raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. Lower-case the text and split it on single spaces.
      2. Discard tokens found in ``stop_words``.
      3. Re-join the tokens and pass the string through ``replaceNonsense``
         and then ``removeTooSmall``.
      4. Split again and discard stopwords a second time, which also removes
         empty tokens created by the cleaning step.
      5. Lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The review text to process.

    Returns:
        A list of lemmatized tokens.
    """
    # First stopword pass on the raw, lower-cased tokens.
    words = [word for word in text.lower().split(" ") if word not in stop_words]

    # Character-level scrubbing followed by the short-word filter.
    scrubbed = removeTooSmall(replaceNonsense(" ".join(words)))

    # Second stopword pass: cleaning can expose stopwords and empty strings.
    survivors = [word for word in scrubbed.split(" ") if word not in stop_words]

    # Lemmatize to collapse plurals and other inflected forms.
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word) for word in survivors]

In [5]:
import pandas as pd
# Load goodreads_reviews_comics_graphic.json into a DataFrame; lines=True makes
# pandas read the file as one JSON object per line.
comics_df = pd.read_json('goodreads_reviews_comics_graphic.json', lines=True)

In [7]:
from langdetect import detect, LangDetectException

def detect_non_english(text, index):
    """Detect the language of one review, logging progress every 1000 rows.

    Args:
        text: The review text to classify.
        index: Row label of the review. A progress line is printed whenever it
            is a multiple of 1000.

    Returns:
        The value returned by langdetect's ``detect`` for ``text``, or
        "unknown" when ``text`` is not a string, is blank, or detection raises
        ``LangDetectException``.

    NOTE(review): langdetect documents that results can differ between runs
    unless ``DetectorFactory.seed`` is set -- consider seeding for
    reproducibility.
    """
    # Report progress before attempting detection so that milestone rows are
    # logged even when detection fails for that particular row.
    if index % 1000 == 0:
        print(f"Processed {index} rows...")

    # Missing values (None/NaN) and blank strings carry nothing to classify;
    # short-circuit instead of letting a non-string reach detect().
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Tag every review with its detected language; the row label is passed along
# so detect_non_english can print its progress lines.
comics_df["language"] = comics_df.apply(
    lambda review: detect_non_english(review["review_text"], review.name),
    axis=1,
)
print(comics_df["language"].unique())

# Keep only the rows detected as English, attach the cleaned token list, and
# drop the helper "language" column from the result.
english_comics_df = (
    comics_df.loc[comics_df["language"] == "en"]
    .assign(cleaned_review=lambda reviews: reviews["review_text"].apply(cleantexts))
    .drop(columns=["language"])
)

print(english_comics_df.head())

Processed 0 rows...
Processed 1000 rows...
Processed 2000 rows...
Processed 3000 rows...
Processed 4000 rows...
Processed 5000 rows...
Processed 6000 rows...
Processed 7000 rows...
Processed 8000 rows...
Processed 9000 rows...
Processed 10000 rows...
Processed 11000 rows...
Processed 12000 rows...
Processed 13000 rows...
Processed 14000 rows...
Processed 15000 rows...
Processed 16000 rows...
Processed 17000 rows...
Processed 18000 rows...
Processed 19000 rows...
Processed 20000 rows...
Processed 21000 rows...
Processed 22000 rows...
Processed 23000 rows...
Processed 24000 rows...
Processed 25000 rows...
Processed 26000 rows...
Processed 27000 rows...
Processed 28000 rows...
Processed 29000 rows...
Processed 30000 rows...
Processed 31000 rows...
Processed 32000 rows...
Processed 33000 rows...
Processed 34000 rows...
Processed 35000 rows...
Processed 36000 rows...
Processed 37000 rows...
Processed 38000 rows...
Processed 39000 rows...
Processed 40000 rows...
Processed 41000 rows...
Proce

Processed 335000 rows...
Processed 336000 rows...
Processed 337000 rows...
Processed 338000 rows...
Processed 340000 rows...
Processed 341000 rows...
Processed 342000 rows...
Processed 343000 rows...
Processed 344000 rows...
Processed 345000 rows...
Processed 346000 rows...
Processed 347000 rows...
Processed 348000 rows...
Processed 349000 rows...
Processed 350000 rows...
Processed 351000 rows...
Processed 352000 rows...
Processed 353000 rows...
Processed 354000 rows...
Processed 355000 rows...
Processed 356000 rows...
Processed 357000 rows...
Processed 358000 rows...
Processed 359000 rows...
Processed 360000 rows...
Processed 361000 rows...
Processed 362000 rows...
Processed 363000 rows...
Processed 364000 rows...
Processed 365000 rows...
Processed 366000 rows...
Processed 367000 rows...
Processed 368000 rows...
Processed 369000 rows...
Processed 370000 rows...
Processed 371000 rows...
Processed 372000 rows...
Processed 373000 rows...
Processed 374000 rows...
Processed 375000 rows...


In [8]:
# Write the cleaned English comics reviews to CSV; index=False leaves the
# DataFrame index out of the file.
# NOTE(review): cleaned_review holds Python lists, which CSV stores in string
# form -- readers will presumably need to parse them back; confirm downstream.
english_comics_df.to_csv('english_comics_reviews_clean.csv', index=False)

In [11]:
import pandas as pd
# Load goodreads_reviews_children.json into a DataFrame; lines=True makes
# pandas read the file as one JSON object per line.
children_df = pd.read_json('goodreads_reviews_children.json', lines=True)

In [13]:
from langdetect import detect, LangDetectException

def detect_non_english(text, index):
    """Detect the language of one review, logging progress every 1000 rows.

    Args:
        text: The review text to classify.
        index: Row label of the review. A progress line is printed whenever it
            is a multiple of 1000.

    Returns:
        The value returned by langdetect's ``detect`` for ``text``, or
        "unknown" when ``text`` is not a string, is blank, or detection raises
        ``LangDetectException``.

    NOTE(review): langdetect documents that results can differ between runs
    unless ``DetectorFactory.seed`` is set -- consider seeding for
    reproducibility.
    """
    # Report progress before attempting detection so that milestone rows are
    # logged even when detection fails for that particular row.
    if index % 1000 == 0:
        print(f"Processed {index} rows...")

    # Missing values (None/NaN) and blank strings carry nothing to classify;
    # short-circuit instead of letting a non-string reach detect().
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Tag every review with its detected language; the row label is passed along
# so detect_non_english can print its progress lines.
children_df["language"] = children_df.apply(
    lambda review: detect_non_english(review["review_text"], review.name),
    axis=1,
)
print(children_df["language"].unique())

# Keep only the rows detected as English, attach the cleaned token list, and
# drop the helper "language" column from the result.
english_children_df = (
    children_df.loc[children_df["language"] == "en"]
    .assign(cleaned_review=lambda reviews: reviews["review_text"].apply(cleantexts))
    .drop(columns=["language"])
)

print(english_children_df.head())

Processed 0 rows...
Processed 1000 rows...
Processed 2000 rows...
Processed 3000 rows...
Processed 4000 rows...
Processed 5000 rows...
Processed 6000 rows...
Processed 7000 rows...
Processed 8000 rows...
Processed 9000 rows...
Processed 10000 rows...
Processed 11000 rows...
Processed 12000 rows...
Processed 13000 rows...
Processed 14000 rows...
Processed 15000 rows...
Processed 16000 rows...
Processed 17000 rows...
Processed 18000 rows...
Processed 19000 rows...
Processed 20000 rows...
Processed 21000 rows...
Processed 22000 rows...
Processed 23000 rows...
Processed 24000 rows...
Processed 25000 rows...
Processed 26000 rows...
Processed 27000 rows...
Processed 28000 rows...
Processed 29000 rows...
Processed 30000 rows...
Processed 31000 rows...
Processed 32000 rows...
Processed 33000 rows...
Processed 34000 rows...
Processed 35000 rows...
Processed 36000 rows...
Processed 37000 rows...
Processed 38000 rows...
Processed 39000 rows...
Processed 40000 rows...
Processed 41000 rows...
Proce

Processed 336000 rows...
Processed 337000 rows...
Processed 338000 rows...
Processed 339000 rows...
Processed 340000 rows...
Processed 341000 rows...
Processed 342000 rows...
Processed 343000 rows...
Processed 344000 rows...
Processed 345000 rows...
Processed 346000 rows...
Processed 347000 rows...
Processed 348000 rows...
Processed 349000 rows...
Processed 350000 rows...
Processed 351000 rows...
Processed 352000 rows...
Processed 353000 rows...
Processed 354000 rows...
Processed 355000 rows...
Processed 356000 rows...
Processed 357000 rows...
Processed 359000 rows...
Processed 360000 rows...
Processed 361000 rows...
Processed 362000 rows...
Processed 363000 rows...
Processed 364000 rows...
Processed 365000 rows...
Processed 366000 rows...
Processed 367000 rows...
Processed 368000 rows...
Processed 369000 rows...
Processed 370000 rows...
Processed 371000 rows...
Processed 372000 rows...
Processed 373000 rows...
Processed 374000 rows...
Processed 375000 rows...
Processed 376000 rows...


Processed 666000 rows...
Processed 667000 rows...
Processed 668000 rows...
Processed 669000 rows...
Processed 670000 rows...
Processed 671000 rows...
Processed 672000 rows...
Processed 673000 rows...
Processed 674000 rows...
Processed 675000 rows...
Processed 676000 rows...
Processed 677000 rows...
Processed 678000 rows...
Processed 679000 rows...
Processed 680000 rows...
Processed 681000 rows...
Processed 682000 rows...
Processed 683000 rows...
Processed 684000 rows...
Processed 685000 rows...
Processed 686000 rows...
Processed 687000 rows...
Processed 688000 rows...
Processed 689000 rows...
Processed 690000 rows...
Processed 691000 rows...
Processed 692000 rows...
Processed 693000 rows...
Processed 694000 rows...
Processed 695000 rows...
Processed 696000 rows...
Processed 697000 rows...
Processed 698000 rows...
Processed 699000 rows...
Processed 700000 rows...
Processed 701000 rows...
Processed 702000 rows...
Processed 703000 rows...
Processed 704000 rows...
Processed 705000 rows...


In [14]:
# Write the cleaned English children's reviews to CSV; index=False leaves the
# DataFrame index out of the file.
# NOTE(review): cleaned_review holds Python lists, which CSV stores in string
# form -- readers will presumably need to parse them back; confirm downstream.
english_children_df.to_csv('english_children_reviews_clean.csv', index=False)