Copyright 2024 Gabriel Lindenmaier

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
import os
import sys

# Make the project root importable so the `src.*` imports below resolve.
# A notebook has no `__file__`, so the root is located relative to the current
# working directory: ".." must point at the repository root (i.e. the notebook
# is expected to run from a direct sub-folder of it).
project_root_path = os.path.realpath("..")
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root_path not in sys.path:
    sys.path.append(project_root_path)

import re
import pandas as pd
import numpy as np
import sqlite3

from pathlib import Path

from src.utils.settings import Config
from src.data.data_cleaning import DataCleaner
from src.data.vocab_coverage import VocabCoverage

# Constants

In [None]:
# Number of CPU cores, later handed to the data cleaner.
cpu_cores = Config.hardware.n_cpu
# Full path of the GloVe file, later handed to the vocabulary coverage check.
path_glove = Path(Config.path.data_external, Config.path.glove_file)

# Data Loading

In [None]:
# Data folder and SQLite database location, both taken from the project config.
# NOTE(review): `data_file` is not used in any of the visible cells.
data_file = Config.path.data_folder
data_base = Config.path.data_base
# Pairs every qualifying comment ("story") with the submission ("prompt") it replies to
# (joined on s.idstr = c.parent). Filters:
#   * submissions: positive score, at least one comment, not NSFW, title tagged
#     [WP], [SP], [RF] or [TT] and not tagged [EU], [CW], [IP], [MP], [PI], [PM], [CC] or [OT];
#   * comments: positive score, textlen above 400, author not one of the listed bot accounts.
# Rows are ordered by comment score, then submission score, then comment length (all descending).
sql_query = """SELECT c.body as 'story', s.title as 'prompt', s.selftext as 'prompt_body'
                    , s.score as 'prompt_score', c.score as 'story_score'
FROM submissions as s, comments as c
where s.score > 0 and s.num_comments > 0 and s.nsfw = 0
    and (s.title like '%[WP]%' or s.title like '%[SP]%' or s.title like '%[RF]%' or s.title like '%[TT]%')
    and s.title not like '%[EU]%' and s.title not like '%[CW]%' and s.title not like '%[IP]%'
    and s.title not like '%[MP]%' and s.title not like '%[PI]%' and s.title not like '%[PM]%'
    and s.title not like '%[CC]%' and s.title not like '%[OT]%'
    and s.idstr = c.parent 
    and c.score > 0 and c.textlen > 400
    and c.author not like 'WritingPromptsRobot' 
    and c.author not like 'WritingPromptsBot'
    and c.author not like 'AutoModerator'
    and c.author not like 'TotesMessenger'
    and c.author not like 'totes_meta_bot'
    and c.author not like 'PlaylisterBot'
    and c.author not like 'LazyLinkerBot'
order by c.score DESC, s.score DESC, c.textlen DESC;"""

In [None]:
%%time
# Open the SQLite database and load the prompt/story rows selected by `sql_query`.
# `conn` stays open because the last cell writes the cleaned frame back through it.
conn = sqlite3.connect(database=data_base)
data = pd.read_sql_query(sql=sql_query, con=conn)

In [None]:
# Recorded run: 466,886 rows before cleaning. Each row is one story comment
# joined to its prompt submission, not one submission.
data.describe()

In [None]:
# Eyeball the raw rows before any cleaning is applied.
data.head()

# Data Cleaning

In [None]:
%%time
# Wall time: 2min 57s
# Run the project's cleaning step on the loaded frame.
# The return value is not captured and later cells keep using `data`, so clean_data
# presumably modifies `data` in place — confirm in DataCleaner.clean_data.
cleaner = DataCleaner()
cleaner.clean_data(df=data, cpu_cores=cpu_cores)

In [None]:
# Same preview after cleaning, to compare against the raw rows.
data.head()

In [None]:
# Recorded run: 457,655 rows left after cleaning.
data.describe()

# Test - For Finding New Things To Clean

In [None]:
%%time
# To find confusable Unicode characters: https://unicode.org/cldr/utility/confusables.jsp?a=%3F&r=None
# Misspellings list: https://en.wikipedia.org/wiki/Commonly_misspelled_English_words
# Column to scan and switch for the pattern filter.
column = 'story'
ptrn = False

# With the switch off no pattern is compiled and every row stays a candidate.
pattern = re.compile(r'2FWritingPrompts', re.IGNORECASE) if ptrn else None
if pattern is None:
    idx = data.index
else:
    # Keep only the rows whose text in `column` contains the pattern.
    hits = data[column].map(lambda text: pattern.search(text) is not None)
    idx = data[hits].index

# Writable copy of the index labels; a later cell shuffles this array in place.
index = np.array(idx)
print(len(index))

In [None]:
# Print a few randomly chosen rows (prompt, pattern matches, prompt body, story)
# for manual inspection.
n_samples = 1  # how many random rows to print per run of this cell
sep = 80 * '~' + '\n'

# Shuffles the label array in place, so every run shows different rows.
np.random.shuffle(index)

# `index` holds index labels of `data`, hence the label-based selection via .loc.
sample = data.loc[index[:n_samples]]
for prompt, descr, story in zip(sample["prompt"], sample["prompt_body"], sample["story"]):
    print(prompt)
    print(sep)

    if pattern is not None:
        # findall() returns a (possibly empty) list, never None, so test for emptiness
        # to avoid printing a stray separator when the story has no match.
        matches = pattern.findall(story)
        if matches:
            for match in matches:
                print(match)
            print(sep)

    print(descr)
    print(sep)
    print(story)

## Find Strange Words, Signs & Misspellings

In [None]:
%%time
# Wall time: 3min
# Check the text columns against the GloVe vocabulary. Judging by the names and the
# recorded output below, `oov_glove` holds the out-of-vocabulary tokens — confirm
# against VocabCoverage.calculate_oov.
vocab_cover = VocabCoverage()
# All three text columns are passed in together.
l_data = [data["prompt_body"], data["prompt"], data["story"]]
# NOTE(review): vector_count=2196017 presumably matches the number of vectors in the GloVe file — confirm.
oov_glove = vocab_cover.calculate_oov(l_data, path_glove, vector_count=2196017)
# Output of the recorded run:
# Added 89,777 tokens to vocab
# Found tokens for 47.37% of d_vocab
# Found tokens for 99.64% of all text

In [None]:
#oov_glove

## Write Cleaned Data Into Database

In [None]:
%%time
# Wall time: 5.15 s
# Store the cleaned frame as table 'cleaned' through the already open connection,
# replacing any table of that name left over from an earlier run.
# NOTE(review): the DataFrame index is presumably written as an extra column
# (pandas' default for to_sql) — confirm that is wanted.
# NOTE(review): `conn` is not closed in the visible cells — confirm it is closed elsewhere.
data.to_sql('cleaned', conn, if_exists='replace')