In [3]:
import json
import pandas as pd
import numpy as np
import nltk
from nltk import FreqDist
import gensim.downloader as api
import random
import gc
from nltk.stem import WordNetLemmatizer

# 1. Jokes dataset

# 1.1 Cleaning raw reddit crawl file
Removes unnecessary fields from the submissions and removes submissions that have been deleted or have no body.

In [None]:
# Cleaning raw reddit crawl results - removes deleted submissions and removes unnecessary response values.
def clean_crawl_results(source_fn, destination_fn):
    crawled_df = pd.read_json(source_fn,orient='records')
    
    # Not all columns are needed for further analysis.
    crawled_df.drop(columns=['created_utc', "author", "subreddit"], inplace=True)
    
    # Remove submissions, that have had their title or body removed.
    crawled_df = crawled_df[~crawled_df['body'].isin(["","[removed]","[deleted]"])]
    crawled_df = crawled_df[~crawled_df['title'].isin(["","[removed]","[deleted]"])]
    
    # Write json to an xz archive. xz seems to offer best compression ratio on these json files.
    crawled_df.to_json(path_or_buf=destination_fn,orient='records',indent=4,compression="infer")

# src = "data/raw_files/reddit_jokes_until_18-01-2021_uncleaned.json.xz"
# dst = "data/cleaned_files/reddit_jokes.json.xz"
# clean_crawl_results(source_fn=src, destination_fn=dst)

# 1.2 Reddit dataset: cleaning
Assumes that the initial cleaning from last block has been done.  
The following blocks:
- Remove anything from the post following "edit: ";
- Remove duplicate posts;
- Create a "joke" column to the df by either combining titles with bodies or just returning bodies(if it contains the title);

In [2]:
# Cleaning for jokes dataset
def clean_df(df):
    # Remove content after edit
    df["title"] = df["title"].str.replace(r'edit:.*', '')
    df["body"] = df["body"].str.replace(r'edit:.*', '')
    
    # Creating "joke" column
    df["joke"] = np.where(df["title"].str[:10] != df["body"].str[:10], df["title"] + " " + df["body"], df["body"])
    
    return df

# Read json, that has reddit submissions as "title" and "body", combine them into column "joke" and 
# remove duplicates.
def read(json_filename):
    df = pd.read_json(path_or_buf=json_filename,orient='records',compression="infer")
    df = clean_df(df)
    
    return df

In [90]:
# Reading [["title" and "body"]]
# df = pd.read_json(path_or_buf="data/reddit_jokes.json.xz",orient='records')

try:
    # Reading [["title" and "body"]] and adding "joke" column
    jokes_df = read("data/cleaned_files/reddit_jokes.json.xz")
    jokes_df[["text", "score"]] = jokes_df[["joke","score"]]
    
    # Option to save all jokes (~1mil)
    # jokes_df[["text"]].to_json(path_or_buf="data/for_team/jokes_all.json.xz")
    
    # Discart jokes with score < 1 (~200k discarded)
    jokes_df = jokes_df[jokes_df["score"]>0]
    jokes_df.reset_index(drop=True, inplace=True)
    
    jokes_df.to_json("")
    jokes_df[["text"]].to_json(path_or_buf="data/for_team/jokes.json.xz")
    
    del jokes_df

except Exception as e:
    del jokes_df
    gc.collect()
    raise e
    

# 2 Non-joke datasets
Ideas:  
https://lionbridge.ai/datasets/15-best-chatbot-datasets-for-machine-learning/  
https://ai.google.com/research/NaturalQuestions

# 2.1 Google QA dataset: cleaning
Cleaning the raw file was done in a separate script due to some file streaming errors in notebook.  
Read json where each instance has {"question", "long_answer"} and combine them into one "text" column.

In [10]:
# Join question/request and answer
def create_text(qa):
    q,a = qa
    q = q.capitalize()
    a = a.capitalize()
    
    # Questions/requests do not have the appropriate puncuations.
    question_words = ["How","What","When","Where","Which","Who","Whose","Why"]    
    if any(q.startswith(qw) for qw in question_words):
        q = q + "?"
    else:
        q = q + "."
        
    return q + " " + a

# Questions with 1-10 sentences as answers seem to average out to same mean length as jokes.
def shorten(arr):
    end = random.randint(2,11)
    new_arr = arr[0:end]
    return new_arr


try:
    
    qa_df = pd.read_json("data/cleaned_files/google_qa.json.xz")
    
    # Join questions and answers into text
    qa_df[["text"]] = qa_df[["question", "long_answer"]].apply(create_text, axis=1)
    
    qa_df = qa_df[["text"]]

    # Cut the articles to be about the same length of sentences as jokes.    
    print("Started cutting.(tokenization)")
    qa_df["text"] = qa_df["text"].apply(str).apply(nltk.sent_tokenize).apply(shorten).apply(" ".join)
    print("Finished tokenization.")

    qa_df.reset_index(drop=True, inplace=True)
    
    print("Saving to file")
    qa_df.to_json("data/for_team/google_qa.json.xz")
    print("Finished saving to file")
    
    del qa_df

except Exception as e:
    del qa_df
    gc.collect()
    raise e

Started cutting.(tokenization)
Finished tokenization.


# 2.2 News dataset: cleaning
https://components.one/pages/about

In [24]:
# Articles with lengths of 1-10 sentences seem to average out to 4 sentences per article
# Note that sentence length std will still differ from jokes.
def shorten(arr):
    end = random.randint(2,11)
    new_arr = arr[1:end]
    return new_arr

try:
    # Read in all news
    news_df = pd.read_csv("data/all-the-news-2-1.zip")
    print("Finished reading.")
    # Print count of each publications articles
    # news_df.groupby(by='publication').count()["article"]

    # Keep only important columns
    news_df=news_df[["article", "publication"]]

    # Keep only articles from certain publishers
    #publications = ['New Republic', 'TMZ', 'Business Insider', 'Vox', 'CNBC', 'People', 'TechCrunch', 'Refinery 29', 'The Hill', 'Wired', 'Vice News', 'Economist', 'New Yorker', 'Washington Post', 'The New York Times', 'Buzzfeed News', 'Reuters', 'The Verge', 'Fox News', 'Hyperallergic', 'Gizmodo', 'Axios', 'CNN', 'Politico', 'Mashable', 'Vice']
    publications = ["Washington Post", "The New York Times", "Reuters"]
    news_df=news_df[news_df["publication"].str.contains('|'.join(publications),na = False)]
    
    # Drop empty rows
    news_df = news_df[news_df["article"] != ""]
    # Drop NA
    news_df.dropna(inplace=False)
    
    # Shuffle df
    news_df = news_df.sample(frac=1, random_state=0)
    
    # Select first 500,000 articles 
    #(some articles will end up with no content after processing, don't have time to figure out why, 
    # so will just include a margin to have at least 300,000 articles with content by the end).
    news_df = news_df.head(500000)
    
    # Remove publication names from articles
    
    news_df["article"] = news_df["article"].str.replace("|".join(publications), '', case = False, regex=True)
    print("Removed publication names.")

    # Cut the articles to be about the same length of sentences as jokes.
    print("Started cutting sentences.(tokenization)")
    news_df["article"] = news_df["article"].apply(str).apply(nltk.sent_tokenize).apply(shorten).apply(" ".join)
    print("Finished tokenization.")
    
    # Reset id column
    news_df.reset_index(drop=True, inplace=True)
    
    # Leave only article column
    news_df=news_df[["text"]]
    
    # Write to file
    print("Writing to file")
    news_df.to_json(path_or_buf="data/for_team/articles.json.xz",orient='records',indent=4,compression="infer")
    print("Finished processing.")
    
    del news_df

except Exception as e:
    del news_df
    gc.collect()
    raise e
    

  interactivity=interactivity, compiler=compiler, result=result)


Finished reading.
Removed publication names.
Started tokenization.
Finished tokenization.
Writing to file
Finished processing.


# 4. Creating tokenized files

Functions for tokenization

In [12]:
def tokenize(df, column_name):
    df["word_tokenize"] = df[column_name].apply(nltk.word_tokenize)
    print("Finished word tokenization.")
    
    df["sent_tokenize"] = df[column_name].apply(nltk.sent_tokenize)
    print("Finished sentence tokenization.")
    
    return df
    
def print_tokenization_stats(df):
    print(f'Words mean: {df["word_tokenize"].apply(len).mean():.2f}')
    print(f'Words STD: {df["word_tokenize"].apply(len).std():.2f}')
    
    print(f'Sentences mean: {df["sent_tokenize"].apply(len).mean()}')
    print(f'Sentences STD: {df["sent_tokenize"].apply(len).std():.2f}')

def create_tokenized_file(source_fn, destination_fn):
    df = pd.read_json(source_fn)
    df = tokenize(df, "text")
    print_tokenization_stats(df)
    print(f'Writing to file')
    df.to_json(destination_fn,orient='records',compression="infer")

### Jokes

In [16]:
src = "data/for_team/jokes.json.xz"
dest = "data/stats_files/tokenized_jokes.json.xz"
create_tokenized_file(source_fn=src, destination_fn=dest)

Finished word tokenization.
Finished sentence tokenization.
Words mean: 55.71
Words STD: 117.59
Sentences mean: 3.8667775796982577
Sentences STD: 7.75


### News articles

In [37]:
src = "data/for_team/articles.json.xz"
dest = "data/stats_files/tokenized_news.json.xz"
create_tokenized_file(source_fn=src, destination_fn=dest)

Finished word tokenization.
Finished sentence tokenization.
Words mean: 140.06
Words STD: 92.44
Sentences mean: 4.842046666666667
Sentences STD: 2.79


### Google QA

In [13]:
src = "data/for_team/google_qa.json.xz"
dest = "data/stats_files/tokenized_google_qa.json.xz"
create_tokenized_file(source_fn=src, destination_fn=dest)

Finished word tokenization.
Finished sentence tokenization.
Words mean: 127.61
Words STD: 394.49
Sentences mean: 3.8504325363646124
Sentences STD: 1.92


# 5. Adding lemmatization to tokenized files

In [14]:
class Lemmatizer:
    def __init__(self):
        self.internal_lemmatizer = WordNetLemmatizer()
    
    def lemmatize_arr(self, tokens):
        lemmas = [self.internal_lemmatizer.lemmatize(t.lower()) for t in tokens]
        return lemmas

def create_lemmatized_file(source_fn, destination_fn):
    lemmatizer = Lemmatizer()
    df = pd.read_json(source_fn)
    
    df["lemmatize"] = df["word_tokenize"].apply(lemmatizer.lemmatize_arr)
    print(f'Finished lemmatizing file "{source_fn}"')
    
    df.to_json(destination_fn,orient='records',compression="infer")
    print(f'Finished writing to file "{destination_fn}"')
    

### Jokes

In [17]:
src = f'data/stats_files/tokenized_jokes.json.xz'
dest = f'data/stats_files/tokenized_lemmatized_jokes.json.xz'
create_lemmatized_file(source_fn=src, destination_fn=dest)

Finished lemmatizing file "data/stats_files/tokenized_jokes.json.xz"
Finished writing to file "data/stats_files/tokenized_lemmatized_jokes.json.xz"


### News

In [None]:
src = f'data/stats_files/tokenized_news.json.xz'
dest = f'data/stats_files/tokenized_lemmatized_news.json.xz'
create_lemmatized_file(source_fn=src, destination_fn=dest)

### Google QA

In [15]:
src = f'data/stats_files/tokenized_google_qa.json.xz'
dest = f'data/stats_files/tokenized_lemmatized_google_qa.json.xz'
create_lemmatized_file(source_fn=src, destination_fn=dest)

Finished lemmatizing file "data/stats_files/tokenized_google_qa.json.xz"
Finished writing to file "data/stats_files/tokenized_lemmatized_google_qa.json.xz"
