# Exploring the statistics of jokes and non-jokes

In [14]:
# Data storage
import json
import pandas as pd
import numpy as np

# Language processing
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

#Other
import random
import gc
import string

In [15]:
# General stats about the count of words and sentences
def print_tokenization_stats(df):
    """Print the number of text instances and per-instance length statistics.

    For each of the "word_tokenize" and "sent_tokenize" columns the length of
    every entry is taken with ``len`` and the total, mean and standard
    deviation of those lengths are printed.

    Args:
        df: DataFrame with "word_tokenize" and "sent_tokenize" columns whose
            entries support ``len``.
    """
    n_instances = df.shape[0]
    print(f'Text instances: {n_instances:,}\n')

    sections = (("Words", "word_tokenize"), ("Sentences", "sent_tokenize"))
    for label, column in sections:
        lengths = df[column].apply(len)
        total, mean, std = lengths.sum(), lengths.mean(), lengths.std()
        print(f'{label} total: {total:,}')
        print(f'{label} mean: {mean:.2f}')
        print(f'{label} std: {std:.2f}')
        print()
    

def repeating_words_and_lemmas(df):
    """Print the mean share of distinct tokens per text instance.

    The share is computed separately for the "word_tokenize" and "lemmatize"
    columns and averaged over all rows.

    Args:
        df: DataFrame with "word_tokenize" and "lemmatize" columns holding
            iterables of string tokens.
    """
    def unique_proportion(arr):
        # Drop tokens that occur as a substring of string.punctuation
        # (i.e. single punctuation marks); the rest are treated as words.
        kept = list(filter(lambda token: token not in string.punctuation, arr))
        # An instance with nothing left counts as 0 rather than dividing by zero.
        return len(set(kept)) / len(kept) if kept else 0

    for label, column in (("words", "word_tokenize"), ("lemmas", "lemmatize")):
        mean_share = df[column].apply(unique_proportion).mean()
        print(f'Mean proportion of unique {label} in text instance: {mean_share:.4f}.')
    print()
    

# Creates and returns FreqDist's for use in later cells
def word__frequency_analysis(df, lower=False, lemmatize=False):
    """Count token frequencies over a dataset and print the 20 most common.

    Two distributions are built: one counting every occurrence of a token and
    one counting each token at most once per text instance.

    Args:
        df: DataFrame with "word_tokenize" and/or "lemmatize" columns holding
            iterables of string tokens.
        lower: If True, tokens are lower-cased before counting.
        lemmatize: If True, the "lemmatize" column is counted instead of
            "word_tokenize".

    Returns:
        Tuple ``(freq_dist, unique_freq_dist)`` of nltk ``FreqDist`` objects.
    """
    token_column = "lemmatize" if lemmatize else "word_tokenize"
    freq_dist, unique_freq_dist = FreqDist(), FreqDist()

    for tokens in df[token_column]:
        normalized = [t.lower() for t in tokens] if lower else tokens
        freq_dist.update(normalized)
        # set() collapses repeats so a token counts once per instance.
        unique_freq_dist.update(set(normalized))

    print("General frequencies:")
    freq_dist.pprint(20)
    print("\nUnique frequencies(each token counted only once per text instance):")
    unique_freq_dist.pprint(20)
    print()

    return freq_dist, unique_freq_dist


# What share of all token occurrences is covered by the most common words
def word_coverage_analysis(df, freq_dist, unique_freq_dist, top_ns=(10, 50, 100)):
    """Print how much of the corpus the N most frequent tokens account for.

    The printed figure is a share "of all words", so it is computed from
    ``freq_dist`` (every occurrence counted). Previously the per-instance
    de-duplicated ``unique_freq_dist`` was used, which made the number
    disagree with the message and left ``freq_dist`` unused.

    Args:
        df: Unused; kept for backward compatibility with existing callers.
        freq_dist: Distribution over all token occurrences; must provide
            ``most_common(n)`` and ``freq(token)``.
        unique_freq_dist: Unused; kept for backward compatibility.
        top_ns: Cut-offs (number of most frequent tokens) to report.
    """
    for n_most_common in top_ns:
        freq_sum = sum(freq_dist.freq(token) for token, _count in freq_dist.most_common(n_most_common))
        print(f'Top {n_most_common} most frequent words make up {freq_sum*100:.2f}% of all words.')

    print()
    
# Combines the functions above
def analyze_file(filename):
    """Load one tokenized dataset and print all statistics for it.

    Args:
        filename: Path passed to ``pd.read_json``.

    Returns:
        The ``(freq_dist, unique_freq_dist)`` pair produced by
        ``word__frequency_analysis`` on the lemmatized tokens.
    """
    df = pd.read_json(filename)

    print_tokenization_stats(df)
    repeating_words_and_lemmas(df)

    # Frequency analysis runs on lemmas; the coverage report reuses its result.
    distributions = word__frequency_analysis(df, lemmatize=True)
    word_coverage_analysis(df, *distributions)

    return distributions

### Reading data from tokenized files

In [16]:
%%time
# Dataset names; each maps to data/stats_files/tokenized_lemmatized_<name>.json.xz
files = ["jokes", "news", "google_qa"]
# Results per dataset name, reused by the comparison cells further down:
#   freq_dists        - distribution counting every token occurrence
#   unique_freq_dists - distribution counting each token once per text instance
freq_dists = {}
unique_freq_dists = {}

for f in files:
    # Print the dataset name as a header for the statistics that follow.
    print(f)
    freq_dist, unique_freq_dist = analyze_file(f'data/stats_files/tokenized_lemmatized_{f}.json.xz')
    freq_dists[f] = freq_dist
    unique_freq_dists[f] = unique_freq_dist
    print("\n--------------------------------------------------\n")

jokes
Text instances: 772,978

Words total: 43,063,137
Words mean: 55.71
Words std: 117.59

Sentences total: 2,988,934
Sentences mean: 3.87
Sentences std: 7.75

Mean proportion of unique words in text instance: 0.8591.
Mean proportion of unique lemmas in text instance: 0.8380.

General frequencies:
FreqDist({'the': 2032115, '.': 1861586, ',': 1710063, 'a': 1368527, 'and': 945609, 'to': 862682, 'i': 817273, "''": 654350, '?': 597624, '``': 596776, 'you': 591393, 'he': 507199, 'of': 441433, 'it': 432912, 'in': 430020, 'is': 335333, '!': 324761, 'his': 305537, 'that': 304498, 'what': 298114, ...})

Unique frequencies(each token counted only once per text instance):
FreqDist({'.': 522758, 'a': 473859, 'the': 443916, '?': 412893, 'i': 325779, 'to': 319681, ',': 316927, 'and': 287260, 'you': 284807, 'what': 239446, 'it': 237791, 'in': 228834, 'of': 221014, 'is': 190367, 'do': 187011, 'my': 177444, "'s": 177319, 'he': 174033, 'that': 165475, "''": 164576, ...})

Top 10 most frequent words mak

## Differences in most frequent words
Unique nltk.word_tokenize() tokens among the {nr} most common tokens in each dataset.

In [17]:
# most_common_count - Number of most common words taken from each dataset
def token_differences(freq_dists, unique_freq_dists, most_common_count = 100):
    """Print the frequent tokens that are exclusive to each dataset.

    For every dataset the ``most_common_count`` most common tokens of its
    per-instance distribution are collected; a token is reported for a
    dataset when it is absent from the corresponding sets of the other two.

    Args:
        freq_dists: Not used by this function.
        unique_freq_dists: Mapping with the keys "jokes", "news" and
            "google_qa" to objects providing ``most_common(n)``.
        most_common_count: How many top tokens to take from each dataset.
    """
    top_tokens = {
        name: {token for token, _count in dist.most_common(most_common_count)}
        for name, dist in unique_freq_dists.items()
    }
    jokes, news, qa = top_tokens["jokes"], top_tokens["news"], top_tokens["google_qa"]

    jokes_only = jokes - news - qa
    print(f'In joke but not in others: \n{jokes_only}')
    print()
    news_only = news - jokes - qa
    print(f'In news but not in others: \n{news_only}')
    print()
    qa_only = qa - jokes - news
    print(f'In qa but not in others: \n{qa_only}')

# Tunable: how many of the most common tokens of each dataset take part in the comparison.
most_common_count = 200
print(f'Tokens that appear among the {most_common_count} most frequent words in dataset, but not among the top {most_common_count} of other 2 datasets.\n')
token_differences(freq_dists, unique_freq_dists, most_common_count)

Tokens that appear among the 200 most frequent words in dataset, but not among the top 200 of other 2 datasets.

In joke but not in others: 
{'look', 'once', 'another', 'replied', 'am', 'asked', 'very', 'see', 'ever', 'again', 'every', 'go', 'man', 'friend', 'me', 'get', 'give', 'thing', 'guy', 'bar', 'put', 'why', 'few', 'wife', 'sex', 'think', 'know', 'my', 'hear', 'old', 'girl', "'re", 'came', 'head', 'need', 'sure', '*', 'thought', 'joke', 'let', 'find', 'tell', "'ve", 'going', '!', 'call', 'night', 'little', 'turn', 'should', '...', 'too', "'ll", 'went', "n't", 'today', 'walk', 'difference', 'really', 'oh', 'start', 'always', "'m", 'woman', 'long', 'ca', 'good', 'want', 'your', 'hand', 'reply', 'got', 'asks', 'never'}

In news but not in others: 
{'against', 'global', 'statement', 'week', 'according', 'billion', '%', 'public', 'economy', 'tuesday', '—', 'government', 'share', 'set', 'trump', 'expected', 'month', 'reporting', 'editing', 'source', 'news', 'friday', 'price', 'officia

### Now between jokes and non-jokes

In [19]:
# most_common_count - Number of most common words taken from the jokes dataset;
# each of the two non-joke datasets contributes half as many.
def token_differences(freq_dists, unique_freq_dists, most_common_count = 100):
    """Print frequent tokens exclusive to jokes and exclusive to non-jokes.

    NOTE: this redefines (shadows) the ``token_differences`` from the earlier
    cell; here "news" and "google_qa" are merged into one non-joke group.

    Args:
        freq_dists: Not used by this function.
        unique_freq_dists: Mapping with the keys "jokes", "news" and
            "google_qa" to objects providing ``most_common(n)``.
        most_common_count: Number of top tokens taken from jokes; news and
            google_qa each contribute ``int(most_common_count / 2)``.
    """
    def top_tokens(name, count):
        return set(token for token, _count in unique_freq_dists[name].most_common(count))

    non_joke_count = int(most_common_count / 2)
    jokes = top_tokens("jokes", most_common_count)
    google_qa = top_tokens("google_qa", non_joke_count)
    news = top_tokens("news", non_joke_count)

    print(f'In joke: \n{jokes - news - google_qa}')
    print()
    # Parentheses are required: "-" binds tighter than "|", so without them
    # only google_qa was filtered and news tokens shared with jokes leaked in.
    print(f'In non joke: \n{(news | google_qa) - jokes}')
    


# Tunable: number of top joke tokens; each non-joke dataset uses half of it.
most_common_count = 200
# The description states the actual cut-offs: jokes use the full count,
# news and google_qa half of it each.
print(f'Comparing the {most_common_count} most frequent joke tokens with the {int(most_common_count/2)} most frequent tokens of each non-joke dataset (news, google_qa).\n')
token_differences(freq_dists, unique_freq_dists, most_common_count)

Tokens that appear among the 200 most frequent words in dataset, but not among the top 200 of other 2 datasets.

In joke: 
{'look', 'again', 'every', 'right', 'me', 'give', 'guy', 'three', 'bar', 'put', 'few', 'sex', 'think', 'hear', 'old', 'just', 'girl', 'well', 'sure', 'she', 'find', 'tell', "'ve", 'going', 'should', '...', 'too', 'any', 'today', 'asks', 'another', 'friend', 't', 'thing', 'still', 'know', 'like', 'need', 'life', 'thought', 'little', 'turn', "'ll", 'now', 'second', 'walk', 'ca', 'good', 'want', 'hand', 'so', 'make', 'replied', 'very', 'see', 'go', 'man', 'wife', 'later', 'my', 'her', 'here', 'then', '*', 'much', 'joke', 'next', '!', 'because', 'call', 'night', 'went', "n't", 'difference', 'really', 'oh', 'start', 'way', 'around', 'woman', 'long', 'take', 'say', 'got', 'before', 'once', 'am', 'even', 'asked', 'ever', 'get', 'you', 'why', 'off', "''", 'down', "'re", 'came', 'head', 'let', 'back', 'home', 'him', 'them', 'always', "'m", 'your', 'reply', 'work', 'never'}


# Dataset specific statistics

# Jokes analysis
Requires the raw jokes file (`reddit_jokes.json.xz`), which has separate columns for title, body and joke.

In [10]:
# Can be used for printing some metrics for jokes
def jokes_stats(df):
    """Print duplicate/edit statistics for the raw jokes table.

    Args:
        df: DataFrame with string columns "title", "body" and "joke".

    Prints the row count, how many rows share their first 10 characters
    between title and body, how many mention "edit" (case-sensitive) in title
    or body, the number of distinct bodies/titles/jokes, and the mean joke
    length in characters. For an empty frame only the row count is printed.
    """
    df_len = len(df)
    print(f'Total jokes: {df_len:,}')
    if df_len == 0:
        # No rows: every percentage below would divide by zero.
        return

    def percent(count):
        return count / df_len * 100

    # na=False treats missing titles/bodies as "not edited" so the mask stays boolean.
    edited_mask = df['title'].str.contains("edit", na=False) | df['body'].str.contains("edit", na=False)
    edited = int(edited_mask.sum())
    # Heuristic: the body repeats the title when their first 10 characters match.
    body_contains_title = int((df["title"].str[:10] == df["body"].str[:10]).sum())
    # dropna=False keeps a missing value counted as one distinct value, like unique().
    unique_body = df["body"].nunique(dropna=False)
    unique_title = df["title"].nunique(dropna=False)
    unique_joke = df["joke"].nunique(dropna=False)

    print(f'Title in body: {body_contains_title:,} ({percent(body_contains_title):.2f}%)')
    print(f'Edited: {edited:,} ({percent(edited):.2f}%)')
    print(f'Unique body: {unique_body:,} ({percent(unique_body):.2f}%)')
    print(f'Unique title: {unique_title:,} ({percent(unique_title):.2f}%)')
    print(f'Unique joke(title+body): {unique_joke:,} ({percent(unique_joke):.2f}%)')

    print()
    print(f'Avg chars per entry: {df["joke"].str.len().mean()}')

# Can't find the correct file; it might have been deleted. In any case, this was mostly important when cleaning the jokes.
# jokes_stats(pd.read_json(""))

### News

In [11]:
def print_examples(file, column, max_examples=50):
    """Print the first entries of one column of a tokenized dataset file.

    Args:
        file: Dataset name; the data is read from
            data/stats_files/tokenized_lemmatized_<file>.json.xz.
        column: Name of the column to print. (Previously ignored in favour
            of a hard-coded "text".)
        max_examples: Maximum number of entries to print. The earlier manual
            counter let 52 entries through; this caps it at the stated limit.
    """
    df = pd.read_json(f'data/stats_files/tokenized_lemmatized_{file}.json.xz')
    for text in df[column].head(max_examples):
        print(text)
        print("\n----------------------------------------------------------\n")

#print_examples(file="news", column="text")

In [None]:
def temp():
    """Load the team jokes file and print its "text" column as a one-column frame."""
    team_jokes = pd.read_json("data/for_team/jokes.json.xz")
    text_only = team_jokes[["text"]]
    print(text_only)


temp()