# Exploring the statistics of jokes and none-jokes

In [1]:
# Data storage
import json
import pandas as pd
import numpy as np

# Language processing
import nltk
from nltk import FreqDist

#Other
import random
import gc

In [2]:
# General stats about the count of words and sentences
def print_tokenization_stats(df):
    print(f'Text instances: {df.shape[0]:,}\n')
    
    word_lengths_series = df["word_tokenize"].apply(len)
    print(f'Words total: {word_lengths_series.sum():,}')
    print(f'Words mean: {word_lengths_series.mean():.2f}')
    print(f'Words std: {word_lengths_series.std():.2f}')
    print()
      
    sentence_lengths_series = df["sent_tokenize"].apply(len)
    print(f'Sentences total: {sentence_lengths_series.sum():,}')
    print(f'Sentences mean: {sentence_lengths_series.mean():.2f}')
    print(f'Sentences std: {sentence_lengths_series.std():.2f}')
    print()
    

# Creates and returns FreqDist's for use in later cells
def word_analysis(df):
    freq_dist = FreqDist()
    unique_freq_dist = FreqDist()
    
    for tokens in df["word_tokenize"]:

        freq_dist.update(tokens)
        unique_freq_dist.update(set(tokens))
        
    print("General frequencies:")
    freq_dist.pprint(20)
    print("\nUnique frequencies(each token counted only once per text instance):")
    unique_freq_dist.pprint(20)
    
    return freq_dist, unique_freq_dist


# Combines the functions above
def analyze_file(filename):
    df = pd.read_json(filename)
    
    print_tokenization_stats(df)
    freq_dist, unique_freq_dist = word_analysis(df)
    
    return freq_dist, unique_freq_dist

### Reading data from tokenized files

In [3]:
files = ["jokes", "news", "google_qa"]
freq_dists = []
unique_freq_dists = []

for f in files:
    print(f)
    freq_dist, unique_freq_dist = analyze_file(f'data/stats_files/tokenized_{f}.json.xz')
    freq_dists.append(freq_dist)
    unique_freq_dists.append(unique_freq_dist)
    print("\n--------------------------------------------------\n")

jokes
Text instances: 962,887

Words total: 50,929,587
Words mean: 52.89
Words std: 114.07

Sentences total: 3,584,434
Sentences mean: 3.72
Sentences std: 7.52

General frequencies:
FreqDist({'.': 2188302, ',': 1965801, 'the': 1917239, 'a': 1289612, 'and': 1050540, 'to': 994668, 'I': 938192, "''": 753988, '?': 736444, '``': 686423, 'you': 607805, 'of': 516864, 'in': 483904, 'The': 442109, 'he': 400837, 'it': 398768, '!': 393848, 'is': 387391, "'s": 354145, 'was': 342901, ...})

Unique frequencies(each token counted only once per text instance):
FreqDist({'.': 641614, 'a': 538387, '?': 518458, 'the': 510573, 'I': 382737, ',': 382527, 'to': 381310, 'and': 336873, 'you': 320106, 'in': 267152, 'of': 264362, 'it': 229037, 'What': 226421, 'is': 225860, "'s": 219446, 'do': 212499, "''": 196765, '``': 191940, 'The': 183609, 'was': 181501, ...})

--------------------------------------------------

news
Text instances: 300,000

Words total: 42,018,000
Words mean: 140.06
Words std: 92.44

Sentenc

## Differences in most frequent words
Unique nltk.word_tokenize() tokens among the {nr} most common tokens in each dataset.

In [4]:
# most_common_count - Number of most common words taken from each dataset
def token_differences(freq_dists, unique_freq_dists, most_common_count = 100):
    most_common_uniques = []
    
    for dist in unique_freq_dists:
        uniques = set(x[0] for x in dist.most_common(most_common_count))
        most_common_uniques.append(uniques)
        
        
    print(f'In joke but not in others: \n{most_common_uniques[0] - most_common_uniques[1] - most_common_uniques[2]}')
    print()
    print(f'In news but not in others: \n{most_common_uniques[1] - most_common_uniques[0] - most_common_uniques[2]}')
    print()
    print(f'In qa but not in others: \n{most_common_uniques[2] - most_common_uniques[0] - most_common_uniques[1]}')


# This number can be changed
most_common_count = 500
print(f'Tokens that appear among the {most_common_count} most frequent words in dataset, but not among the top {most_common_count} of other 2 datasets.\n')
token_differences(freq_dists, unique_freq_dists, most_common_count)

Tokens that appear among the 500 most frequent words in dataset, but not among the top 500 of other 2 datasets.

In joke but not in others: 
{'every', 'fuck', 'Two', 'thinks', 'favorite', 'Do', 'jokes', 'women', 'looked', 'thought', 'says', 'talk', 'sitting', 'turned', 'shit', 'na', 'pretty', 'room', 'anything', 'Then', '[', 'believe', 'enough', 'everything', 'takes', 'bar', 'lot', 'married', 'Yes', 'beer', 'hear', 'gon', '....', 'decided', 'legs', 'tell', 'walk', 'moment', 'something', 'heard', 'friends', 'beautiful', 'done', "'ll", 'behind', 'wrong', 'mom', 'ass', 'Now', 'light', 'died', '..', 'please', 'Just', 'tells', 'never', 'starts', 'hell', 'trying', 'buy', 'car', 'nothing', 'God', 'An', "'m", 'went', 'guy', 'driving', 'ever', 'saw', 'One', 'am', 'dad', 'everyone', 'keep', 'gets', 'turn', 'walks', 'happened', 'difference', 'bit', 'guess', 'asks', 'eyes', "'ve", 'problem', 'morning', 'fucking', 'minutes', "'re", 'face', 'lady', 'woman', 'So', 'cross', 'sorry', 'funny', 'amp', 's

# Dataset specific statistics

### News

In [5]:
# Prints first 20 news articles
def news_lookat():
    news = pd.read_json(f'data/stats_files/tokenized_news.json.xz')
    i = 0
    for text in news.text:
        print(text)
        print("\n----------------------------------------------------------\n")
        if i > 20:
            break
        i+=1

#news_lookat()