In [1]:
import json
import pandas as pd
import re
from pathlib import Path

In [2]:
# Show up to 800 characters per cell so long joke bodies are not truncated
# when frames are displayed. The fully qualified option name is used because
# the shorthand 'max_colwidth' is resolved by pattern matching and can break
# if pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 800)

In [3]:
def standardize_characters(text):
    """
    Standardizes characters

    Replaces whitespace control characters and a range of typographic
    punctuation variants with plain ASCII equivalents.

    Parameters
    ----------
    text : str
        The text to normalise.

    Returns
    -------
    str
        A copy of ``text`` with the substitutions below applied in order.
        Whitespace is not collapsed, so e.g. ``"\\r\\n"`` becomes two spaces.
    """
    # All patterns are raw strings: sequences such as "\." and "\s" are
    # invalid escapes in a normal string literal and trigger a warning.

    # newline, tab, carriage return and the literal HTML entity "&nbsp;"
    # each become a single space
    text = re.sub(r'[\n\t\r]|(&nbsp;)', ' ', text)
    # dash / hyphen / minus look-alikes -> ASCII hyphen
    text = re.sub(r'[˗‐‑‒–—―─ー−]', '-', text)
    # guillemets, curly and low double quotes, double prime -> ASCII double quote
    text = re.sub(r'[«»”“„″ˮ]', '"', text)
    # backtick, acute accent, curly single quotes, prime -> ASCII apostrophe
    text = re.sub(r'[`´ʻ‘’′]', "'", text)
    # cedilla and low single quote -> ASCII comma
    text = re.sub(r'[¸‚]', ',', text)
    # single-character ellipsis -> three dots
    text = re.sub(r'…', '...', text)
    # four or more dots, optionally split by one whitespace character
    # ("...." or "... .") -> three dots followed by a space
    text = re.sub(r'\.{3,}\s?\.{1,}', '... ', text)
    return text

In [4]:
# Load the stupidstuff jokes into a DataFrame; orient='records' expects the
# file to be a JSON list of objects, one per row. The path is relative to the
# notebook's working directory.
stupid = pd.read_json('../data/stupidstuff.json', orient='records')

In [5]:
# Preview the first three rows of the stupidstuff frame
stupid.head(3)

Unnamed: 0,body,category,id,rating
0,"A blackjack dealer and a player with a thirteen count in his hand\nwere arguing about whether or not it was appropriate to tip the\ndealer.\n\nThe player said, ""When I get bad cards, it's not the dealer's fault.\nAccordingly, when I get good cards, the dealer obviously had nothing\nto do with it so, why should I tip him?""\n\nThe dealer said, ""When you eat out do you tip the waiter?""\n\n""Yes.""\n\n""Well then, he serves you food, I'm serving you cards, so you should\ntip me.""\n\n""Okay, but, the waiter gives me what I ask for. I'll take an eight.""",Children,1,2.63
1,"At a dinner party, several of the guests were arguing whether men or women were more trustworthy. 'No woman,' said one man, scornfully, 'can keep a secret.' 'I don't know about that,' answered a blonde woman guest. 'I have kept my age a secret since I was twenty-one.' 'You'll let it out some day,' the man insisted. 'I hardly think so!' responded the blonde lady. 'When a woman has kept a secret for twenty-seven years, she can keep it forever.'",Blonde Jokes,2,2.57
2,One day this cop pulls over a blonde for speeding. The cop gets out of his car and asks the blonde for her license.''You cops should get it together. One day you take away my license and the next day you ask me to show it.'',Blonde Jokes,3,3.09


In [6]:
# Rows whose 'body' repeats an earlier row's body (the first occurrence is
# not flagged, per the default keep='first' of DataFrame.duplicated)
dup = stupid[stupid.duplicated(subset='body')]

In [7]:
# (rows, columns) of the flagged duplicates
dup.shape

(650, 4)

In [8]:
# Keep only the first occurrence of each joke body. drop_duplicates is used
# instead of dropping `dup.index` so this cell does not depend on the scratch
# variable `dup`, which is rebound later in the notebook for another frame.
stupid = stupid.drop_duplicates(subset='body')

In [9]:
# List rows whose joke body is the empty string
stupid[stupid['body'] == '']

Unnamed: 0,body,category,id,rating
1728,,Science,1729,4.5


In [10]:
# Remove jokes whose body is the empty string. Filtering on the condition
# (rather than dropping a hard-coded index label) keeps the cell re-runnable
# and independent of the row order in the input file.
stupid = stupid[stupid['body'] != '']

In [11]:
# (rows, columns) remaining after the removals above
stupid.shape

(3122, 4)

In [12]:
%%time
# Add a 'text' column holding the character-normalised copy of each body;
# the original 'body' column is left as loaded.
stupid['text'] = stupid['body'].apply(standardize_characters)

Wall time: 195 ms


In [13]:
# Spot-check two jokes with rating below 2, comparing 'body' with 'text'.
# sample() is called without random_state, so the rows shown differ per run.
stupid.query('rating < 2').sample(2)

Unnamed: 0,body,category,id,rating,text
619,"When I went to lunch today, I noticed an old lady sitting on a park bench sobbing her eyes out. I stopped and asked her what was wrong. She said, ""I have a 22 year old husband at home. He makes love to me every morning and then gets up and makes me pancakes, sausage, fresh fruit and freshly ground coffee."" I said, ""Well, then why are you crying?"" She said, ""He makes me homemade soup for lunch and my favorite brownies and then makes love to me for half the afternoon"". I said, ""Well, why are you crying?"" She said, ""For dinner he makes me a gourmet meal with wine and my favorite dessert and then makes love to me until 2:00 a.m. I said, ""Well, why in the world would you be crying?"" She said, ""I can't remember where I live!"".",Women,620,1.33,"When I went to lunch today, I noticed an old lady sitting on a park bench sobbing her eyes out. I stopped and asked her what was wrong. She said, ""I have a 22 year old husband at home. He makes love to me every morning and then gets up and makes me pancakes, sausage, fresh fruit and freshly ground coffee."" I said, ""Well, then why are you crying?"" She said, ""He makes me homemade soup for lunch and my favorite brownies and then makes love to me for half the afternoon"". I said, ""Well, why are you crying?"" She said, ""For dinner he makes me a gourmet meal with wine and my favorite dessert and then makes love to me until 2:00 a.m. I said, ""Well, why in the world would you be crying?"" She said, ""I can't remember where I live!""."
2024,"A man walks into a store and he saw a thermos. The clerk walks up to him and asks, ""May I help you with anything?"" ""Yea! What is that?"" ""Why that's a thermos!""""What's it do?""""It keeps things hot and it keeps things cold!""""I'll take it"" The next day the man goes to work carrying this thermos. his co-workers ask him ""What's that!""""It's a thermos""""What's it do?""""It keeps things hot and it keeps things cold!""""So whatcha got in it?""""Two ice creams and a cup of coffee.""",Miscellaneous,2025,1.5,"A man walks into a store and he saw a thermos. The clerk walks up to him and asks, ""May I help you with anything?"" ""Yea! What is that?"" ""Why that's a thermos!""""What's it do?""""It keeps things hot and it keeps things cold!""""I'll take it"" The next day the man goes to work carrying this thermos. his co-workers ask him ""What's that!""""It's a thermos""""What's it do?""""It keeps things hot and it keeps things cold!""""So whatcha got in it?""""Two ice creams and a cup of coffee."""


In [14]:
# Rename the 'rating' column to 'score'
stupid = stupid.rename(columns={'rating': 'score'})

In [15]:
# Number of jokes per category, most frequent first
stupid.category.value_counts()

Miscellaneous      701
Men                190
Insults            186
Women              143
Yo Mama            141
Light Bulbs        119
Religious          115
Political          112
Blonde Jokes       111
Heaven and Hell     85
Family, Parents     80
Medical             77
Money               77
Animals             74
Bar Jokes           66
Children            65
Computers           64
Police Jokes        55
Sex                 54
Lawyers             50
Love & Romance      48
Military            47
Crazy Jokes         43
Business            41
Marriage            37
Sports              37
Aviation            35
Holidays            32
Idiots              31
Farmers             27
Redneck             27
School              26
Old Age             22
Office Jokes        18
Science             17
Deep Thoughts       14
Food Jokes          13
Blind Jokes         11
State Jokes          9
Ethnic Jokes         8
Music                7
One Liners           6
English              1
Name: category, dtype: int64

In [16]:
# Tag every row with a constant 'source' label naming this dataset
stupid['source'] = 'stupidstuff'

In [18]:
# Load the wocka jokes into a DataFrame; orient='records' expects the file to
# be a JSON list of objects, one per row. The path is relative to the
# notebook's working directory.
wocka = pd.read_json('../data/wocka.json', orient='records')

In [19]:
# Preview the first rows of the wocka frame (head() defaults to five)
wocka.head()

Unnamed: 0,body,category,id,title
0,What do you call a cow with no legs?\r\n\r\nGround Beef!,Animal,1,Cow With No Legs
1,What do you call a cow jumping over a barbed wire fence?\r\n\r\nUtter destruction.,Animal,2,Jumping Cow
2,What's black and white and red all over?\r\n\r\nA newspaper.,Other / Misc,4,"Black, White and Red"
3,"So, this guy walks into a bar.\r\n\r\nAnd says, ""ouch"".",Bar,5,Guy in a Bar
4,"If the opposite of pro is con, isn't the opposite of progress, congress?",One Liners,6,Progress


In [20]:
# Rows whose 'body' repeats an earlier row's body. This rebinds `dup`, which
# previously held the stupidstuff duplicates.
dup = wocka[wocka.duplicated(subset='body')]

In [21]:
# (rows, columns) of the flagged duplicates
dup.shape

(8, 4)

In [22]:
# Keep only the first occurrence of each joke body. drop_duplicates is used
# instead of dropping `dup.index` so this cell does not depend on which frame
# the shared scratch variable `dup` was last computed from.
wocka = wocka.drop_duplicates(subset='body')

In [23]:
%%time
# Add a 'text' column holding the character-normalised copy of each body;
# the original 'body' column is left as loaded.
wocka['text'] = wocka['body'].apply(standardize_characters)

Wall time: 700 ms


In [24]:
# (rows, columns) after de-duplication and adding 'text'
wocka.shape

(10011, 5)

In [25]:
# Spot-check two random jokes, comparing 'body' with 'text'.
# sample() is called without random_state, so the rows shown differ per run.
wocka.sample(2)

Unnamed: 0,body,category,id,title,text
1228,"A Mexican, an Asian, an African, and an American all somehow come across a genie at the same time. They rub the lamp and the genie pops out and agrees to grant them all one wish each. The genie turns to the Mexican and asks what he wished for.\r\n\r\n""I wish that myself and all of my people could return to Mexico and live without poverty and wars."" \r\n\r\nWith that, the Mexican disappeared. The genie then turns to the Asian and asks what he wished for.\r\n\r\n""I wish that myself and all my people could go return to our countries in Asia and live in peace and happiness.""\r\n\r\nAnd the Asian disappears. The genie next asked the African what he would like.\r\n\r\n""I wish that myself and all of my African Brothers and Sisters could return to our homelands in Africa and live in wealth...",Other / Misc,1644,Thirsty?,"A Mexican, an Asian, an African, and an American all somehow come across a genie at the same time. They rub the lamp and the genie pops out and agrees to grant them all one wish each. The genie turns to the Mexican and asks what he wished for. ""I wish that myself and all of my people could return to Mexico and live without poverty and wars."" With that, the Mexican disappeared. The genie then turns to the Asian and asks what he wished for. ""I wish that myself and all my people could go return to our countries in Asia and live in peace and happiness."" And the Asian disappears. The genie next asked the African what he would like. ""I wish that myself and all of my African Brothers and Sisters could return to our homelands in Africa and live in wealth and be bothered no ..."
2385,"Somewhere in the deep South, Bubba called an attorney and asked, ""Is it true they're suing the cigarette companies for causing people to get cancer?""\r\n\r\n""Yes, Bubba, that is true.""\r\n\r\n""And people are suing the fast food restaurants for making them fat and clogging their arteries with all them burgers and fries ... is that true, mister lawyer?""\r\n\r\n""Sure is Bubba, but why do you ask?""\r\n\r\n""Cause I was thinkin' .... maybe I can sue Budweiser for all them ugly women I've been wakin' up with!""",Redneck,3813,Somewhere in the Deep South...,"Somewhere in the deep South, Bubba called an attorney and asked, ""Is it true they're suing the cigarette companies for causing people to get cancer?"" ""Yes, Bubba, that is true."" ""And people are suing the fast food restaurants for making them fat and clogging their arteries with all them burgers and fries ... is that true, mister lawyer?"" ""Sure is Bubba, but why do you ask?"" ""Cause I was thinkin' ... maybe I can sue Budweiser for all them ugly women I've been wakin' up with!"""


In [26]:
# Number of jokes per category, most frequent first
wocka.category.value_counts()

Other / Misc       2302
Men / Women         924
One Liners          917
Animal              656
Children            605
Yo Momma            600
Blond               598
Puns                457
Religious           401
At Work             288
News / Politics     279
Insults             276
Gross               253
Redneck             240
Medical             207
Knock-Knock         167
Lawyer              157
Bar                 154
Tech                151
Sports              134
College             131
Lightbulb           110
Yo Mama               3
Blonde                1
Name: category, dtype: int64

In [27]:
# Tag every row with a constant 'source' label naming this dataset
wocka['source'] = 'wocka'

In [28]:
# keep only the columns needed downstream, in their final order
# (all other columns are discarded; wocka has no 'score' column)
stupid = stupid.loc[:, ['text', 'category', 'source', 'score']]
wocka = wocka.loc[:, ['text', 'category', 'source']]

In [29]:
# combine df-s
# ignore_index=True gives the result a fresh 0..n-1 index; both inputs carry
# their own overlapping integer labels, so a plain concat would leave
# duplicate index labels in `combined`. Rows from wocka get NaN in 'score'
# because that frame has no such column.
combined = pd.concat([stupid, wocka], ignore_index=True)

In [30]:
# (rows, columns) of the combined frame
combined.shape

(13133, 4)

In [31]:
# Preview the first three rows of the combined frame
combined.head(3)

Unnamed: 0,text,category,source,score
0,"A blackjack dealer and a player with a thirteen count in his hand were arguing about whether or not it was appropriate to tip the dealer. The player said, ""When I get bad cards, it's not the dealer's fault. Accordingly, when I get good cards, the dealer obviously had nothing to do with it so, why should I tip him?"" The dealer said, ""When you eat out do you tip the waiter?"" ""Yes."" ""Well then, he serves you food, I'm serving you cards, so you should tip me."" ""Okay, but, the waiter gives me what I ask for. I'll take an eight.""",Children,stupidstuff,2.63
1,"At a dinner party, several of the guests were arguing whether men or women were more trustworthy. 'No woman,' said one man, scornfully, 'can keep a secret.' 'I don't know about that,' answered a blonde woman guest. 'I have kept my age a secret since I was twenty-one.' 'You'll let it out some day,' the man insisted. 'I hardly think so!' responded the blonde lady. 'When a woman has kept a secret for twenty-seven years, she can keep it forever.'",Blonde Jokes,stupidstuff,2.57
2,One day this cop pulls over a blonde for speeding. The cop gets out of his car and asks the blonde for her license.''You cops should get it together. One day you take away my license and the next day you ask me to show it.'',Blonde Jokes,stupidstuff,3.09


In [32]:
# Write the combined frame to CSV as UTF-8; index=False leaves the row index
# out of the file. The path is relative to the notebook's working directory.
combined.to_csv('../data/jokes_stupid_wocka.csv', index=False, encoding='utf-8')

In [35]:
# Distinct category labels in order of first appearance.
# NOTE(review): the two sources label overlapping categories differently
# (e.g. 'Yo Mama' / 'Yo Momma', 'Blonde Jokes' / 'Blond' / 'Blonde',
# 'Lawyers' / 'Lawyer'); no harmonisation is applied in the cells shown here.
combined['category'].unique().tolist()

['Children',
 'Blonde Jokes',
 'Military',
 'Office Jokes',
 'Aviation',
 'Political',
 'Deep Thoughts',
 'Men',
 'Crazy Jokes',
 'Medical',
 'Food Jokes',
 'Bar Jokes',
 'Science',
 'Police Jokes',
 'Miscellaneous',
 'Sex',
 'Idiots',
 'Business',
 'Women',
 'Redneck',
 'One Liners',
 'Money',
 'School',
 'Family, Parents',
 'Sports',
 'Heaven and Hell',
 'Religious',
 'Farmers',
 'Love & Romance',
 'Blind Jokes',
 'Marriage',
 'Old Age',
 'Animals',
 'Holidays',
 'Ethnic Jokes',
 'State Jokes',
 'English',
 'Computers',
 'Lawyers',
 'Yo Mama',
 'Insults',
 'Light Bulbs',
 'Music',
 'Animal',
 'Other / Misc',
 'Bar',
 'Puns',
 'Lawyer',
 'News / Politics',
 'Men / Women',
 'Gross',
 'Blond',
 'Yo Momma',
 'At Work',
 'College',
 'Lightbulb',
 'Knock-Knock',
 'Tech',
 'Blonde']