In [73]:
import json
import pandas as pd
import re
from pathlib import Path
import io
import requests

In [74]:
# Show up to 800 characters per cell so long joke bodies are readable in previews.
# The fully qualified option name is used; abbreviated names only work while they
# match exactly one registered option.
pd.set_option('display.max_colwidth', 800)

In [75]:
def standardize_characters(text):
    """
    Replace typographic character variants in ``text`` with plain ASCII equivalents.

    The substitutions are applied in this order:

    1. newline, tab, carriage return and the literal ``&nbsp;`` -> one space each
    2. dash-like characters -> ``-``
    3. double-quote variants -> ``"``
    4. apostrophe / single-quote variants -> ``'``
    5. cedilla and low single quote -> ``,``
    6. the single-character ellipsis -> ``...``
    7. three or more dots followed (optionally after one whitespace character)
       by at least one more dot -> ``... `` (with a trailing space)

    Runs of whitespace are not collapsed, so a CR+LF pair becomes two spaces.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = re.sub(r'[\n\t\r]|(&nbsp;)', ' ', text)
    text = re.sub('[˗‐‑‒–—―─ー−]', '-', text)
    text = re.sub('[«»”“„″ˮ]', '"', text)
    text = re.sub('[`´ʻ‘’′]', "'", text)
    text = re.sub('[¸‚]', ',', text)
    text = re.sub('…', '...', text)
    # Raw string: '\.' and '\s' are regex escapes, not valid Python string escapes.
    text = re.sub(r'\.{3,}\s?\.{1,}', '... ', text)
    return text

## Stupidstuff dataset
##### https://github.com/taivop/joke-dataset

In [76]:
# Download the stupidstuff jokes file and load it into a DataFrame.
url = "https://raw.githubusercontent.com/taivop/joke-dataset/master/stupidstuff.json"
response = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
response.raise_for_status()  # fail here on a 4xx/5xx instead of parsing an error page
url_content = response.content
# Hand read_json a file-like object; newer pandas deprecates passing the raw payload itself.
stupid = pd.read_json(io.BytesIO(url_content), orient='records')

In [77]:
# preview the first three loaded records
stupid.head(3)

Unnamed: 0,body,category,id,rating
0,"A blackjack dealer and a player with a thirteen count in his hand\nwere arguing about whether or not it was appropriate to tip the\ndealer.\n\nThe player said, ""When I get bad cards, it's not the dealer's fault.\nAccordingly, when I get good cards, the dealer obviously had nothing\nto do with it so, why should I tip him?""\n\nThe dealer said, ""When you eat out do you tip the waiter?""\n\n""Yes.""\n\n""Well then, he serves you food, I'm serving you cards, so you should\ntip me.""\n\n""Okay, but, the waiter gives me what I ask for. I'll take an eight.""",Children,1,2.63
1,"At a dinner party, several of the guests were arguing whether men or women were more trustworthy. 'No woman,' said one man, scornfully, 'can keep a secret.' 'I don't know about that,' answered a blonde woman guest. 'I have kept my age a secret since I was twenty-one.' 'You'll let it out some day,' the man insisted. 'I hardly think so!' responded the blonde lady. 'When a woman has kept a secret for twenty-seven years, she can keep it forever.'",Blonde Jokes,2,2.57
2,One day this cop pulls over a blonde for speeding. The cop gets out of his car and asks the blonde for her license.''You cops should get it together. One day you take away my license and the next day you ask me to show it.'',Blonde Jokes,3,3.09


In [78]:
# (rows, columns) of the frame as loaded
stupid.shape

(3773, 4)

In [79]:
# rows whose body repeats the body of an earlier row (first occurrences are not flagged)
is_repeat = stupid.duplicated(subset='body')
dup = stupid[is_repeat]

In [80]:
# how many repeated rows were found
dup.shape

(650, 4)

In [81]:
# remove the repeated rows, keeping the first occurrence of each body
stupid = stupid.drop(index=dup.index)

In [82]:
# list rows whose joke body is an empty string
stupid[stupid['body'].eq('')]

Unnamed: 0,body,category,id,rating
1728,,Science,1729,4.5


In [83]:
# Drop rows with an empty body by condition rather than by a hard-coded index label,
# so the cell can be re-run without a KeyError and does not depend on where the
# empty row happens to sit in the source file.
stupid = stupid[stupid['body'] != '']

In [108]:
# (rows, columns) remaining after removing repeated and empty bodies
stupid.shape

(3122, 4)

In [85]:
%%time
# add a 'text' column holding the character-standardized version of each body
stupid['text'] = stupid.body.apply(standardize_characters)

CPU times: user 166 ms, sys: 2.54 ms, total: 168 ms
Wall time: 167 ms


In [86]:
# Spot-check two low-rated jokes, comparing raw body against cleaned text.
# A fixed random_state keeps the displayed rows the same on every run.
stupid.query('rating < 2').sample(2, random_state=42)

Unnamed: 0,body,category,id,rating,text
2154,"Some American fishing enthusiasts decided to go an a fishing trip to a really remote lake in northen Canada. A bush pilot flew them in with all their supplies saying he would be back in a week to pick them up. While the fishing was good, after a few days, the men started to itch for female companionship. In fact Bill was so horny he decided to visit a primitive native indian camp down the lake and find a squaw.His friends advised against it as the indians were on their reservation and were protected by the Canadian government. He could risk being arrested by the RCMP, the Royal Canadian Mounted Police.Bill was determined however and set out that evening. Two hours later he came back in terrible shape, all bloody and beaten. ""What happened"" his freinds asked.""FBI got me"" Bill mumbled. ""...",Police Jokes,2155,1.0,"Some American fishing enthusiasts decided to go an a fishing trip to a really remote lake in northen Canada. A bush pilot flew them in with all their supplies saying he would be back in a week to pick them up. While the fishing was good, after a few days, the men started to itch for female companionship. In fact Bill was so horny he decided to visit a primitive native indian camp down the lake and find a squaw.His friends advised against it as the indians were on their reservation and were protected by the Canadian government. He could risk being arrested by the RCMP, the Royal Canadian Mounted Police.Bill was determined however and set out that evening. Two hours later he came back in terrible shape, all bloody and beaten. ""What happened"" his freinds asked.""FBI got me"" Bill mumbled. ""..."
1117,"One day long ago. There was a king who gathered all the people of the land. He said if anyone could swim across the lake (with crocodiles) would have their choice of : a castle, money, or his daughters hand in marriage.\nOne man jumped up and yelled ""I can do it!"" He tried and failed. Another man said ""I can do better than that."" So he jumped in and didn't make it either. Next the crowd heard a splash. A man swam all the way across the lake. The king asked him ""Which one would you like? A castle, money, or my daughters' hand in marriage."" The man replied ""I want the idiot that pushed me in!""",Money,1118,1.0,"One day long ago. There was a king who gathered all the people of the land. He said if anyone could swim across the lake (with crocodiles) would have their choice of : a castle, money, or his daughters hand in marriage. One man jumped up and yelled ""I can do it!"" He tried and failed. Another man said ""I can do better than that."" So he jumped in and didn't make it either. Next the crowd heard a splash. A man swam all the way across the lake. The king asked him ""Which one would you like? A castle, money, or my daughters' hand in marriage."" The man replied ""I want the idiot that pushed me in!"""


In [87]:
# expose the rating under the column name 'score'
stupid = stupid.rename(columns={'rating': 'score'})

In [88]:
# number of jokes per category, most frequent first
stupid.category.value_counts()

Miscellaneous      701
Men                190
Insults            186
Women              143
Yo Mama            141
Light Bulbs        119
Religious          115
Political          112
Blonde Jokes       111
Heaven and Hell     85
Family, Parents     80
Medical             77
Money               77
Animals             74
Bar Jokes           66
Children            65
Computers           64
Police Jokes        55
Sex                 54
Lawyers             50
Love & Romance      48
Military            47
Crazy Jokes         43
Business            41
Sports              37
Marriage            37
Aviation            35
Holidays            32
Idiots              31
Redneck             27
Farmers             27
School              26
Old Age             22
Office Jokes        18
Science             17
Deep Thoughts       14
Food Jokes          13
Blind Jokes         11
State Jokes          9
Ethnic Jokes         8
Music                7
One Liners           6
English              1
Name: categ

In [89]:
# label every row with the dataset it came from
stupid['source'] = 'stupidstuff'

## Wocka dataset
##### https://github.com/taivop/joke-dataset

In [90]:
# Download the wocka jokes file and load it into a DataFrame.
url = "https://raw.githubusercontent.com/taivop/joke-dataset/master/wocka.json"
response = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
response.raise_for_status()  # fail here on a 4xx/5xx instead of parsing an error page
url_content = response.content

# Hand read_json a file-like object; newer pandas deprecates passing the raw payload itself.
wocka = pd.read_json(io.BytesIO(url_content), orient='records')

In [91]:
# preview the first loaded records
wocka.head()

Unnamed: 0,body,category,id,title
0,What do you call a cow with no legs?\r\n\r\nGround Beef!,Animal,1,Cow With No Legs
1,What do you call a cow jumping over a barbed wire fence?\r\n\r\nUtter destruction.,Animal,2,Jumping Cow
2,What's black and white and red all over?\r\n\r\nA newspaper.,Other / Misc,4,"Black, White and Red"
3,"So, this guy walks into a bar.\r\n\r\nAnd says, ""ouch"".",Bar,5,Guy in a Bar
4,"If the opposite of pro is con, isn't the opposite of progress, congress?",One Liners,6,Progress


In [92]:
# (rows, columns) of the frame as loaded
wocka.shape

(10019, 4)

In [93]:
# rows whose body repeats the body of an earlier row (first occurrences are not flagged)
is_repeat = wocka.duplicated(subset='body')
dup = wocka[is_repeat]

In [94]:
# how many repeated rows were found
dup.shape

(8, 4)

In [95]:
# remove the repeated rows, keeping the first occurrence of each body
wocka = wocka.drop(index=dup.index)

In [96]:
%%time
# add a 'text' column holding the character-standardized version of each body
wocka['text'] = wocka.body.apply(standardize_characters)

CPU times: user 444 ms, sys: 2.71 ms, total: 447 ms
Wall time: 446 ms


In [97]:
# (rows, columns) after removing repeated bodies and adding the 'text' column
wocka.shape

(10011, 5)

In [98]:
# Spot-check two jokes, comparing raw body against cleaned text.
# A fixed random_state keeps the displayed rows the same on every run.
wocka.sample(2, random_state=42)

Unnamed: 0,body,category,id,title,text
6882,"Once upon a time, long, long ago there were two unique lions in the jungles of Africa. Both, it seems, had human-like qualities that allowed them to claim territory, daring the other to cross over the line. Strange as it seems, the boundary between their turf became a well traveled trail through the jungle. \r\n\r\nAll day, every day, both lions lay in the brush staring across the trail at their compatriot, daring him to cross into their territory. \r\n\r\nThe local natives knew of this animal feud, but all this was unbeknown to African Jack, a well-known and very publicized guide who did not speak Lionese and was unfamiliar with the territory. \r\n\r\nWhile he was leading a safari through the jungle, the travelers had to walk and cut vines with their machetes, and all this constant ha...",Puns,13071,Two Lions,"Once upon a time, long, long ago there were two unique lions in the jungles of Africa. Both, it seems, had human-like qualities that allowed them to claim territory, daring the other to cross over the line. Strange as it seems, the boundary between their turf became a well traveled trail through the jungle. All day, every day, both lions lay in the brush staring across the trail at their compatriot, daring him to cross into their territory. The local natives knew of this animal feud, but all this was unbeknown to African Jack, a well-known and very publicized guide who did not speak Lionese and was unfamiliar with the territory. While he was leading a safari through the jungle, the travelers had to walk and cut vines with their machetes, and all this constant hacking brush ..."
7980,"A woman calls her boss one morning and tells him that she is staying home because she's not feeling well.\r\n\r\n""What's the matter?"" he asks.\r\n\r\n""I have a case of anal glaucoma,"" she says in a weak voice.\r\n\r\n""What in the hell is anal glaucoma?"" he inquires.\r\n\r\n""Well, I just can't see my ass coming to work today.""",At Work,15038,Calling In Sick,"A woman calls her boss one morning and tells him that she is staying home because she's not feeling well. ""What's the matter?"" he asks. ""I have a case of anal glaucoma,"" she says in a weak voice. ""What in the hell is anal glaucoma?"" he inquires. ""Well, I just can't see my ass coming to work today."""


In [99]:
# number of jokes per category, most frequent first
wocka.category.value_counts()

Other / Misc       2302
Men / Women         924
One Liners          917
Animal              656
Children            605
Yo Momma            600
Blond               598
Puns                457
Religious           401
At Work             288
News / Politics     279
Insults             276
Gross               253
Redneck             240
Medical             207
Knock-Knock         167
Lawyer              157
Bar                 154
Tech                151
Sports              134
College             131
Lightbulb           110
Yo Mama               3
Blonde                1
Name: category, dtype: int64

In [100]:
# label every row with the dataset it came from
wocka['source'] = 'wocka'

In [101]:
# keep only the columns needed downstream, with the cleaned text first;
# every column not listed here is dropped from the frame
stupid_columns = ['text', 'category', 'source', 'score']
wocka_columns = ['text', 'category', 'source']
stupid = stupid.loc[:, stupid_columns]
wocka = wocka.loc[:, wocka_columns]

In [102]:
# Stack both datasets into one frame. wocka has no 'score' column, so its rows get NaN there.
# ignore_index=True renumbers the rows 0..n-1; otherwise the labels carried over from
# the two source frames would collide, making label-based lookups and drops ambiguous.
combined = pd.concat([stupid, wocka], ignore_index=True)

In [103]:
# (rows, columns) of the merged frame
combined.shape

(13133, 4)

In [104]:
# preview the first three merged records
combined.head(3)

Unnamed: 0,text,category,source,score
0,"A blackjack dealer and a player with a thirteen count in his hand were arguing about whether or not it was appropriate to tip the dealer. The player said, ""When I get bad cards, it's not the dealer's fault. Accordingly, when I get good cards, the dealer obviously had nothing to do with it so, why should I tip him?"" The dealer said, ""When you eat out do you tip the waiter?"" ""Yes."" ""Well then, he serves you food, I'm serving you cards, so you should tip me."" ""Okay, but, the waiter gives me what I ask for. I'll take an eight.""",Children,stupidstuff,2.63
1,"At a dinner party, several of the guests were arguing whether men or women were more trustworthy. 'No woman,' said one man, scornfully, 'can keep a secret.' 'I don't know about that,' answered a blonde woman guest. 'I have kept my age a secret since I was twenty-one.' 'You'll let it out some day,' the man insisted. 'I hardly think so!' responded the blonde lady. 'When a woman has kept a secret for twenty-seven years, she can keep it forever.'",Blonde Jokes,stupidstuff,2.57
2,One day this cop pulls over a blonde for speeding. The cop gets out of his car and asks the blonde for her license.''You cops should get it together. One day you take away my license and the next day you ask me to show it.'',Blonde Jokes,stupidstuff,3.09


In [106]:
# Write the merged jokes to disk without the index column.
# The target folder is created first so the export does not fail when 'data/' is missing.
output_path = Path('data/jokes_stupid_wocka.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(output_path, index=False, encoding='utf-8')

In [114]:
# distinct category labels across both sources, in order of first appearance
combined['category'].unique().tolist()

['Children',
 'Blonde Jokes',
 'Military',
 'Office Jokes',
 'Aviation',
 'Political',
 'Deep Thoughts',
 'Men',
 'Crazy Jokes',
 'Medical',
 'Food Jokes',
 'Bar Jokes',
 'Science',
 'Police Jokes',
 'Miscellaneous',
 'Sex',
 'Idiots',
 'Business',
 'Women',
 'Redneck',
 'One Liners',
 'Money',
 'School',
 'Family, Parents',
 'Sports',
 'Heaven and Hell',
 'Religious',
 'Farmers',
 'Love & Romance',
 'Blind Jokes',
 'Marriage',
 'Old Age',
 'Animals',
 'Holidays',
 'Ethnic Jokes',
 'State Jokes',
 'English',
 'Computers',
 'Lawyers',
 'Yo Mama',
 'Insults',
 'Light Bulbs',
 'Music',
 'Animal',
 'Other / Misc',
 'Bar',
 'Puns',
 'Lawyer',
 'News / Politics',
 'Men / Women',
 'Gross',
 'Blond',
 'Yo Momma',
 'At Work',
 'College',
 'Lightbulb',
 'Knock-Knock',
 'Tech',
 'Blonde']

In [115]:
# number of jokes per category in the merged frame, most frequent first
combined.category.value_counts()

Other / Misc       2302
Men / Women         924
One Liners          923
Miscellaneous       701
Children            670
Animal              656
Yo Momma            600
Blond               598
Religious           516
Insults             462
Puns                457
At Work             288
Medical             284
News / Politics     279
Redneck             267
Gross               253
Men                 190
Sports              171
Knock-Knock         167
Lawyer              157
Bar                 154
Tech                151
Yo Mama             144
Women               143
College             131
Light Bulbs         119
Political           112
Blonde Jokes        111
Lightbulb           110
Heaven and Hell      85
Family, Parents      80
Money                77
Animals              74
Bar Jokes            66
Computers            64
Police Jokes         55
Sex                  54
Lawyers              50
Love & Romance       48
Military             47
Crazy Jokes          43
Business        