In [7]:
import pandas as pd
import nltk
nltk.download('cmudict')
from nltk.corpus import cmudict
from profanity_check import predict, predict_prob

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/olafwisselink/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [8]:
def num_syllables_word(word):
    """Return the number of syllables of ``word`` using the CMU dictionary.

    The word is lower-cased and looked up in the module-level ``cmu_dict``.
    Only the first listed pronunciation is used; its syllable count is the
    number of phonemes whose last character is a digit.
    Approach from:
    https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word

    Parameters
    ----------
    word : str
        A single word. Non-string values (e.g. NaN) are treated as unknown.

    Returns
    -------
    int
        The syllable count, or ``-1`` when no count could be determined.
        In that case ``word`` is also appended to the module-level list
        ``words_not_in_cmu`` so failures can be inspected afterwards.
    """
    try:
        first_pronunciation = cmu_dict[word.lower()][0]
        return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
    except (KeyError, AttributeError, IndexError, TypeError):
        # Only lookup-style failures count as "unknown word". Anything else
        # (KeyboardInterrupt, a missing cmu_dict, ...) is allowed to surface
        # instead of silently rejecting every haiku.
        words_not_in_cmu.append(word)
        return -1

def has_correct_syllables(haiku):
    """Check whether ``haiku`` follows the 5-7-5 syllable structure.

    Parameters
    ----------
    haiku : iterable of str
        The lines of the haiku, in order.

    Returns
    -------
    bool
        ``True`` only when there are exactly three lines and their syllable
        totals are 5, 7 and 5. ``False`` when the line count differs, a line
        is not a string (e.g. NaN), a total does not match, or any word has
        no syllable count (``num_syllables_word`` returned ``-1``).
    """
    expected_counts = (5, 7, 5)
    lines = list(haiku)
    # An empty haiku, or one with extra lines, is not a valid 5-7-5 haiku.
    if len(lines) != len(expected_counts):
        return False

    for sentence, expected in zip(lines, expected_counts):
        try:
            words = sentence.split()
        except AttributeError:
            # Missing values (NaN/None) cannot be split into words.
            return False

        total = 0
        for word in words:
            # Look each word up once; -1 marks a word without a known count.
            count = num_syllables_word(word)
            if count == -1:
                return False
            total += count

        if total != expected:
            return False
    return True

##### Read and combine datasets into dataframe

In [9]:
# Location and file names of the raw haiku datasets.
filepath = '../data/'
kaggle_jhalini = 'kaggle_jhalini_haiku.csv'
kaggle_bfbarry = 'kaggle_bfbarry_haiku.txt'

# Read the jhalini Kaggle dataset and drop its first (unnamed) and last (hash) column.
df1 = pd.read_csv(filepath + kaggle_jhalini)
df1 = df1.drop(columns=df1.columns[[0, -1]])

# Read the bfbarry Kaggle dataset: '/'-delimited text, parsed into columns '0', '1', '2'.
df2 = pd.read_csv(filepath + kaggle_bfbarry, delimiter='/', names=['0', '1', '2'])
# Strip leading/trailing spaces, a trailing apostrophe and literal '$' characters.
# Raw string: '\$' inside a normal string literal is an invalid escape sequence.
df2 = df2.replace(r"^ +| +$|'$|\$", '', regex=True)
df2['source'] = 'kaggle_bfbarry'
# Save the cleaned bfbarry data as a CSV file next to the original text file.
filename = kaggle_bfbarry.split('.')[0] + '.csv'
df2.to_csv(filepath + filename, index=False)

# Concatenate both datasets and remove duplicate rows.
df = pd.concat([df1, df2])
df = df.drop_duplicates().reset_index(drop=True)

##### Preprocess data
- remove miscellaneous punctuation characters
- convert to lowercase
- keep only haikus with a 5-7-5 syllable structure
- remove haikus containing profanity

In [10]:
# Pronunciation dictionary and failure log used by num_syllables_word().
cmu_dict = cmudict.dict()
words_not_in_cmu = []

HAIKU_COLS = ['0', '1', '2']   # columns holding the three haiku lines
PROFANITY_THRESHOLD = 0.75     # a line scoring above this marks the haiku as profane

# Only use n rows for speeding up testing (all = df.shape[0])
nrows = df.shape[0]
df = df.drop(index=df.index[nrows:])

# Remove misc characters (needs work, see words_not_in_cmu below).
# Raw string: '\.' inside a normal string literal is an invalid escape sequence.
df = df.replace(r"""-{2,}|—|-$|- |~|"|\.|;|^ +| +$|'$""", '', regex=True)

# Make haikus lowercase
for col in HAIKU_COLS:
    df[col] = df[col].str.lower()

# Remove haikus that do not follow 5-7-5 syllable structure
all_haikus = df[HAIKU_COLS].to_numpy()
df = df[[has_correct_syllables(haiku) for haiku in all_haikus]]

# Remove haikus containing profane language: a haiku is flagged when any of
# its lines gets a predict_prob score above PROFANITY_THRESHOLD.
all_haikus = df[HAIKU_COLS].to_numpy()
profane_ids = [any(predict_prob(h) > PROFANITY_THRESHOLD) for h in all_haikus]
profane_haikus = df[profane_ids]
df = df[[not profane for profane in profane_ids]]
# Keep the rejected haikus on disk so the filter can be inspected.
profane_haikus.to_csv(filepath + 'profane_haikus.csv')

df = df.reset_index(drop=True)
df.to_csv(filepath + 'filtered_haikus.csv')

# Guard the percentage against an empty input frame (nrows == 0).
kept_pct = round(df.shape[0] / nrows * 100, 2) if nrows else 0.0
print('Total haikus: {}'.format(nrows))
print('Filtered Haikus: {} ({}%)\n'.format(df.shape[0], kept_pct))

Total haikus: 155354
Filtered Haikus: 98914 (63.67%)



In [11]:
# Show the words whose syllable lookup failed during filtering. A word is
# appended every time its lookup fails, so the list can contain repeats.
words_not_in_cmu

['tarn',
 'rhododendrons',
 'wisteria',
 '–',
 'father’s',
 'summer’s',
 'unknotting',
 "gerfalcon's",
 'no-name',
 'rouging',
 'shadows…',
 'women’s',
 'rereads',
 'starless',
 'mum’s',
 'cityscape',
 'windless',
 'winterscape',
 '20',
 'pan-fried',
 'watchtower',
 "convict's",
 'pissing',
 'sunbathes',
 'mother’s',
 'moonless',
 'fogged',
 'windless',
 'all-day',
 'alep',
 'greyness',
 'smartphone',
 'cherrytree',
 'ex-girlfriend',
 'haïku',
 '5',
 'rear-view',
 'blue,',
 'fly,',
 '72nd',
 'month-old',
 '63',
 '60',
 '70',
 'moonlessness',
 'loving-making',
 'carbuncles',
 'skys',
 'cheeries',
 'lyrid',
 'stock-still',
 'neighbor’s',
 'roshi’s',
 'typos',
 'typos',
 'dog’s',
 'neighbor’s',
 "seniors'",
 "dragonfly's",
 'pikelets',
 'croton',
 'rouses',
 "tide's",
 'chamomile',
 '!',
 "hueso's",
 'old,',
 "childhood's",
 'slight,',
 'uneaten',
 'yellowed',
 'bugler',
 '=',
 'crisium',
 'rivulets',
 "gi's",
 'swale',
 'gleaners',
 'glassy-eyed,',
 'miso!',
 'saved,',
 'feathers,',
 '–'