In [13]:
import re

import pandas as pd
from IPython.display import display, HTML

# load all sentences that occurred 8 times or more
sentences_df = pd.read_csv('data/BookCorpus/sentences_counted_8+.csv')

# load "bad words" (header=None: the first line of the list is read as data, not as a column name)
bad_words = pd.read_csv('https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en',header=None)
bad_words.columns = ['word']

# compute bad word flag
# Drop missing/blank entries so they cannot turn into an always-matching alternative.
bad_words_list = [w for w in bad_words.word.dropna().astype(str).str.strip().tolist() if w]
if bad_words_list:
    # Match whole terms only: the lookarounds require that the term is not glued to
    # another word character, so it is found at the start, middle or end of a sentence.
    # re.escape keeps any regex metacharacters in the list from being interpreted.
    bad_word_pattern = r'(?<!\w)(?:' + '|'.join(re.escape(w) for w in bad_words_list) + r')(?!\w)'
    # na=False: a missing sentence is flagged False instead of NaN, keeping the column
    # usable as a boolean mask.
    sentences_df['contains_bad_word'] = sentences_df.sentence.str.contains(bad_word_pattern, regex=True, na=False)
else:
    # No terms to look for, so nothing can be flagged.
    sentences_df['contains_bad_word'] = False

# compute space count (used as a proxy for sentence length)
sentences_df['space_count'] = sentences_df.sentence.str.count(' ')
# keep only the columns used by the analysis below
sentences_df = sentences_df[['sentence','n','contains_bad_word','space_count']]

# Overview and samples

*Note: these are only sentences that occurred 8 or more times, not all sentences in the dataset.*

In [14]:
# first 10 rows (in the frame's existing order) among sentences with more than 7 spaces, i.e. at least 8
sentences_df.loc[sentences_df.space_count.gt(7)].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
494,`` what 's that supposed to mean ? '',1408,False,8
536,thank you for respecting the hard work of this...,1345,False,10
555,this ebook may not be re-sold or given away to...,1302,False,12
625,if you would like to share this book with anot...,1193,False,20
674,`` what are you going to do ? '',1113,False,8
1003,if youre reading this book and did not purchas...,788,False,31
1053,`` what the hell are you doing ? '',760,False,8
1056,`` i do n't know what you 're talking about . '',759,False,11
1121,`` why did n't you tell me ? '',721,False,8
1468,`` what are we going to do ? '',568,False,8


In [15]:
# random sample of common sentences with at least 8 spaces (space_count > 7),
# shown from most to least frequent; random_state is fixed so a re-run draws the same rows
sentences_df[(sentences_df.space_count>7)].sample(10, random_state=42).sort_values(by='n',ascending=False)

Unnamed: 0,sentence,n,contains_bad_word,space_count
52530,"`` yeah , i 'll be fine . ''",27,False,8
101240,you do n't have to worry about that . '',15,False,9
107979,"`` she 's right , you know . ''",14,False,8
148083,the hair rose on the back of my neck .,11,False,9
200132,"`` i 'll go with you , '' he said .",9,False,10
189487,`` i 've had a lot of practice . '',9,False,9
204449,but in spite of wielding the knife and assurin...,8,False,35
236182,"i fought for it for an entire century , practi...",8,False,16
225925,"`` i was thinking , '' she said .",8,False,8
232926,he caught me staring at him and smiled .,8,False,8


In [16]:
# random sample of all common sentences, shown from most to least frequent;
# random_state is fixed so a re-run draws the same rows
sentences_df.sample(10, random_state=42).sort_values(by='n',ascending=False)

Unnamed: 0,sentence,n,contains_bad_word,space_count
18074,i did n't have to wait long .,67,False,7
22506,"`` no , my lady .",56,False,5
26340,he needed this .,49,False,3
78048,`` what if you 're wrong ?,19,False,6
129448,"`` no , '' she said emphatically .",12,False,7
130603,you just do n't . '',12,False,5
128519,julia rolled her eyes .,12,False,4
175235,`` i 'm so glad you made it . '',10,False,9
178797,you will need your strength .,9,False,5
207958,`` stevie ! '',8,False,3


# Noisy / redundant sentences

In [17]:
# first 10 sentences whose text contains "copyright" or "smashwords"
sentences_df.loc[sentences_df['sentence'].str.contains('copyright|smashwords')].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
1003,if youre reading this book and did not purchas...,788,False,31
9306,"if you enjoyed this book , please encourage yo...",119,False,28
10068,this book remains the copyrighted property of ...,111,False,22
10340,"although this is a free book , it remains the ...",109,False,30
12408,"smashwords edition , license notes this ebook ...",93,False,14
15967,"if you enjoyed this book , please return to sm...",75,False,17
16523,if you 're reading this book and did not purch...,73,False,32
16902,if you 're reading this book and did not purch...,72,False,33
23623,if youre reading this book and did not purchas...,54,False,32
26176,without limiting the rights under copyright re...,49,False,67


In [18]:
# first 10 sentences whose text contains the phrase "other books"
sentences_df.loc[sentences_df['sentence'].str.contains('other books')].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
153515,"end if you enjoyed reading this book , you mig...",11,False,29


# Lite gender analysis

In [19]:
# first 10 sentences with more than 7 spaces that contain " he " (space-delimited)
sentences_df.loc[
    sentences_df.space_count.gt(7) & sentences_df['sentence'].str.contains(' he ')
].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
1503,"`` i 'm sorry , '' he said .",555,False,8
2657,"`` i do n't know , '' he said .",343,False,9
5570,"`` i love you , '' he said .",184,False,8
8543,`` what did he do to you ? '',127,False,8
8626,"`` do n't worry , '' he said .",126,False,8
9043,"`` i love you , '' he whispered .",121,False,8
9157,`` do you know where he is ? '',120,False,8
9666,"`` you 're right , '' he said .",115,False,8
10562,"`` i 'm sorry , '' he whispered .",107,False,8
11811,"`` it 's okay , '' he said .",97,False,8


In [20]:
# first 10 sentences with more than 7 spaces that contain " she " (space-delimited)
sentences_df.loc[
    sentences_df.space_count.gt(7) & sentences_df['sentence'].str.contains(' she ')
].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
1996,"`` i 'm sorry , '' she said .",443,False,8
3041,"`` i do n't know , '' she said .",306,False,9
4265,"`` i 'm sorry , '' she whispered .",231,False,8
8041,"`` i love you , '' she whispered .",134,False,8
9077,"`` i love you , '' she said .",121,False,8
9415,"`` you 're right , '' she said .",117,False,8
9780,"`` i 'm fine , '' she said .",114,False,8
9978,"`` it 's okay , '' she said .",112,False,8
13399,"`` i ca n't , '' she said .",87,False,8
15938,"`` thank you , '' she said softly .",75,False,8


# Lite "bad word" analysis

In [21]:
# first 10 flagged sentences that have more than 4 spaces
sentences_df.loc[
    sentences_df['contains_bad_word'] & sentences_df.space_count.gt(4)
].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
2340,`` what the fuck ? '',383,True,5
3809,`` son of a bitch ! '',256,True,6
3908,"`` oh , shit . ''",250,True,5
4472,"shit , shit , shit .",222,True,5
5560,`` son of a bitch . '',184,True,6
7108,"`` oh , shit ! ''",149,True,5
7794,`` you son of a bitch ! '',137,True,7
10298,"`` shit , '' i said .",109,True,6
11304,i suck in a breath .,101,True,5
11649,"shit , shit , shit !",98,True,5


In [22]:
# random sample of sentences flagged as containing a "bad word", shown from most to
# least frequent; random_state is fixed so a re-run draws the same rows
sentences_df[(sentences_df.contains_bad_word)].sample(10, random_state=42).sort_values(by='n',ascending=False)

Unnamed: 0,sentence,n,contains_bad_word,space_count
11944,"`` shit , '' he said .",96,True,6
31631,`` fuck yes . '',42,True,4
36781,very sexy .,37,True,2
47622,`` you 're such an ass . '',29,True,7
118326,you 're sexy .,13,True,3
154854,`` sneaky bastard . '',11,True,4
142548,"well , fuck me .",11,True,4
157734,i want to fuck you .,10,True,5
175546,`` what the fuck 's going on ? '',10,True,8
172264,just fuck me .,10,True,3


In [23]:
# first 10 flagged sentences that have more than 7 spaces
sentences_df.loc[
    sentences_df['contains_bad_word'] & sentences_df.space_count.gt(7)
].head(10)

Unnamed: 0,sentence,n,contains_bad_word,space_count
24378,`` what the fuck are you talking about ? '',52,True,9
25302,`` what the fuck are you doing ? '',51,True,8
40642,`` you scared the shit out of me . '',34,True,9
46609,`` you 're so full of shit . '',30,True,8
52958,`` you are so full of shit . '',27,True,8
54045,`` what the fuck do you think you 're doing ? '',26,True,11
55438,`` what the fuck are you doing here ? '',26,True,9
56448,you scared the shit out of me ! '',25,True,8
64022,`` what the fuck is going on ? '',23,True,8
64052,you scared the shit out of me . '',23,True,8
