In [21]:
import re

import pandas as pd
from IPython.display import display, HTML

# Load every sentence that occurred 8 or more times (pre-counted CSV).
sentences_path = 'data/BookCorpus/sentences_counted_8+.csv'
sentences_df = pd.read_csv(sentences_path)

# Load the "bad words" list. The remote file is read without a header row,
# and its column is then named 'word'.
bad_words_url = 'https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en'
bad_words = pd.read_csv(bad_words_url, header=None)
bad_words.columns = ['word']

# compute bad word flag
# The displayed sentences are lowercased and space-tokenized, so a bad word
# should count only when it appears as a whole whitespace-delimited token
# (or multi-word phrase). Joining the raw words with ' | ' required a space on
# BOTH sides of most words, which silently missed sentences that start or end
# with a bad word, and left regex metacharacters in the list unescaped.
bad_words_list = [
    word.strip()
    for word in bad_words.word.dropna().astype(str)
    if word.strip()
]
if bad_words_list:
    alternatives = '|'.join(re.escape(word) for word in bad_words_list)
    # Non-capturing groups keep pandas from warning about match groups.
    bad_word_pattern = r'(?:^|\s)(?:' + alternatives + r')(?:\s|$)'
    # na=False: a missing sentence is simply "not flagged", so the column stays
    # a clean boolean mask that is safe for boolean indexing.
    sentences_df['contains_bad_word'] = sentences_df.sentence.str.contains(
        bad_word_pattern, na=False
    )
else:
    # An empty word list would otherwise build a pattern that matches almost
    # everything; with no words to look for, nothing is flagged.
    sentences_df['contains_bad_word'] = False

# compute space count (a cheap proxy for sentence length in tokens)
sentences_df['space_count'] = sentences_df.sentence.str.count(' ')

# Overview and samples

In [22]:
# First 10 sentences with at least 8 spaces. "Most common" relies on the frame
# already being ordered by descending count `n`, as the displayed output shows.
has_8_plus_spaces = sentences_df['space_count'].gt(7)
sentences_df.loc[has_8_plus_spaces].head(10)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
494,494,`` what 's that supposed to mean ? '',1408,False,8
536,536,thank you for respecting the hard work of this...,1345,False,10
555,555,this ebook may not be re-sold or given away to...,1302,False,12
625,625,if you would like to share this book with anot...,1193,False,20
674,674,`` what are you going to do ? '',1113,False,8
1003,1003,if youre reading this book and did not purchas...,788,False,31
1053,1053,`` what the hell are you doing ? '',760,False,8
1056,1056,`` i do n't know what you 're talking about . '',759,False,11
1121,1121,`` why did n't you tell me ? '',721,False,8
1468,1468,`` what are we going to do ? '',568,False,8


In [23]:
# A closer look at common copyright boilerplate: the first 10 rows whose
# sentence contains the substring 'copyright' (so 'copyrighted' matches too).
mentions_copyright = sentences_df['sentence'].str.contains('copyright')
sentences_df.loc[mentions_copyright].head(10)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
10068,10068,this book remains the copyrighted property of ...,111,False,22
10340,10340,"although this is a free book , it remains the ...",109,False,30
26176,26176,without limiting the rights under copyright re...,49,False,67
66153,66153,please do not participate in or encourage pira...,22,False,17
76446,76446,it remains the copyrighted property of the aut...,19,False,23
81529,81529,"the stories , articles and illustrations conta...",18,False,24
97707,97707,"no part of this publication may be copied , re...",16,False,34
98227,98227,this book contains material protected under in...,16,False,13
121571,121571,"although this is a free book , it remains the ...",13,False,36
122441,122441,"although this is a free book , it remains the ...",13,False,31


In [24]:
# Random sample of 10 common sentences with at least 8 spaces (space_count > 7),
# shown in descending order of count `n`.
# A fixed random_state keeps the sample identical across Restart & Run All;
# change the value to draw a different sample.
(
    sentences_df[sentences_df.space_count > 7]
    .sample(10, random_state=0)
    .sort_values(by='n', ascending=False)
)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
46662,46662,what the hell am i going to do ?,30,False,8
62161,62161,`` what would you have of me ? '',23,False,8
122132,122132,i was n't sure how i felt about it .,13,False,9
142549,142549,"`` you got it , '' i said .",11,False,8
181383,181383,`` you do n't want me to ? '',9,False,8
225814,225814,`` do you have a better suggestion ? '',8,False,8
231134,231134,"just beneath that silk , however , he thought ...",8,False,15
230675,230675,"`` in my head , sofia , there 's always music ...",8,False,12
219992,219992,"`` does it feel good , baby ? ''",8,False,8
222095,222095,not that he would have had to look far .,8,False,9


In [25]:
# Random sample of 10 sentences drawn from all common sentences (no length
# filter), shown in descending order of count `n`.
# A fixed random_state keeps the sample identical across Restart & Run All;
# change the value to draw a different sample.
(
    sentences_df
    .sample(10, random_state=0)
    .sort_values(by='n', ascending=False)
)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
45299,45299,problem .,31,False,1
51151,51151,"`` yeah , i got that .",28,False,6
53261,53261,howie asked .,27,False,2
63403,63403,just keep breathing .,23,False,3
80922,80922,`` that about covers it . '',18,False,6
140459,140459,jeff inquired .,12,False,2
142775,142775,`` groovy . '',11,False,3
207714,207714,she would n't listen .,8,False,4
204858,204858,i felt pity for him .,8,False,5
232035,232035,"just like that , she knew what this little mak...",8,False,13


# Lite gender analysis

In [26]:
# most common sentences (at least 8 spaces) containing the token "he"
# Sentences are space-tokenized, so "he" is matched as a whole token. Anchoring
# on the string start/end as well as on spaces also catches sentences that
# begin or end with "he", which the plain ' he ' substring test missed.
he_token = r'(?:^| )he(?: |$)'
# na=False: a missing sentence counts as "no match" instead of breaking the mask.
contains_he = sentences_df.sentence.str.contains(he_token, na=False)
sentences_df[(sentences_df.space_count > 7) & contains_he].head(10)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
1503,1503,"`` i 'm sorry , '' he said .",555,False,8
2657,2657,"`` i do n't know , '' he said .",343,False,9
5570,5570,"`` i love you , '' he said .",184,False,8
8543,8543,`` what did he do to you ? '',127,False,8
8626,8626,"`` do n't worry , '' he said .",126,False,8
9043,9043,"`` i love you , '' he whispered .",121,False,8
9157,9157,`` do you know where he is ? '',120,False,8
9666,9666,"`` you 're right , '' he said .",115,False,8
10562,10562,"`` i 'm sorry , '' he whispered .",107,False,8
11811,11811,"`` it 's okay , '' he said .",97,False,8


In [27]:
# most common sentences (at least 8 spaces) containing the token "she"
# Sentences are space-tokenized, so "she" is matched as a whole token. Anchoring
# on the string start/end as well as on spaces also catches sentences that
# begin or end with "she", which the plain ' she ' substring test missed.
she_token = r'(?:^| )she(?: |$)'
# na=False: a missing sentence counts as "no match" instead of breaking the mask.
contains_she = sentences_df.sentence.str.contains(she_token, na=False)
sentences_df[(sentences_df.space_count > 7) & contains_she].head(10)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
1996,1996,"`` i 'm sorry , '' she said .",443,False,8
3041,3041,"`` i do n't know , '' she said .",306,False,9
4265,4265,"`` i 'm sorry , '' she whispered .",231,False,8
8041,8041,"`` i love you , '' she whispered .",134,False,8
9077,9077,"`` i love you , '' she said .",121,False,8
9415,9415,"`` you 're right , '' she said .",117,False,8
9780,9780,"`` i 'm fine , '' she said .",114,False,8
9978,9978,"`` it 's okay , '' she said .",112,False,8
13399,13399,"`` i ca n't , '' she said .",87,False,8
15938,15938,"`` thank you , '' she said softly .",75,False,8


# Lite "bad word" analysis

In [28]:
# First 10 flagged sentences that have more than 4 spaces. The length filter
# skips the very short exclamations; "most common" relies on the frame's
# existing order by count `n`.
flagged = sentences_df['contains_bad_word']
longer_than_4_spaces = sentences_df['space_count'].gt(4)
sentences_df.loc[flagged & longer_than_4_spaces].head(10)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
2340,2340,`` what the fuck ? '',383,True,5
3809,3809,`` son of a bitch ! '',256,True,6
3908,3908,"`` oh , shit . ''",250,True,5
4472,4472,"shit , shit , shit .",222,True,5
5560,5560,`` son of a bitch . '',184,True,6
7108,7108,"`` oh , shit ! ''",149,True,5
7794,7794,`` you son of a bitch ! '',137,True,7
10298,10298,"`` shit , '' i said .",109,True,6
11304,11304,i suck in a breath .,101,True,5
11649,11649,"shit , shit , shit !",98,True,5


In [29]:
# Random sample of 10 sentences flagged as containing a "bad word", shown in
# descending order of count `n`.
# A fixed random_state keeps the sample identical across Restart & Run All;
# change the value to draw a different sample.
(
    sentences_df[sentences_df.contains_bad_word]
    .sample(10, random_state=0)
    .sort_values(by='n', ascending=False)
)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
5306,5306,the bastard .,192,True,2
13283,13283,`` bastard .,88,True,2
19634,19634,holy fuck !,63,True,2
24915,24915,`` you bitch !,51,True,3
27169,27169,`` smart ass . '',48,True,4
157978,157978,`` your cock .,10,True,3
181326,181326,"`` aw , fuck . ''",9,True,5
215952,215952,"`` yeah , and i 'll do it again if you do n't ...",8,True,19
205718,205718,`` piece of shit . '',8,True,5
220311,220311,your ass is mine .,8,True,4


In [30]:
# First 10 flagged sentences that are also "longer", i.e. have at least
# 8 spaces (space_count > 7), in the frame's existing order.
flagged = sentences_df['contains_bad_word']
has_8_plus_spaces = sentences_df['space_count'].gt(7)
sentences_df.loc[flagged & has_8_plus_spaces].head(10)

Unnamed: 0.1,Unnamed: 0,sentence,n,contains_bad_word,space_count
24378,24378,`` what the fuck are you talking about ? '',52,True,9
25302,25302,`` what the fuck are you doing ? '',51,True,8
40642,40642,`` you scared the shit out of me . '',34,True,9
46609,46609,`` you 're so full of shit . '',30,True,8
52958,52958,`` you are so full of shit . '',27,True,8
54045,54045,`` what the fuck do you think you 're doing ? '',26,True,11
55438,55438,`` what the fuck are you doing here ? '',26,True,9
56448,56448,you scared the shit out of me ! '',25,True,8
64022,64022,`` what the fuck is going on ? '',23,True,8
64052,64052,you scared the shit out of me . '',23,True,8
