## Imports & Data Load

In [8]:
# IMPORTS
import re, spacy, textacy
import pandas as pd
from nltk import sent_tokenize

# if needed, run the following in terminal: python3 -m spacy download en_core_web_sm
# Load the spaCy pipeline to be used
nlp = spacy.load('en_core_web_sm')

In [7]:
# Load the talks, partitioned by speaker gender; Talk_ID becomes the index.
talks_m = pd.read_csv('../output/talks_male.csv', index_col='Talk_ID')
talks_f = pd.read_csv('../output/talks_female.csv', index_col='Talk_ID')
# talks_nog is concatenated below, so it has to be loaded as well; with this
# line commented out the cell raises NameError on a fresh kernel.
talks_nog = pd.read_csv('../output/talks_nog.csv', index_col='Talk_ID')
talks_all = pd.concat([talks_m, talks_f, talks_nog])

# Transcript lists consumed by the later cells (texts_m / texts_w).
# NOTE(review): lowercasing is inferred from the lowercased doc preview printed
# further down in this notebook -- confirm this matches the intended preprocessing.
texts_m = talks_m['text'].str.lower().tolist()
texts_w = talks_f['text'].str.lower().tolist()
texts_all = talks_all['text'].str.lower().tolist()

print(f"From our {talks_all.shape[0]}x{talks_all.shape[1]} CSV, \
we have a list of {len(texts_all)} talks: {len(texts_w)} by women and \
{len(texts_m)} by men.")

From our 992x14 CSV, we have a list of 992 talks: 260 by women and 720 by men.


In [45]:
# Metadata columns the SVO analysis does not need
# (talks_m.columns.tolist() lists every available column).
drop = [
    'public_url',
    'headline',
    'duration',
    'published',
    'views',
    'description',
    'tags',
    'event',
    'speaker_1',
    'speaker_2',
    'speaker_3',
    'speaker_4',
]
df = talks_all.drop(labels=drop, axis='columns')

# Only `text` and `talk_gender` remain as columns; Talk_ID stays as the index.
print(f"Index => {df.index.name}.", df.head(3), sep="\n")

Index => Talk_ID.
                                                      text talk_gender
Talk_ID                                                               
1          Thank you so much, Chris. And it's truly a g...        male
7          (Music: "The Sound of Silence," Simon & Garf...        male
66         Good morning. How are you?    (Laughter)    ...        male


## From Texts to Sentences to SVOs

In [41]:
# Peek at the first sentence of the first three talks by men.
# `dfM` is never defined in this notebook; `talks_m` (loaded above) is the
# male partition and already carries the `text` column, so read from it directly.
for text in talks_m['text'].head(3):
    sentences = sent_tokenize(text.lower())
    # Slice rather than index so an empty transcript prints [] instead of raising.
    print(sentences[0:1])

['  thank you so much, chris.']
['  (music: "the sound of silence," simon & garfunkel)    hello voice mail, my old friend.']
['  good morning.']


In [None]:
# Expands the list-valued `text_id` column into one row per element, then
# numbers the rows within each `row_id` group with a running counter.
# NOTE(review): this cell has no execution count and looks unfinished. The
# frame built earlier keeps only `text` and `talk_gender`, so there is no
# `text_id` column to explode -- confirm which column was meant.
# NOTE(review): the index is named Talk_ID, so reset_index() presumably yields a
# `Talk_ID` column rather than `index`; in that case the rename is a no-op and
# groupby('row_id') below would fail. Verify before running.
df = df.explode(['text_id']).reset_index().rename(columns={'index' : 'row_id'})
df['row_id'] = df.groupby('row_id').cumcount()

### From Texts to Sentences

What we want to do here is to create a spaCy doc for each text, get its sentences, enumerate the sentences, and then feed each sentence into the SVO extractor.

In [None]:
# Sketch of the planned per-text pipeline; only the first step is implemented.
# NOTE(review): the doc returned by nlp(text) is not assigned to anything, so
# this loop currently parses every text and discards the result.
for text in texts_w:
    # create a spaCy doc (result is currently thrown away)
    nlp(text)
    # TODO: get the sentences as a list

    # TODO: feed each sentence to the SVO extractor

    # TODO: return the SVO triple and the matching sentence ID as a dictionary or tuple
    

In [9]:

# Run each subcorpus through the spaCy pipeline in batch mode via nlp.pipe,
# keeping the resulting docs in a list (women first, then men).
docs_w = [parsed for parsed in nlp.pipe(texts_w)]
docs_m = [parsed for parsed in nlp.pipe(texts_m)]

# Sanity check: preview of the first men's doc.
docs_m[0]._.preview

'Doc(2690 tokens: "  thank you so much, chris. and it\'s truly a gr...")'

### 1b. SVOs to Dataframe

Since we create SVOs for every sentence in the two subcorpora, why not save both to two dataframes?

In [10]:
def actions(doc_id, doc, svo_list):
    """Append one record per subject-verb-object triple found in ``doc``.

    Args:
        doc_id: Identifier stored under the ``'doc'`` key of every record.
        doc: Document handed to textacy's ``subject_verb_object_triples``.
        svo_list: List that is extended in place with the new records.

    Returns:
        None. Each appended record is a dict with the keys ``'doc'``,
        ``'subject'``, ``'verb'`` and ``'object'``. Subject and verb hold the
        string form of the *last* element of their respective parts; the
        object holds the string form of the whole object part.
    """
    # Materialise all triples first, exactly as extraction order yields them.
    triples = list(textacy.extract.triples.subject_verb_object_triples(doc))
    svo_list.extend(
        {
            'doc': doc_id,
            'subject': str(subject_part[-1]),
            'verb': str(verb_part[-1]),
            'object': str(object_part),
        }
        for subject_part, verb_part, object_part in triples
    )

In [11]:
# Accumulators for the SVO records of each subcorpus.
all_svos_m, all_svos_w = [], []

# One running document counter shared by both subcorpora: the men's docs are
# numbered first and the women's docs continue from where the men's stopped.
doc_id = 0
for corpus, records in ((docs_m, all_svos_m), (docs_w, all_svos_w)):
    for doc in corpus:
        actions(doc_id, doc, records)
        doc_id += 1

# One dataframe per subcorpus, one row per SVO triple.
svos_w = pd.DataFrame(all_svos_w)
svos_m = pd.DataFrame(all_svos_m)

# Number of triples: men, then women.
print(len(svos_m), len(svos_w))

80550 26610


In [12]:
# Save to CSV files 
# >>> Commented out once run
#svos_w.to_csv("../output/svos_w.csv")
#svos_m.to_csv("../output/svos_m.csv")

## 2: Counts of Sentences vs SVOs <a id='sentences'></a>

The code above suggests that 70% of the SVOs in TED talks have `'i', 'we', 'she', 'he', 'they', 'it', 'you'` as their subject. It's not clear, however, how well the SVO pattern represents all of the sentences in the talks. In this section we count sentences with both NLTK and spaCy, and also hand count a few sample texts, to see how well our code reflects the underlying reality.

### NLTK

In [14]:
# Split every talk into sentences with NLTK: one list of sentences per talk.
sents_w = list(map(sent_tokenize, texts_w))
sents_m = list(map(sent_tokenize, texts_m))

# Sentence count of the first talk in the women's subcorpus.
print(len(sents_w[0]))

187


In [15]:
# Total NLTK sentence counts per subcorpus.
# The per-talk sentence lists (sents_m / sents_w) were already built in the
# previous cell, so sum their lengths instead of tokenizing every talk again.
sent_count_m = sum(len(talk_sentences) for talk_sentences in sents_m)
sent_count_w = sum(len(talk_sentences) for talk_sentences in sents_w)

print(f" Female corp sent count: {sent_count_w}\n Male corp sent count: {sent_count_m}")

 Female corp sent count: 30799
 Male corp sent count: 96342


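That results in the following ratios of SVO triples to the total number of sentences (a single sentence can yield more than one triple, so this is not the share of sentences that contain an SVO):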

In [16]:
# SVO triples per NLTK-counted sentence, for each subcorpus.
print(
    f"Female subcorpora: {svos_w.shape[0] / sent_count_w}",
    f"Male subcorpora: {svos_m.shape[0] / sent_count_m}",
    sep="\n",
)

Female subcorpora: 0.8639890905548882
Male subcorpora: 0.8360839509248303


### spaCy

Our spaCy documents already exist, so we just need to iterate over each document's `.sents` attribute and count the sentences.

In [17]:
# Total spaCy sentence counts: walk each doc's sentence iterator and tally.
snt_cnt_w = sum(sum(1 for _ in doc.sents) for doc in docs_w)
snt_cnt_m = sum(sum(1 for _ in doc.sents) for doc in docs_m)

print(f"F: {snt_cnt_w}, M: {snt_cnt_m}.")

F: 31927, M: 99822.


In [18]:
# SVO triples per spaCy-counted sentence, for each subcorpus.
print(
    f"F: {svos_w.shape[0] / snt_cnt_w}",
    f"M: {svos_m.shape[0] / snt_cnt_m}",
    sep="\n",
)

F: 0.8334638393835938
M: 0.8069363466971209


The total sentence counts, with the ratio of SVO triples to sentences, are:
```
Women - NLTK : 30,799 with SVO ratio of 86%
        spaCy: 31,927 with SVO ratio of 83%
Men -   NLTK : 96,342 with SVO ratio of 84%
        spaCy: 99,822 with SVO ratio of 81%
```

### Post-SVO Lemmatizing

Two possible approaches to lemmatizing verbs in a dataframe:
* [How to lemmatise a dataframe column Python - Stack Overflow](https://stackoverflow.com/questions/61987040/how-to-lemmatise-a-dataframe-column-python)
* [dataframe - lemmatizing a verb list in a data frame in Python - Stack Overflow](https://stackoverflow.com/questions/72394840/lemmatizing-a-verb-list-in-a-data-frame-in-python)

In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
# WordNet lemmatizer; source: https://www.nltk.org/_modules/nltk/stem/wordnet.html
wnl = WordNetLemmatizer()
# Overwrite each verb in the women's triples with its lemma, treating it as a verb (pos="v").
svos_w["verb"] = svos_w["verb"].apply(wnl.lemmatize, pos="v")

In [21]:
# Show (rows, columns) of svos_w after the verb column has been lemmatized.
svos_w.shape

(26610, 4)

In [22]:
# Overwrite each verb in the men's triples with its lemma, treating it as a verb (pos="v").
svos_m["verb"] = svos_m["verb"].apply(wnl.lemmatize, pos="v")

In [23]:
# Save to CSV files 
# >>> Commented out once run
# svos_w.to_csv("../output/svos_w_lem.csv")
# svos_m.to_csv("../output/svos_m_lem.csv")