# SVO Verbs  Lemmatized

Initial note on lemmatizing: https://stackoverflow.com/questions/51658153/lemmatize-a-doc-with-spacy

In [20]:
# IMPORTS
import re, spacy, textacy
import pandas as pd
import numpy as np


In [2]:
# DATA LOAD
# Loading the Data in a gendered partitioned fashion: 
talks_m = pd.read_csv('../output/talks_male.csv', index_col='Talk_ID')
talks_f = pd.read_csv('../output/talks_female.csv', index_col='Talk_ID')
talks_nog = pd.read_csv('../output/talks_nog.csv', index_col='Talk_ID')
talks_all = pd.concat([talks_m, talks_f, talks_nog])

# GETTING JUST THE TEXTS
texts_all = talks_all.text.tolist()
texts_women = talks_f.text.tolist()
texts_men = talks_m.text.tolist()

# Lowercase everything before we create spaCy doc and Textacy SVO triple
texts_w = [text.lower() for text in texts_women]
texts_m = [text.lower() for text in texts_men]

In [3]:
# Get talkIDs in order
m_talkIDs = talks_m.index.values.tolist()
w_talkIDs = talks_f.index.values.tolist()

## From Texts to SVOs

In [4]:
# Load the Space pipeline to be used
nlp = spacy.load('en_core_web_sm')
# nlp.add_pipe('sentencizer')
# nlp.remove_pipe("lemmatizer")
# nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()

In [3]:
# Use the pipe method to feed documents 
docs_w = list(nlp.pipe(texts_w))
docs_m = list(nlp.pipe(texts_m))

# A quick check of our work:
docs_m[0]._.preview

'Doc(2690 tokens: "  thank you so much, chris. and it\'s truly a gr...")'

## SVOs to Dataframe

Since we create SVOs for every sentence in the two subcorpora, why not save both to two dataframes?

In [None]:
def createSVOs(doc, svo_list):
    # Create the list of tuples for the document
    svotriples = list(textacy.extract.triples.subject_verb_object_triples(doc))
    # Convert to list of dictionaries
    for item in svotriples:
        svo_list.append(
            {
                'subject': str(item[0][-1]), 
                'verb': str(item[1][-1]), 
                'object': str(item[2])
            }
        )

In [13]:
# Code based on above function by JL

def createSVOs_by_talk(doc, all_svo_list):
    # Create the list of tuples for the document
    svotriples = list(textacy.extract.triples.subject_verb_object_triples(doc))
    # Convert to list of dictionaries
    talk_list = []
    for item in svotriples:
        talk_list.append(
            {
                'subject': str(item[0][-1]), 
                'verb': str(item[1][-1]), 
                'object': str(item[2])
            }
        )
    all_svo_list.append(talk_list)

In [14]:
def createSVOs_tID(doc, svo_list, talkID=None):
    # Create the list of tuples for the document
    svotriples = list(textacy.extract.triples.subject_verb_object_triples(doc))
    # Convert to list of dictionaries
    for item in svotriples:
        svo_list.append(
            {
                'subject': str(item[0][-1]), 
                'verb': str(item[1][-1]), 
                'object': str(item[2]),
                'TalkID':talkID
            }
        )

### Now at Scale

In [18]:
# Create the two lists
svos_m_by_talk = []
svos_w_by_talk = []

# Populate the lists with SVO triples
for m in range(len(m_talkIDs)):
    doc = docs_m[m]
    talkID = m_talkIDs[m]
    createSVOs_tID(doc, svos_m_by_talk, talkID)

for w in range(len(w_talkIDs)):
    doc = docs_w[w]
    talkID = w_talkIDs[w]
    createSVOs_tID(doc, svos_w_by_talk, talkID)

In [19]:
svos_m_by_talk

[{'subject': 'i', 'verb': 'blown', 'object': '[conference]', 'TalkID': 1},
 {'subject': 'i',
  'verb': 'want',
  'object': '[to, thank, all, of, you, for, the, many, nice, comments, about, what, i, had, to, say, the, other, night]',
  'TalkID': 1},
 {'subject': 'i', 'verb': 'need', 'object': '[that]', 'TalkID': 1},
 {'subject': 'laughter', 'verb': 'put', 'object': '[yourselves]', 'TalkID': 1},
 {'subject': 'i',
  'verb': 'have',
  'object': '[to, take, off, my, shoes, or, boots, to, get, on, an, airplane]',
  'TalkID': 1},
 {'subject': 'i', 'verb': 'tell', 'object': '[story]', 'TalkID': 1},
 {'subject': 'i', 'verb': 'left', 'object': '[white, house]', 'TalkID': 1},
 {'subject': 'i', 'verb': 'looked', 'object': '[me]', 'TalkID': 1},
 {'subject': 'it', 'verb': 'hit', 'object': '[me]', 'TalkID': 1},
 {'subject': 'we',
  'verb': 'started',
  'object': '[looking, for, a, place, to, eat]',
  'TalkID': 1},
 {'subject': 'we',
  'verb': 'got',
  'object': '[to, exit, 238, ,, lebanon, ,, tenness

In [20]:
svos_mbt = pd.DataFrame(svos_m_by_talk)
svos_wbt = pd.DataFrame(svos_w_by_talk)

In [21]:
svos_mbt

Unnamed: 0,subject,verb,object,TalkID
0,i,blown,[conference],1
1,i,want,"[to, thank, all, of, you, for, the, many, nice...",1
2,i,need,[that],1
3,laughter,put,[yourselves],1
4,i,have,"[to, take, off, my, shoes, or, boots, to, get,...",1
...,...,...,...,...
80545,you,imagine,[him],10807
80546,you,see,"[credit, rating]",10807
80547,we,do,[what],10807
80548,it,expose,[humanity],10807


### Post-SVO Lemmatizing

Two possible approaches to lemmatizing verbs in a dataframe:
* [How to lemmatise a dataframe column Python - Stack Overflow](https://stackoverflow.com/questions/61987040/how-to-lemmatise-a-dataframe-column-python)
* [dataframe - lemmatizing a verb list in a data frame in Python - Stack Overflow](https://stackoverflow.com/questions/72394840/lemmatizing-a-verb-list-in-a-data-frame-in-python)

In [22]:
from nltk.stem import WordNetLemmatizer

In [23]:
# https://www.nltk.org/_modules/nltk/stem/wordnet.html
wnl = WordNetLemmatizer()
svos_wbt.verb.map(lambda word: wnl.lemmatize(word, pos="v"))

0          save
1          turn
2        handle
3         bring
4          have
          ...  
26605      have
26606       sit
26607      have
26608       use
26609     imply
Name: verb, Length: 26610, dtype: object

In [24]:
svos_wbt.shape

(26610, 4)

In [25]:
svos_wbt


Unnamed: 0,subject,verb,object,TalkID
0,development,save,[us],53
1,she,turned,"[to, be, a, much, bigger, dog, than, i, 'd, an...",53
2,part,handled,[percent],53
3,that,bring,"[truck, trips]",53
4,area,has,[one],53
...,...,...,...,...
26605,we,have,"[sofas, fruits, hairbrushes, bookshelves, toil...",8606
26606,we,sit,"[palestine, tanzania]",8606
26607,we,have,[lot],8606
26608,we,use,[photos],8606


In [27]:
svos_mbt.verb = svos_mbt.verb.map(lambda word: wnl.lemmatize(word, pos="v"))

In [29]:
svos_wbt.verb = svos_wbt.verb.map(lambda word: wnl.lemmatize(word, pos="v"))

In [32]:
svos_wbt.shape[0]+svos_mbt.shape[0]

107160

In [31]:
# Save to CSV files 
# >>> Commented out once run
svos_wbt.to_csv("../output/svos_wbt_lem.csv")
svos_mbt.to_csv("../output/svos_mbt_lem.csv")

In [11]:
svos_wbt = pd.read_csv('../output/svos_wbt_lem.csv')
svos_mbt = pd.read_csv('../output/svos_mbt_lem.csv')

svos_wbt = svos_wbt.drop(columns = ["Unnamed: 0"])
svos_mbt = svos_mbt.drop(columns = ["Unnamed: 0"])

Unnamed: 0,subject,verb,object,TalkID
0,development,save,[us],53
1,she,turn,"[to, be, a, much, bigger, dog, than, i, 'd, an...",53
2,part,handle,[percent],53
3,that,bring,"[truck, trips]",53
4,area,have,[one],53
...,...,...,...,...
80545,you,imagine,[him],10807
80546,you,see,"[credit, rating]",10807
80547,we,do,[what],10807
80548,it,expose,[humanity],10807


## SV Pairs with "She", "He", and "I"

Merging code from `Gender-SV-TFIDF-ajbc` into here

In [14]:
# From Allison code

# Create a list of the pronouns we want to see
pronouns = ["he", "she", "i"]

# Function to retrive the count the number of times subjects,
# here a list of pronouns, are paired with verbs
def svPairs (df, pplist):    
    temp = df[df["subject"].isin(
        pplist)].value_counts(
        subset=['subject', 'verb']).reset_index()
    # Re-label the new column from "count"
    temp.rename(columns={0 : 'term_freq'}, inplace=True)
    # Allison's OG code
    #temp.rename(columns={'count' : 'term_freq'}, inplace=True)
    
    # Add a column with relative frequency
    temp['term_rfreq'] = temp['term_freq'] / df.shape[0]

    temp2 = df[df["subject"].isin(
        pplist)].value_counts(
        subset=['TalkID', 'subject', 'verb']).reset_index()
    
    # Add below line 
    temp2.rename(columns={0 : 'term_freq'}, inplace=True)
    # Re-label the new column from "count"
    return temp, temp2

In [15]:
m_pp_v, m_pp_v_doc  = svPairs(svos_mbt, pronouns)
w_pp_v, w_pp_v_doc = svPairs(svos_wbt, pronouns)

print(m_pp_v.shape)
print(w_pp_v.shape)

(1718, 4)
(1057, 4)


In [16]:
m_pp_v.head()

Unnamed: 0,subject,verb,term_freq,term_rfreq
0,i,have,1310,0.016263
1,i,want,1207,0.014984
2,i,go,1062,0.013184
3,i,do,645,0.008007
4,i,get,589,0.007312


In [17]:
m_pp_v_doc.head()

Unnamed: 0,TalkID,subject,verb,term_freq
0,353,i,want,21
1,172,i,have,19
2,188,i,get,18
3,114,i,want,17
4,231,i,do,17


In [18]:
def svPairDocFreq(df):  
    """This counts the number of documents that the SV pair occurs in"""
    temp = df.value_counts(
        subset=['subject', 'verb']).reset_index()

    # Allison's OG code
    # temp.rename(columns={'count' : 'doc_freq'}, inplace=True)
    temp.rename(columns={0 : 'doc_freq'}, inplace=True)
    return temp

In [24]:
m_pp_v_df  = svPairDocFreq(m_pp_v_doc)
m_pp_v_df.head()

Unnamed: 0,subject,verb,doc_freq
0,i,have,432
1,i,want,430
2,i,go,376
3,i,get,291
4,i,do,274


In [25]:
m_pp_v.head()

Unnamed: 0,subject,verb,term_freq,term_rfreq
0,i,have,1310,0.016263
1,i,want,1207,0.014984
2,i,go,1062,0.013184
3,i,do,645,0.008007
4,i,get,589,0.007312


In [26]:
m_pp_v_df = m_pp_v_df.merge(m_pp_v)

In [27]:
m_pp_v_df.head()

Unnamed: 0,subject,verb,doc_freq,term_freq,term_rfreq
0,i,have,432,1310,0.016263
1,i,want,430,1207,0.014984
2,i,go,376,1062,0.013184
3,i,get,291,589,0.007312
4,i,do,274,645,0.008007


In [85]:
# Get document frequencies
m_pp_v_df  = svPairDocFreq(m_pp_v_doc)
w_pp_v_df = svPairDocFreq(w_pp_v_doc)

m_pp_v_df = m_pp_v_df.merge(m_pp_v)
w_pp_v_df = w_pp_v_df.merge(w_pp_v)

m_docs = len(pd.unique(m_pp_v_doc['TalkID']))
w_docs = len(pd.unique(w_pp_v_doc['TalkID']))

m_pp_v_df['tfidf'] = m_pp_v_df['term_freq'] / np.log(m_docs / m_pp_v_df['doc_freq'])
w_pp_v_df['tfidf'] = w_pp_v_df['term_freq'] / np.log(w_docs / w_pp_v_df['doc_freq'])
#m_pp_v_df['tf_gidf'] = m_pp_v_df['term_freq'] / np.log(m_docs / m_pp_v_df['doc_freq'])
#w_pp_v_df['tf_gidf'] = w_pp_v_df['term_freq'] / np.log(w_docs / w_pp_v_df['doc_freq'])

#m_pp_v_df['tfidf'] = m_pp_v_df['term_freq'] / np.log((m_docs+w_docs) / m_pp_v_df['doc_freq'])
#w_pp_v_df['tfidf'] = w_pp_v_df['term_freq'] / np.log((m_docs+w_docs) / w_pp_v_df['doc_freq'])

m_pp_v_df['speaker_gender'] = 'man'
w_pp_v_df['speaker_gender'] = 'woman'
df = pd.concat([m_pp_v_df, w_pp_v_df])
df['tuple'] = df["subject"] + ' ' + df["verb"]
df=df.reindex(columns=['speaker_gender', 'subject', 'verb', 'term_freq', 'doc_freq', 'tfidf'])

df

Unnamed: 0,speaker_gender,subject,verb,term_freq,doc_freq,tfidf
0,man,i,have,1310,432,2690.317896
1,man,i,want,1207,430,2455.389603
2,man,i,go,1062,376,1697.115268
3,man,i,get,589,291,667.774996
4,man,i,do,645,274,684.547118
...,...,...,...,...,...,...
1052,woman,i,jacques,1,1,0.180592
1053,woman,i,judge,1,1,0.180592
1054,woman,i,kick,1,1,0.180592
1055,woman,i,knit,1,1,0.180592


In [86]:
df.to_csv("../output/tfidf_by_speaker.csv")

## Get TFIDF for the SV regardless of speaker

In [33]:
svos_bt = pd.concat([svos_wbt,svos_mbt])

In [34]:
a_pp_v, a_pp_v_doc  = svPairs(svos_bt, pronouns)


In [42]:
a_pp_v

Unnamed: 0,subject,verb,term_freq,term_rfreq
0,i,have,1891,0.017647
1,i,want,1634,0.015248
2,i,go,1389,0.012962
3,i,do,864,0.008063
4,i,get,787,0.007344
...,...,...,...,...
2086,i,acknowledge,1,0.000009
2087,he,note,1,0.000009
2088,i,rationalize,1,0.000009
2089,he,not,1,0.000009


In [43]:
a_pp_v_df  = svPairDocFreq(a_pp_v_doc)


In [44]:
a_pp_v_df

Unnamed: 0,subject,verb,doc_freq
0,i,have,599
1,i,want,576
2,i,go,508
3,i,get,390
4,i,do,375
...,...,...,...
2086,i,market,1
2087,i,mark,1
2088,i,maria,1
2089,i,mar,1


In [83]:
a_pp_v_df = a_pp_v_df.merge(a_pp_v)

In [84]:
a_pp_v_df

Unnamed: 0,subject,verb,doc_freq,term_freq,term_rfreq,tfidf,tuple
0,i,have,599,1891,0.017647,4035.925988,i have
1,i,want,576,1634,0.015248,3218.463148,i want
2,i,go,508,1389,0.012962,2193.197336,i go
3,i,get,390,787,0.007344,876.727196,i get
4,i,do,375,864,0.008063,922.212481,i do
...,...,...,...,...,...,...,...
2086,i,market,1,1,0.000009,0.145692,i market
2087,i,mark,1,1,0.000009,0.145692,i mark
2088,i,maria,1,1,0.000009,0.145692,i maria
2089,i,mar,1,1,0.000009,0.145692,i mar


In [50]:
# Get document frequencies
a_pp_v_df  = svPairDocFreq(a_pp_v_doc)

a_pp_v_df = a_pp_v_df.merge(a_pp_v)

a_docs = len(pd.unique(a_pp_v_doc['TalkID']))

a_pp_v_df['tfidf'] = a_pp_v_df['term_freq'] / np.log(a_docs / a_pp_v_df['doc_freq'])


af = a_pp_v_df
af['tuple'] = af["subject"] + ' ' + af["verb"]
af=af.reindex(columns=['subject', 'verb', 'term_freq', 'doc_freq', 'tfidf'])

af

Unnamed: 0,subject,verb,term_freq,doc_freq,tfidf
0,i,have,1891,599,4035.925988
1,i,want,1634,576,3218.463148
2,i,go,1389,508,2193.197336
3,i,get,787,390,876.727196
4,i,do,864,375,922.212481
...,...,...,...,...,...
2086,i,market,1,1,0.145692
2087,i,mark,1,1,0.145692
2088,i,maria,1,1,0.145692
2089,i,mar,1,1,0.145692


In [88]:
af.to_csv("../output/tfidf_all_speakers.csv")

## Get TFIDF for the verbs

In [51]:
# From Allison code

# Create a list of the pronouns we want to see
pronouns = ["he", "she", "i"]

# Function to retrive the count the number of times verbs
# appear with the given list of pronouns
def v_subset_pronouns(df, pplist):    
    temp = df[df["subject"].isin(pplist)].value_counts(
        subset=['verb']).reset_index()
    # Re-label the new column from "count"
    temp.rename(columns={0 : 'term_freq'}, inplace=True)
    # Allison's OG code
    #temp.rename(columns={'count' : 'term_freq'}, inplace=True)
    
    # Add a column with relative frequency
    temp['term_rfreq'] = temp['term_freq'] / df.shape[0]

    temp2 = df[df["subject"].isin(
        pplist)].value_counts(
        subset=['TalkID', 'verb']).reset_index()
    
    # Add below line 
    temp2.rename(columns={0 : 'term_freq'}, inplace=True)
    # Re-label the new column from "count"
    return temp, temp2

In [93]:
m_v, m_v_doc  = v_subset_pronouns(svos_mbt, pronouns)
w_v, w_v_doc  = v_subset_pronouns(svos_wbt, pronouns)

In [78]:
def vDocFreq(df):  
    """This counts the number of documents that the verb occurs in"""
    temp = df.value_counts(
        subset=['verb']).reset_index()

    # Allison's OG code
    # temp.rename(columns={'count' : 'doc_freq'}, inplace=True)
    temp.rename(columns={0 : 'doc_freq'}, inplace=True)
    return temp

In [90]:
m_v_df  = vDocFreq(m_v_doc)
m_v_df.head()

Unnamed: 0,verb,doc_freq
0,have,484
1,want,451
2,go,405
3,get,330
4,do,309


In [96]:
# Get document frequencies
m_v_df  = vDocFreq(m_v_doc)
w_v_df  = vDocFreq(w_v_doc)

m_v_df = m_v_df.merge(m_v)
w_v_df = w_v_df.merge(w_v)

m_docs = len(pd.unique(m_pp_v_doc['TalkID']))
w_docs = len(pd.unique(w_pp_v_doc['TalkID']))

m_v_df['tfidf'] = m_v_df['term_freq'] / np.log(m_docs / m_v_df['doc_freq'])
w_v_df['tfidf'] = w_v_df['term_freq'] / np.log(w_docs / w_v_df['doc_freq'])
#m_pp_v_df['tf_gidf'] = m_pp_v_df['term_freq'] / np.log(m_docs / m_pp_v_df['doc_freq'])
#w_pp_v_df['tf_gidf'] = w_pp_v_df['term_freq'] / np.log(w_docs / w_pp_v_df['doc_freq'])

#m_pp_v_df['tfidf'] = m_pp_v_df['term_freq'] / np.log((m_docs+w_docs) / m_pp_v_df['doc_freq'])
#w_pp_v_df['tfidf'] = w_pp_v_df['term_freq'] / np.log((m_docs+w_docs) / w_pp_v_df['doc_freq'])

m_v_df['speaker_gender'] = 'man'
w_v_df['speaker_gender'] = 'woman'
dv = pd.concat([m_v_df, w_v_df])

dv=dv.reindex(columns=['speaker_gender', 'verb', 'term_freq', 'doc_freq', 'tfidf'])

dv

Unnamed: 0,speaker_gender,verb,term_freq,doc_freq,tfidf
0,man,have,1633,484,4374.826039
1,man,want,1310,451,2951.184576
2,man,go,1179,405,2137.922960
3,man,get,697,330,921.635542
4,man,do,790,309,961.052303
...,...,...,...,...,...
710,woman,hunch,1,1,0.180592
711,woman,hound,1,1,0.180592
712,woman,hop,1,1,0.180592
713,woman,hasten,1,1,0.180592


In [98]:
dv.to_csv("../output/tfidf_just_verbs.csv")

## Grab just the I's


For each talk, we are just going to extract the `I` subjects using a version of the code from the `03-Gender-SV-TFIDF-OLD` notebook

In [4]:
# Create a list of the pronouns we want to see
pronouns = ["i"]

# Function to retrive the count the number of times subjects
# I is paired with verbs
def i_svPairs(df, talkIDs):
    # Create an empty data frame
    df_out = pd.DataFrame({'subject' : [], 'verb' : [], 'v_freq' : [], 'v_rfreq' : [], 'TalkID' : []})
    
    # Isolate the SVOs in each talk
    for t in talkIDs:
        temp = df[df["TalkID"] == t]
        temp_i = temp[temp["subject"] == "i"].value_counts(
            subset=['subject', 'verb']).reset_index()
        
        # Re-label the new column from "0" to something human-readable
        temp_i.rename(columns={0:'v_freq'}, inplace=True)

        # Add a column with relative frequency
        temp_i['v_rfreq'] = temp_i['v_freq'] / temp.shape[0]
        
        # Add the talkID back in 
        temp_i["TalkID"] = t
        
        # Concatenate with previous dataframe
        df_out = pd.concat([df_out, temp_i])
    
    # Clean up the output dataframe 
    df_out = df_out.reset_index()
    df_out = df_out.drop(columns = "index")
    
    df_out.v_freq = df_out.v_freq.astype(int)
    df_out.TalkID = df_out.TalkID.astype(int)
    
    return df_out

In [8]:
test = i_svPairs(svos_mbt,m_talkIDs)

In [9]:
test

Unnamed: 0,subject,verb,v_freq,v_rfreq,TalkID
0,i,go,6,0.048000,1
1,i,want,4,0.032000,1
2,i,give,4,0.032000,1
3,i,tell,2,0.016000,1
4,i,get,2,0.016000,1
...,...,...,...,...,...
9638,i,have,1,0.013158,10807
9639,i,know,1,0.013158,10807
9640,i,like,1,0.013158,10807
9641,i,receive,1,0.013158,10807
