In [1]:
import csv
import nltk

### Looking for female roles

### Auxiliary functions for getting adjectives


Given a list of already-tagged sentences, each one like `[('In', 'IN'), ('1970s', 'CD'), ('London', 'NNP')]`, the function `getNounPhraseChucks` returns the noun phrase chunks that match the regular expression stored in the variable `grammar`. The pattern we are looking for is: an optional determiner or possessive + one or more adjectives (optionally joined by conjunctions such as 'and') + a noun (common or proper).

Returns the NP-chunks.


In [2]:
def getNounPhraseChucks(tags):
    """Extract adjective-bearing noun phrase chunks from POS-tagged sentences.

    Parameters
    ----------
    tags : iterable of list of (word, tag) tuples
        One list per sentence, e.g.
        ``[('a', 'DT'), ('young', 'JJ'), ('grifter', 'NN')]``.

    Returns
    -------
    list of nltk.Tree
        Every subtree labelled ``'NP'`` produced by the chunk parser, in
        sentence order.
    """
    # Chunk pattern: optional determiner/possessive, one or more adjectives,
    # any number of conjunctions, any further adjectives, then a noun.
    # Raw string: "\$" is not a valid Python escape sequence, so a plain
    # literal raises an invalid-escape warning on recent Python versions.
    grammar = r"NP: {<DT|PP|PRP\$>?<JJ.*>+<CC>*<JJ.*>*<NN.*>}"
    cp = nltk.RegexpParser(grammar)  # chunk parser

    return [
        subtree
        for sentence in tags
        for subtree in cp.parse(sentence).subtrees(filter=lambda t: t.label() == 'NP')
    ]

Test: 

In [3]:
# Hand-tagged sample input: one list of (word, POS tag) tuples per sentence.
tagged_sentences = [
                    # first sentence
                    [('In', 'IN'), ('1970s', 'CD'), ('London', 'NNP'), ('amidst', 'VBD'), ('the', 'DT'), ('punk', 'NN'), ('rock', 'NN'), ('revolution', 'NN'), (',', ','), ('a', 'DT'), ('young', 'JJ'), ('grifter', 'NN'), ('named', 'VBN'), ('Estella', 'NNP'), ('is', 'VBZ'), ('determined', 'VBN'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('name', 'NN'), ('for', 'IN'), ('herself', 'NN'), ('with', 'IN'), ('her', 'PRP$'), ('designs', 'NNS'), ('.', '.')], 
                    # second and third sentences (both lists are on the next line)
                    [('She', 'PRP'), ('befriends', 'VBZ'), ('a', 'DT'), ('pair', 'NN'), ('of', 'IN'), ('young', 'JJ'), ('thieves', 'NNS'), ('who', 'WP'), ('appreciate', 'VBP'), ('her', 'PRP$'), ('appetite', 'NN'), ('for', 'IN'), ('mischief', 'NN'), (',', ','), ('and', 'CC'), ('together', 'RB'), ('they', 'PRP'), ('are', 'VBP'), ('able', 'JJ'), ('to', 'TO'), ('build', 'VB'), ('a', 'DT'), ('life', 'NN'), ('for', 'IN'), ('themselves', 'PRP'), ('on', 'IN'), ('the', 'DT'), ('London', 'NNP'), ('streets', 'NNS'), ('.', '.')], [('One', 'CD'), ('day', 'NN'), (',', ','), ('Estella', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('flair', 'NN'), ('for', 'IN'), ('fashion', 'NN'), ('catches', 'VBZ'), ('the', 'DT'), ('eye', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Baroness', 'NNP'), ('von', 'NNP'), ('Hellman', 'NNP'), (',', ','), ('a', 'DT'), ('fashion', 'NN'), ('legend', 'NN'), ('who', 'WP'), ('is', 'VBZ'), ('devastatingly', 'RB'), ('chic', 'JJ'), ('and', 'CC'), ('terrifyingly', 'RB'), ('haute', 'NN'), ('.', '.')], 
                    # fourth sentence
                    [('But', 'CC'), ('their', 'PRP$'), ('relationship', 'NN'), ('sets', 'NNS'), ('in', 'IN'), ('motion', 'NN'), ('a', 'DT'), ('course', 'NN'), ('of', 'IN'), ('events', 'NNS'), ('and', 'CC'), ('revelations', 'NNS'), ('that', 'WDT'), ('will', 'MD'), ('cause', 'VB'), ('Estella', 'NNP'), ('to', 'TO'), ('embrace', 'VB'), ('her', 'PRP$'), ('wicked', 'JJ'), ('side', 'NN'), ('and', 'CC'), ('become', 'VB'), ('the', 'DT'), ('raucous', 'JJ'), (',', ','), ('fashionable', 'JJ'), ('and', 'CC'), ('revenge-bent', 'JJ'), ('Cruella', 'NNP'), ('.', '.')]]

getNounPhraseChucks(tagged_sentences)

[Tree('NP', [('a', 'DT'), ('young', 'JJ'), ('grifter', 'NN')]),
 Tree('NP', [('young', 'JJ'), ('thieves', 'NNS')]),
 Tree('NP', [('her', 'PRP$'), ('wicked', 'JJ'), ('side', 'NN')]),
 Tree('NP', [('fashionable', 'JJ'), ('and', 'CC'), ('revenge-bent', 'JJ'), ('Cruella', 'NNP')])]

Given the text of an overview, the function `getTokensTagsAndNPs` splits it into sentences, tokenizes each sentence into words, tags every word with its **grammatical category** (verb, determiner, adjective, noun...) and returns the list of **noun phrase chunks** that match the regular expression stored in `grammar`.


In [4]:
def getTokensTagsAndNPs(overview):
    """Return the noun phrase chunks found in a plain-text overview.

    The text is split into sentences, each sentence into word tokens, each
    token list is POS-tagged, and the tagged sentences are handed to
    ``getNounPhraseChucks``, whose result is returned unchanged.
    """
    tagged_sentences = []
    for sentence in nltk.sent_tokenize(overview):
        words = nltk.word_tokenize(sentence)
        tagged_sentences.append(nltk.pos_tag(words))
    return getNounPhraseChucks(tagged_sentences)

Test:

In [5]:
# Longer sample synopsis. NOTE: this value is overwritten by the second
# assignment below before it is used, so it is never analysed in this cell.
overview = 'In 1970s London amidst the punk rock revolution, a young grifter named Estella is determined to make a name for herself with her designs. ' \
       'She befriends a pair of young thieves who appreciate her appetite for mischief, and together they are able to build a life for themselves on the London streets. ' \
       'One day, Estella’s flair for fashion catches the eye of the Baroness von Hellman, a fashion legend who is devastatingly chic and terrifyingly haute. '\
       'But their relationship sets in motion a course of events and revelations that will cause Estella to embrace her wicked side and become the raucous, fashionable and revenge-bent Cruella.'

# Short sample containing adjectives joined by a conjunction ("well-meaning but selfish").
overview = 'In 1800s England, a well-meaning but selfish young woman meddles in the love lives of her friends.'
result = getTokensTagsAndNPs(overview)

# Display the extracted chunks as the cell output.
result

[Tree('NP', [('a', 'DT'), ('well-meaning', 'JJ'), ('but', 'CC'), ('selfish', 'JJ'), ('young', 'JJ'), ('woman', 'NN')])]

In [6]:
# Words that mark a noun phrase as referring to a female person.
female = [
    'female', 'woman', 'women', 'lady', 'ladies', 'dame',
    'girl', 'girls', 'housewife', 'wife', 'girlfriend', 'daughter',
    'sister', 'mother', 'stepmother', 'aunt', 'grandmother', 'gentlewoman',
]
# Words that mark a noun phrase as referring to a male person.
# NOTE(review): 'child' is gender-neutral yet is listed here; confirm this is intended.
male = [
    'man', 'men', 'gentleman', 'dude', 'guy', 'guys',
    'boy', 'boys', 'child', 'male', 'husband', 'boyfriend',
    'son', 'brother', 'father', 'stepfather', 'uncle', 'grandfather',
]


In [7]:
def isChuckPersonalFemale(chuck, vocabulary=None):
    """Tell whether a chunk contains a word from the female vocabulary.

    Parameters
    ----------
    chuck : iterable of (word, tag) pairs
        A noun phrase chunk; only the word (index 0) of each pair is inspected.
    vocabulary : collection of str, optional
        Lower-case words to look for. Defaults to the module-level
        ``female`` list.

    Returns
    -------
    bool
        True if at least one word of the chunk is in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = female
    # Compare in lower case so capitalised occurrences such as "Woman" or
    # "Female" still match the lower-case vocabulary.
    return any(item[0].lower() in vocabulary for item in chuck)

def isChuckPersonalMale(chuck, vocabulary=None):
    """Tell whether a chunk contains a word from the male vocabulary.

    Parameters
    ----------
    chuck : iterable of (word, tag) pairs
        A noun phrase chunk; only the word (index 0) of each pair is inspected.
    vocabulary : collection of str, optional
        Lower-case words to look for. Defaults to the module-level
        ``male`` list.

    Returns
    -------
    bool
        True if at least one word of the chunk is in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = male
    # Compare in lower case so capitalised occurrences such as "Man" or
    # "Male" still match the lower-case vocabulary.
    return any(item[0].lower() in vocabulary for item in chuck)

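Given a list of NP chunks, the function `getPersonalChucks` returns a tuple of two lists: the chunks that contain a word from the `female` list and the chunks that contain a word from the `male` list.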

In [8]:
def getPersonalChucks(chucks):
    """Sort chunks into female-related and male-related lists.

    Each chunk is tested with ``isChuckPersonalFemale`` and then with
    ``isChuckPersonalMale``. A chunk passing both tests lands in both lists;
    a chunk passing neither is dropped. Input order is kept.

    Returns a ``(female_chunks, male_chunks)`` tuple of lists.
    """
    female_matches, male_matches = [], []
    buckets = (
        (isChuckPersonalFemale, female_matches),
        (isChuckPersonalMale, male_matches),
    )
    for candidate in chucks:
        for is_match, bucket in buckets:
            if is_match(candidate):
                bucket.append(candidate)

    return (female_matches, male_matches)
    
    

Test: 

In [9]:
from nltk import Tree
# Hand-built NP chunks: three of them contain a word from the `female` list
# ('mother', 'women', 'wife') and none contains a word from the `male` list.
result = [Tree('NP', [('its', 'PRP$'), ('future', 'JJ'), ('sets', 'NNS')]), 
          Tree('NP', [('a', 'DT'), ('young', 'JJ'), ('mother', 'NN')]),
          Tree('NP', [('powerful', 'JJ'), ('forces', 'NNS')]), 
          Tree('NP', [('unmarried', 'JJ'), ('women', 'NNS')]),
          Tree('NP', [('a', 'DT'), ('spectacular', 'JJ'), ('battle', 'NN')]), 
          Tree('NP', [('his', 'PRP$'), ('pregnant', 'JJ'), ('wife', 'NN')])]

# Prints the (female_chunks, male_chunks) tuple.
print(getPersonalChucks(result))


([Tree('NP', [('a', 'DT'), ('young', 'JJ'), ('mother', 'NN')]), Tree('NP', [('unmarried', 'JJ'), ('women', 'NNS')]), Tree('NP', [('his', 'PRP$'), ('pregnant', 'JJ'), ('wife', 'NN')])], [])


In [10]:
def getSegments(chunk, year, movieId):
    """Flatten a chunk into ``[year, movieId, word, word, ...]``.

    Words tagged ``PRP$``, ``DT`` or ``CC`` are skipped; every other word is
    appended in lower case, in chunk order.
    """
    skipped_tags = ('PRP$', 'DT', 'CC')
    kept_words = [
        word_tag[0].lower()
        for word_tag in chunk
        if word_tag[1] not in skipped_tags
    ]
    # The year and movie id lead every segment.
    return [year, movieId] + kept_words
        
        

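Given a row (line) of the CSV with the details of a movie (id, title, 4 characters and overview) and the release year, the function `getAdjetives` extracts the NP chunks of the overview and returns two lists of segments: one for the chunks that mention a female word and one for the chunks that mention a male word.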

In [11]:
def getAdjetives(row, year):
    """Build the female and male segments for one movie row.

    Parameters
    ----------
    row : sequence of str
        One CSV record; index 0 is read as the movie id and index 14 as the
        overview text.
    year
        Release year, stored at the front of every segment.

    Returns
    -------
    tuple of (list, list)
        ``(female_segments, male_segments)``; each segment is the list built
        by ``getSegments``. Both lists are empty when no chunk matches.

    Raises
    ------
    IndexError
        If ``row`` has fewer than 15 fields.
    """
    movieId = row[0]
    overview = row[14]

    chucks = getTokensTagsAndNPs(overview)
    personal_chucks_female, personal_chucks_male = getPersonalChucks(chucks)

    # Iterating an empty list is a no-op, so no length guards are needed.
    female_segments = [getSegments(chunk, year, movieId) for chunk in personal_chucks_female]
    male_segments = [getSegments(chunk, year, movieId) for chunk in personal_chucks_male]

    return female_segments, male_segments

Test:

In [12]:
# Sample CSV record: getAdjetives reads the movie id at index 0 and the
# overview text at index 14.
# NOTE(review): indices 2-13 look like (character, actor, actor id) for four
# cast members; confirm against the CSV header.
row = ['337404', 'Cruella', 
       'Estella / Cruella', 'Emma Stone', '54693', 
       'The Baroness', 'Emma Thompson', '7056', 
       'Jasper', 'Joel Fry', '54811', 'Horace', 
       'Paul Walter Hauser', '1294982', 
       'In 1970s London amidst the punk rock revolution, a young girl named Estella is determined to make a name for herself with her designs. ' \
       'She befriends a pair of young women who appreciate her appetite for mischief, and together they are able to build a life for themselves on the London streets. ' \
       'One day, Estella’s flair for fashion catches the eye of the Baroness von Hellman, a fashion legend who is devastatingly chic and terrifyingly haute. '\
       'But their relationship sets in motion a course of events and revelations that will cause Estella to embrace her wicked side and become the raucous, fashionable and revenge-bent Cruella.']

# Only the female segments are displayed as the cell output.
female_segments, male_segments = getAdjetives(row, 2021)
female_segments

[[2021, '337404', 'young', 'girl'], [2021, '337404', 'young', 'women']]

   Open CSV:

In [13]:
# Accumulate the segments of every yearly movie file (1970 to 2021 inclusive).
total_female_segments = []
total_male_segments = []

for year in range(1970, 2022):
    file_title = 'csv/movies_'+str(year)+'.csv'
    # newline='' is what the csv module asks for: it lets the reader handle
    # line endings itself so newlines inside quoted fields are read correctly.
    with open(file_title, mode='r', encoding='utf-8', newline='') as file:
        csvFile = csv.reader(file, delimiter=';')

        # Skip the header row; the default avoids StopIteration on an empty file.
        csv_headings = next(csvFile, None)

        for line in csvFile:
            female_segments, male_segments = getAdjetives(line, year)
            # extend() with an empty list is a no-op, so no length checks are needed.
            total_female_segments.extend(female_segments)
            total_male_segments.extend(male_segments)
       
        


In [14]:
# Report how many segments were collected for each group (label and count on separate lines).
print("female: ", len(total_female_segments), sep="\n")
print("male: ", len(total_male_segments), sep="\n")

female: 
2191
male: 
2030


Write to csv

In [15]:
def getRowFromSegment(segment):
    """Convert a segment into a fixed-width CSV row.

    Parameters
    ----------
    segment : list
        ``[year, movie_id, adjective, ..., noun]``; the last item is taken as
        the noun and everything between the movie id and the noun as
        adjectives.

    Returns
    -------
    list
        ``[year, movie_id, adj1, adj2, adj3, adj4, noun]``. Missing adjective
        slots are filled with ``''``; adjectives beyond the fourth are dropped
        after a message is printed.

    Raises
    ------
    IndexError
        If the segment has fewer than three items.
    """
    max_adjectives = 4  # number of adjective columns in the output row

    if len(segment) < 3:
        raise IndexError("segment needs at least a year, a movie id and a noun")

    adjectives = list(segment[2:-1])
    if len(adjectives) > max_adjectives:
        print("Error! more than four adjetives!!!??")
        print("segment: ", segment)
        adjectives = adjectives[:max_adjectives]

    # Pad so every row has the same number of columns.
    padding = [''] * (max_adjectives - len(adjectives))

    return [segment[0], segment[1]] + adjectives + padding + [segment[-1]]

In [16]:
# Exercise getRowFromSegment with one to five adjectives; the last case
# exceeds the four adjective columns and triggers the printed message.
for segment in [
    [2019, 689523, 'black', 'woman'],
    [2019, 689523, 'black', 'old', 'woman'],
    [2019, 689523,  'black', 'old', 'pretty', 'woman'],
    [2019, 689523,  'black', 'old', 'pretty', 'blonde', 'woman'],
    [2019, 689523,  'black', 'old', 'pretty', 'blonde', 'selfish', 'woman'],
]:
    print(getRowFromSegment(segment))

[2019, 689523, 'black', '', '', '', 'woman']
[2019, 689523, 'black', 'old', '', '', 'woman']
[2019, 689523, 'black', 'old', 'pretty', '', 'woman']
[2019, 689523, 'black', 'old', 'pretty', 'blonde', 'woman']
Error! more than four adjetives!!!??
segment:  [2019, 689523, 'black', 'old', 'pretty', 'blonde', 'selfish', 'woman']
[2019, 689523, 'black', 'old', 'pretty', 'blonde', 'woman']


In [17]:
header = ['year', 'movieid', 'adj1', 'adj2', 'adj3', 'adj4', 'noun']

# The context manager closes the file even if building or writing a row raises,
# so no handle is leaked and buffered rows are flushed.
with open('csv/female_file.csv', 'w', encoding='UTF8', newline='') as female_file:
    # create the csv writer
    female_writer = csv.writer(female_file, delimiter=';')

    # write the header to the csv file
    female_writer.writerow(header)

    # write one fixed-width row per collected segment
    for segment in total_female_segments:
        female_writer.writerow(getRowFromSegment(segment))

Error! more than four adjetives!!!??
segment:  [1973, '70781', 'beautiful', 'thoroughly', 'modern', 'young', 'french', 'women']


In [18]:
# Rows built by getRowFromSegment have seven columns (year, movie id, four
# adjectives, noun), so the header must include 'movieid' to stay aligned.
header = ['year', 'movieid', 'adj1', 'adj2', 'adj3', 'adj4', 'noun']

# The context manager closes the file even if building or writing a row raises,
# so no handle is leaked and buffered rows are flushed.
with open('csv/male_file.csv', 'w', encoding='UTF8', newline='') as male_file:
    # create the csv writer
    male_writer = csv.writer(male_file, delimiter=';')

    # write the header to the csv file
    male_writer.writerow(header)

    # write one fixed-width row per collected segment
    for segment in total_male_segments:
        male_writer.writerow(getRowFromSegment(segment))