# Identifying gender from Wikipedia?
Can we correctly guess a person's gender simply by comparing the number of masculine and feminine pronouns in their Wikipedia article?

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import wikipedia
import re

## Quickly grab data from IMDb

In [3]:
def scrapePage(url):
    """Scrape one page of an IMDb list into name/gender records.

    Parameters
    ----------
    url : str
        Full URL of the list page to download (page number included).

    Returns
    -------
    list of dict
        One ``{'name': ..., 'gender': ...}`` dict per
        ``div.lister-item.mode-detail`` element found on the page.
        ``gender`` is ``'M'`` when the first ``|``-separated field of the
        item's muted text contains "Actor", ``'F'`` when it contains
        "Actress", and ``'?'`` otherwise.
    """
    r = requests.get(url)
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so results could differ between
    # machines.
    soup = BeautifulSoup(r.text, 'html.parser')
    boxes = soup.find_all('div', {'class': 'lister-item mode-detail'})
    boxList = []
    for box in boxes:
        name = box.find('h3', {'class': 'lister-item-header'}).a.text.strip()
        # The text before the first '|' is used as the role label.
        isActor = box.find('p', {'class': 'text-muted text-small'}).text.split('|')[0]
        if 'Actor' in isActor:
            gender = 'M'
        elif 'Actress' in isActor:
            gender = 'F'
        else:
            gender = '?'
        boxList.append({'name': name,
                        'gender': gender
                       })
    return boxList

# Uncomment the lines below to re-scrape the list (pages 1-10) and regenerate actorGenders.csv
# url = 'https://www.imdb.com/list/ls058011111/?sort=list_order,asc&mode=detail&page=' # must add page number
# allData = []
# for i in range(1, 11):
#     print(i)
#     allData += scrapePage(url + str(i))
# df = pd.DataFrame(allData)
# df.to_csv('actorGenders.csv', index=False)

# Load the name/gender table from the CSV file.
df = pd.read_csv('actorGenders.csv')
# Keep the first rows as a small preview frame and show its shape.
# (The former bare `df.head()` was dropped: its result was discarded
# and the same call is made on the next line.)
df_n = df.head()
df_n.shape

(5, 2)

In [4]:
# Keep only rows with a known label ('?' marks entries without one).
# .copy() makes the filtered frame independent of the original, so adding a
# column to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['gender'] != '?'].copy()
# Show a random sample of 10 rows.
df.sample(10)

Unnamed: 0,gender,name
417,M,Dwayne Johnson
723,M,Stanley Tucci
22,M,George Clooney
299,F,Kirsten Dunst
828,F,Sarah Michelle Gellar
833,F,Parker Posey
998,M,John Turturro
106,M,Joe Pesci
137,F,Norma Shearer
559,M,Richard Harris


## Implement Heuristic

In [5]:
# Pronouns counted as evidence for each gender.
male = ['he', 'his', 'him']
female = ['her', 'hers', 'she']
# Matches a standalone "i"; not used by genderGuess below.
regexM = re.compile(r"^i\W|\Wi\W")

# Word tokenizer: runs of word characters, so punctuation and newlines
# adjacent to a pronoun ("he,", "she.", "\nhis") do not hide it.
_wordPattern = re.compile(r"\w+")


def genderGuess(row, threshold=0):
    """Guess a person's gender from the pronouns in their Wikipedia article.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'name'`` (a DataFrame row or a dict); the
        value is used as the Wikipedia page title to look up.
    threshold : float, optional
        Minimum symmetric relative difference between the two pronoun
        counts, ``|he - she| / mean(he, she)`` (range 0 to 2), required to
        commit to a guess.

    Returns
    -------
    str
        ``'M'`` or ``'F'`` for the more frequent pronoun group, ``'?'``
        when the counts are tied (including no pronouns at all) or the
        difference is below ``threshold``, and ``'e'`` when the article
        could not be fetched.
    """
    name = row['name']
    # Only the lookup is best-effort; the arithmetic below is kept out of
    # the try so a counting problem is never misreported as a fetch error.
    try:
        cont = wikipedia.page(name).content.lower()
    except Exception:
        return 'e' # error

    maleWords = set(male)
    femaleWords = set(female)
    he = 0
    she = 0
    for w in _wordPattern.findall(cont):
        if w in maleWords:
            he += 1
        elif w in femaleWords:
            she += 1

    # A tie carries no evidence either way; this also covers he == she == 0,
    # which would otherwise divide by zero below.
    if he == she:
        return '?'

    percDif = abs(he - she) / ((he + she) / 2)
    if percDif < threshold:
        return '?' # does not pass the threshold
    return 'F' if she > he else 'M'

In [6]:
# Run the heuristic on every row (genderGuess does one Wikipedia lookup per
# call) and store the returned label in a new 'guess' column.
df['guess'] = df.apply(genderGuess, axis=1)

Robert De Niro
Jack Nicholson
Marlon Brando
Leonardo DiCaprio
Humphrey Bogart
Johnny Depp
Al Pacino
Denzel Washington
Laurence Olivier
Brad Pitt
Daniel Day-Lewis
Tom Cruise
Cary Grant
Dustin Hoffman
Clark Gable
Sean Penn
Christian Bale
Gregory Peck
Sidney Poitier
Harrison Ford
Spencer Tracy
George Clooney
Charlton Heston
Morgan Freeman
Katharine Hepburn
Meryl Streep
Ingrid Bergman
Marilyn Monroe
Jennifer Lawrence
Kate Winslet
Elizabeth Taylor
Cate Blanchett
Audrey Hepburn
Helen Mirren
Bette Davis
Viola Davis
Nicole Kidman
Natalie Portman
Jodie Foster
Judi Dench
Amy Adams
Julia Roberts
Diane Keaton
Grace Kelly
Shirley MacLaine
Reese Witherspoon
Charlize Theron
Judy Garland
John Wayne
Paul Newman
Anthony Hopkins
Matt Damon
Russell Crowe
Robert Duvall
James Dean
Kirk Douglas
Henry Fonda
Robin Williams
Orson Welles
Christoph Waltz
Heath Ledger
Sean Connery
Kevin Spacey
Gene Hackman
Liam Neeson
Edward Norton
Bruce Willis
Gary Cooper
Philip Seymour Hoffman
Robert Redford
Ralph Fiennes
Steve 

James Gandolfini
Michael B. Jordan
Anthony Perkins
David Thewlis
Klaus Kinski
Malcolm McDowell
Ray Winstone
Jean-Paul Belmondo
Andy Serkis
Matthew Broderick
Dennis Hopper
Michael Rooker
Vincent Gallo
Toshirô Mifune
Vincent Cassel
J.K. Simmons
Boris Karloff
Peter Lorre
Antonio Banderas
Paddy Considine
Ryan Phillippe
Kerry Washington
Carrie-Anne Moss
Janet Leigh
Catalina Sandino Moreno
Samantha Morton
Keisha Castle-Hughes
Salma Hayek
Joan Allen
Janet McTeer
Fernanda Montenegro
Kristin Scott Thomas
Brenda Blethyn
Elisabeth Shue
Miranda Richardson
Stockard Channing
Angela Bassett
Mary McDonnell
Lucille Ball
Laura Dern
Pauline Collins
Isabelle Adjani
Melanie Griffith
Sally Kirkland
Jane Alexander
Marsha Mason
Freddie Highmore
Michael J. Fox
Eddie Murphy
Yun-Fat Chow
Bruce Lee
Jet Li
Chuck Norris
Jean-Claude Van Damme
Sam Rockwell
Takashi Shimura
Richard E. Grant
Leslie Nielsen
Simon Pegg
John Malkovich
Michael Shannon
Martin Sheen
Christopher Guest
Alan Rickman
Jackie Earle Haley
Montgomery



  lis = BeautifulSoup(html).find_all('li')


Marlon Wayans
Jai Courtney
Albert Brooks
Macaulay Culkin
Sam Elliott
Jim Caviezel
Paul Scofield
Jared Harris
Harpo Marx
Jason Sudeikis
Josh Hartnett
Patrick Wilson
Cillian Murphy
Billy Crudup
Christopher Mintz-Plasse
Anthony Mackie
Timothy Olyphant
Josh Gad
Luke Wilson
Peter Sarsgaard
Rhys Ifans
Topher Grace
Kevin Hart
Paul Dano
Lenny Kravitz
Bill Hader
Rob Lowe
David Spade
Rob Riggle
Lance Henriksen
Richard Jenkins
Mike Vogel
Michael Ealy
Freddie Prinze Jr.
Jamie Bell
Danny McBride
Jonathan Rhys Meyers
Michael Cera
Bill Paxton
Chris Klein
Ioan Gruffudd
Andrew Lincoln
Bill Pullman
Craig Robinson
Charlie Day
Andy Samberg
Garrett Hedlund
Cam Gigandet
John Krasinski
Kevin James
Breckin Meyer
Christopher Reeve
James McAvoy
Rufus Sewell
Ned Beatty
John Cleese
John Turturro
Jerry Lewis


## Does it work?

In [8]:
# Rows where the heuristic's label differs from the known label
# (this includes rows whose guess is '?' or 'e').
df.loc[df['guess'].ne(df['gender'])]

Unnamed: 0,gender,name,guess
13,M,Cary Grant,?
940,M,Common,?


In [11]:
# Call the heuristic directly with a plain dict instead of a DataFrame row.
# NOTE(review): presumably a sanity check on a non-person title - confirm intent.
genderGuess({'name': 'The Color Purple'})

The Color Purple


'F'