# Example:
### Scraping information from a website.
### Step 1.
First we import all the tools we are going to use.

In [1]:
import pandas as pd
import requests, re
import bs4
from bs4 import BeautifulSoup
import numpy as np
import time 
from time import sleep
from random import randint
from pandas import json_normalize
import nltk
# nltk.download() #this opens up a gui, uncomment on your first run

In [2]:
from nltk.corpus import stopwords

### Step 2: Import dataset
We are interested in the ethnicities of actors from a specific dataset. Now we import our actors dataset(previously cleaned) as actors.
If your dataset is already clean then skip down to step 3. 

Or you can import my cleaned dataset with
> actors = pd.read_csv('actors.csv').


### Clean up (OPTIONAL)

In [3]:
#quick clean up
df_names = pd.read_csv('IMDb names.csv')
df_names.head()

Unnamed: 0,imdb_name_id,name,birth_name,height,bio,birth_details,date_of_birth,place_of_birth,death_details,date_of_death,place_of_death,reason_of_death,spouses_string,spouses,divorces,spouses_with_children,children
0,nm0000001,Fred Astaire,Frederic Austerlitz Jr.,177.0,"Fred Astaire was born in Omaha, Nebraska, to J...","May 10, 1899 in Omaha, Nebraska, USA",1899-05-10,"Omaha, Nebraska, USA","June 22, 1987 in Los Angeles, California, USA ...",1987-06-22,"Los Angeles, California, USA",pneumonia,Robyn Smith (27 June 1980 - 22 June 1987) (hi...,2,0,1,2
1,nm0000002,Lauren Bacall,Betty Joan Perske,174.0,Lauren Bacall was born Betty Joan Perske on Se...,"September 16, 1924 in The Bronx, New York City...",1924-09-16,"The Bronx, New York City, New York, USA","August 12, 2014 in New York City, New York, US...",2014-08-12,"New York City, New York, USA",stroke,Jason Robards (4 July 1961 - 10 September 196...,2,1,2,3
2,nm0000003,Brigitte Bardot,Brigitte Bardot,166.0,"Brigitte Bardot was born on September 28, 1934...","September 28, 1934 in Paris, France",1934-09-28,"Paris, France",,,,,Bernard d'Ormale (16 August 1992 - present)\n...,4,3,1,1
3,nm0000004,John Belushi,John Adam Belushi,170.0,"John Belushi was born in Chicago, Illinois, US...","January 24, 1949 in Chicago, Illinois, USA",1949-01-24,"Chicago, Illinois, USA","March 5, 1982 in Hollywood, Los Angeles, Calif...",1982-03-05,"Hollywood, Los Angeles, California, USA",acute cocaine and heroin intoxication,Judith Belushi-Pisano (31 December 1976 - 5 M...,1,0,0,0
4,nm0000005,Ingmar Bergman,Ernst Ingmar Bergman,179.0,"Ernst Ingmar Bergman was born July 14, 1918, t...","July 14, 1918 in Uppsala, Uppsala län, Sweden",1918-07-14,"Uppsala, Uppsala län, Sweden","July 30, 2007 in Fårö, Gotlands län, Sweden (...",2007-07-30,"Fårö, Gotlands län, Sweden",natural causes,Ingrid Bergman (11 November 1971 - 20 May 199...,5,4,5,8


In [4]:
# drop height and other details, keep names and name id
df_names = df_names[['imdb_name_id','name','date_of_birth','place_of_birth','bio']]
df_names = df_names.rename(columns={'date_of_birth':'dob', "place_of_birth":'pob'})
df_names.head()


Unnamed: 0,imdb_name_id,name,dob,pob,bio
0,nm0000001,Fred Astaire,1899-05-10,"Omaha, Nebraska, USA","Fred Astaire was born in Omaha, Nebraska, to J..."
1,nm0000002,Lauren Bacall,1924-09-16,"The Bronx, New York City, New York, USA",Lauren Bacall was born Betty Joan Perske on Se...
2,nm0000003,Brigitte Bardot,1934-09-28,"Paris, France","Brigitte Bardot was born on September 28, 1934..."
3,nm0000004,John Belushi,1949-01-24,"Chicago, Illinois, USA","John Belushi was born in Chicago, Illinois, US..."
4,nm0000005,Ingmar Bergman,1918-07-14,"Uppsala, Uppsala län, Sweden","Ernst Ingmar Bergman was born July 14, 1918, t..."


### For the purposes of a quick explanation, assume we only want actors born in California or Poland and born on or after 1950.


In [5]:
df_names=df_names.dropna() # <- not recommended but used for the purposes of my example
desired_date = '1950-01-01'
df_desired_names = df_names[ (df_names['dob'] >= desired_date) & (df_names['pob'].str.contains('California|Texas')) & (df_names['bio'].str.contains('actor|actress|acting|acted')   ) ]
# df_desired_names.head()
len(df_desired_names)

2951

#### our dataframe is still pretty big so let's just use the first 30 results and store them in a dataframe named actors.
    

In [6]:
actors = df_desired_names.head(30)
# actors = df_desired_names.head(10)
actors.head()

Unnamed: 0,imdb_name_id,name,dob,pob,bio
96,nm0000098,Jennifer Aniston,1969-02-11,"Sherman Oaks, California, USA","Jennifer Aniston was born in Sherman Oaks, Cal..."
101,nm0000103,Fairuza Balk,1974-05-21,"Point Reyes, California, USA","""Fairuza!"" (""Turquoise"" in Farsi), her father ..."
104,nm0000106,Drew Barrymore,1975-02-22,"Culver City, California, USA",Since melting audiences' hearts - at the age o...
113,nm0000115,Nicolas Cage,1964-01-07,"Long Beach, California, USA",Nicolas Cage was born Nicolas Kim Coppola in L...
124,nm0000126,Kevin Costner,1955-01-18,"Lynwood, California, USA","Kevin Michael Costner was born on January 18, ..."


In [7]:
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
## TODO change desired_var to desired lookup string, ethnicity to lookup var
ethnicity = []
desired_var = "ethnicity"


### Step 3: Using a specific website. 
Using nndb to grab ethnicity. Since this website displays an array of other persons when searching, we will store the desired link to the actual person and then perform a second loop to scrape the wanted variable.

Example: searching for Carrie Fisher yields me a list of other names. I want the link to her nndb profile so I print all the available links.
 Using the inspect tool on Chrome shows me that the hyperlink I want is http://www.nndb.com/people/899/000022833/ or the 3rd link.
The first 5 links. 
>http://www.nndb.com/

>http://search.nndb.com/search/nndb.cgi?n=Carrie+Fisher&omenu=unspecified&offset=20

>http://www.nndb.com/people/899/000022833/  <- the one we want

>http://www.nndb.com/people/419/000109092/

>http://www.nndb.com/people/023/000101717/


In [8]:
# I test my initial functions on a small list of queries
queries =[ 'Carrie Fisher','Glenson Chatman','Glendon Chatman']
#            many results |   no results    |   two results
for name in queries:
    query = name.replace(' ', '+')
    URL = f"http://search.nndb.com/search/nndb.cgi?nndb=1&omenu=unspecified&query={query}"
    resp = requests.get(URL,headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        print(query)
        actorLink = '-1'
        for ind, link in enumerate(soup.findAll('a')[2:]):
            print(link) 
            if link.text == name :
                actorLink = link.get('href')
                break
        print('actorlink=',actorLink)
# firstFiveLinks = ['\n'+ i.get('href') for i in soup.findAll('a')[:5]]
# print(*firstFiveLinks)

Carrie+Fisher
<a href="http://www.nndb.com/people/899/000022833/">Carrie Fisher</a>
actorlink= http://www.nndb.com/people/899/000022833/
Glenson+Chatman
actorlink= -1
Glendon+Chatman
<a href="http://www.nndb.com/people/978/000173459/">Mary Ann Glendon</a>
<a href="http://commentary.nndb.com/submit/feedback/">Make a comment</a>
actorlink= -1


### Step 3: cont'd

Using BS4 we're going to make 2 requests to the website per name in the dataframe. The first one to find their actual profile and a second request to grab all their data.

We will use a dictionary to store the information, this will make dataframe conversion easier. If the person does not exist on the first request, we will set their dictionary entry to '-1'.

We don't want to do any data cleanup yet, a dataframe with more than 1000 entries will slow down our performance if it has to do multiple requests and cleanup.


In [30]:
i =0
links = {}
ethnDb = {}
start = time.time()
def scrapeWebsite(actors, ethnDb):
    for name in actors['name']:
        query = name
        query = query.replace(' ', '+')
        URL = f"http://search.nndb.com/search/nndb.cgi?nndb=1&omenu=unspecified&query={query}"
        print(name)
        resp = requests.get(URL,headers)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, "html.parser")
            # Usually 3rd link but if name DNE or we less than 3 results we don't want a list index error
            links[name] = '-1'
            for ind, link in enumerate(soup.findAll('a')[2:]):
                print(link) 
                if link.text == name :
                    links[name] = link.get('href')
                    break
            print(links[name])
            if links[name] != '-1':
                resp2 = requests.get(links[name])
                if resp2.status_code == 200:
                    soup = BeautifulSoup(resp2.content,'html.parser')
                    infoP = ''
                    for ind,text in enumerate(soup.findAll('p')):
                        # print(text.find('b',text="Race"))
                        # print(f'P {ind}:',text)
                        if 'or Ethnicity' in text.text:
                            infoP = text.text
                            # print(infoP)
                            break
                    # we clean up this line of text later
                    actorInfo = (infoP)
                    ethnDb[name] = actorInfo
            else:
                ethnDb[name] = '-1'

        
        sleep(0.5)

scrapeWebsite(actors, ethnDb)
end = time.time()
# print(end-start)
# 52 seconds for one loop
# _34_ seconds for 2 website request, 30 names
# ___ seconds for 2 for loops

In [10]:
print('FINISHED')
print(end-start)

FINISHED
24.704447984695435


### Step 4: Append new data to existing dataframe.
First I convert my ethnDb dictionary into a dataframe and join it to actors on the name variable.
Then I reset the index on actors to name.

In [11]:

df_eth = pd.DataFrame.from_dict(ethnDb,orient='index')
actors = actors.join(df_eth, on='name')
actors =actors.set_index('name')
actors.head()

Unnamed: 0_level_0,imdb_name_id,dob,pob,bio,0
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jennifer Aniston,nm0000098,1969-02-11,"Sherman Oaks, California, USA","Jennifer Aniston was born in Sherman Oaks, Cal...","Born: 11-Feb-1969Birthplace: Sherman Oaks, CAG..."
Fairuza Balk,nm0000103,1974-05-21,"Point Reyes, California, USA","""Fairuza!"" (""Turquoise"" in Farsi), her father ...",AKA Fairuza Alejandra BalkBorn: 21-May-1974Bir...
Drew Barrymore,nm0000106,1975-02-22,"Culver City, California, USA",Since melting audiences' hearts - at the age o...,AKA Drew Blyth BarrymoreBorn: 22-Feb-1975Birth...
Nicolas Cage,nm0000115,1964-01-07,"Long Beach, California, USA",Nicolas Cage was born Nicolas Kim Coppola in L...,AKA Nicolas Kim CoppolaBorn: 7-Jan-1964Birthpl...
Kevin Costner,nm0000126,1955-01-18,"Lynwood, California, USA","Kevin Michael Costner was born on January 18, ...",AKA Kevin Michael CostnerBorn: 18-Jan-1955Birt...


In [12]:
actors =actors.rename(columns={0:'raw_ethnicity'})
actors.head()

Unnamed: 0_level_0,imdb_name_id,dob,pob,bio,raw_ethnicity
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jennifer Aniston,nm0000098,1969-02-11,"Sherman Oaks, California, USA","Jennifer Aniston was born in Sherman Oaks, Cal...","Born: 11-Feb-1969Birthplace: Sherman Oaks, CAG..."
Fairuza Balk,nm0000103,1974-05-21,"Point Reyes, California, USA","""Fairuza!"" (""Turquoise"" in Farsi), her father ...",AKA Fairuza Alejandra BalkBorn: 21-May-1974Bir...
Drew Barrymore,nm0000106,1975-02-22,"Culver City, California, USA",Since melting audiences' hearts - at the age o...,AKA Drew Blyth BarrymoreBorn: 22-Feb-1975Birth...
Nicolas Cage,nm0000115,1964-01-07,"Long Beach, California, USA",Nicolas Cage was born Nicolas Kim Coppola in L...,AKA Nicolas Kim CoppolaBorn: 7-Jan-1964Birthpl...
Kevin Costner,nm0000126,1955-01-18,"Lynwood, California, USA","Kevin Michael Costner was born on January 18, ...",AKA Kevin Michael CostnerBorn: 18-Jan-1955Birt...


### Step 5: More data cleanup
Unfortunately, this website is not formatted consistently and does not use names to distinguish tags.
We'll have to use a combination of regex and NLTK to clean up our data.

We can print out 2 rows to see what data is stored in the 'raw_ethnicity' column. 
We can see that our desired data is either nested in between "Religion" and "Sexual Orientation" or between "Gender" and some other random word.
There are also some relevant keywords near the bottom of the page.

In [13]:
exampleStr1 = actors.loc['Val Kilmer', 'raw_ethnicity']
exampleStr2 = actors.loc['Jennifer Tilly','raw_ethnicity']
print(exampleStr1,exampleStr2)


AKA Val Edward KilmerBorn: 31-Dec-1959Birthplace: Los Angeles, CAGender: MaleReligion: Christian ScienceRace or Ethnicity: WhiteSexual orientation: StraightOccupation: ActorNationality: United StatesExecutive summary: Real GeniusGirlfriend: Mare WinninghamGirlfriend: CherGirlfriend: Ellen Barkin (dated 1984)Wife: Joanne Whalley (actress, m. 28-Feb-1988, div. 1996, one daughter, one son)Daughter: Mercedes (b. 1992)Son: Jack (b. 1995)Girlfriend: Michelle PfeifferGirlfriend: Cindy Crawford (dated 1996)Girlfriend: Jaycee Gossett (dated 1998-2000)Girlfriend: Daryl Hannah (dated 2001-02)Girlfriend: Elisabeth Shue (uncertain)Girlfriend: Angelina Jolie (uncertain)    High School: Chatsworth High School, Chatsworth, CA (1977)    Cherokee Ancestry 
    Risk Factors: Smoking

    FILMOGRAPHY AS ACTOR    The Super (14-Oct-2017)     The Snowman (7-Oct-2017)     Song to Song (10-Mar-2017)     The Spoils of Babylon (9-Jan-2014)     Palo Alto (29-Aug-2013)     Planes (9-Aug-2013) [VOICE]    Riddle (10

### Contd:
We make 2 functions, one to clean the word from strays and whitespace.
The other uses sent_tokenize and word_tokenize to split strings by sentence or words. I also use stopwords.words() to remove filler words such as 'i', 'he', etc.

TO show off some of the function re-writes I had to do, here is the 2nd version of clean data. Using split was very unreliable as I soon learned that the website entries were not formatted consistently. Some of my 'clean' entries were completely wrong.

In [14]:
# version 2
def version2(s):
    ethnicities = []
    sentences = nltk.sent_tokenize(s)
    lessWord = [s for s in sentences if not s in stopwords.words()]
    # the data we needed wouldn't always be before Sexual Orientation
    part1 = lessWord[0].split(' orientation')[0].split('Ethnicity:')[-1]
    ancestry = ''
    for sent in lessWord:
        if 'Ancestry' in sent:
            ancestry = sent
            break
    ancestry = ancestry.split('Risk')[0]
    ancestry = ancestry.split("\xa0 \xa0 ")
    if len(ancestry[-1]) > 0:
        ancestry = ancestry[-1]
    else:
        ancestry = ancestry[-2]
    
    ethnicities.append(cleanWord(part1))
    ethnicities.append(cleanWord(ancestry))
    return ethnicities

In [21]:

def cleanWord(s):
    if len(s) <2:
        return s
    removeWhite = s.strip()
    removeWhite = removeWhite.replace('Sexual','')
    removeWhite = removeWhite.replace('Ancestry','')
    removeWhite = removeWhite.replace('Race or Ethnicity:','')
    # print(removeWhite)
    return(removeWhite.strip())

### Step 5 cont'd:
To make your own regex, you can use regex101.com.

In [22]:
def cleanData(s):
    if s == '-1':
        return s
    ethnicities = []
    sentences = nltk.sent_tokenize(s)
    lessWord = [s for s in sentences if not s in stopwords.words()]

    regex1 = re.compile(r'Race or Ethnicity: [A-Z]([a-z])*')
    regex2 = re.compile(r'[A-Z]([a-z])* Ancestry')

    part1 = re.finditer(regex1,lessWord[0])
    for match in part1:
        ethnicities.append(cleanWord(match.group()))
    ancestry = ''
    for sent in lessWord:
        if 'Ancestry' in sent:
            ancestry = sent
            break
    matches = re.finditer(regex2, ancestry)
    for match in matches:
        ethnicities.append(cleanWord(match.group()))
    
    # ethnicities.append(cleanWord(part1))
    # ethnicities.append(cleanWord(ancestry))
    return ethnicities
# tokenWords(exampleStr2)

### Step 5: Con't
Now we do this to all strings in the ethnicity column. It's easier to convert into a function and apply it to the dataframe.

I like to make a copy of the results in case I want to alter my functions.


In [23]:
copy = actors['raw_ethnicity'].apply(cleanData)
copy

name
Jennifer Aniston                        [White, Greek]
Fairuza Balk                                   [White]
Drew Barrymore                      [White, Hungarian]
Nicolas Cage                                   [White]
Kevin Costner                   [White, German, Irish]
Jamie Lee Curtis                               [White]
Bo Derek                                       [White]
Leonardo DiCaprio                                   -1
Cameron Diaz           [White, Cuban, English, German]
Erika Eleniak                                       -1
Jodie Foster                                   [White]
Gina Gershon           [White, French, Russian, Dutch]
Tom Hanks                                      [White]
Teri Hatcher                                   [White]
Ethan Hawke                                    [White]
Helen Hunt                             [White, Jewish]
Ashley Judd                           [White, Italian]
Val Kilmer                           [White, Cherokee]
Heath

In [28]:
# to view all our unique values, we use unique but we add explode() because our data type is a list.
copy.explode().unique() 

array(['White', 'Greek', 'Hungarian', 'German', 'Irish', '-1', 'Cuban',
       'English', 'French', 'Russian', 'Dutch', 'Jewish', 'Italian',
       'Cherokee', 'Scottish', 'Multiracial', 'Chinese'], dtype=object)

### Step 6: Additional Cleanup 

I want to see which rows have a -1 value.
Since Leonardo DiCaprio is a famous name, I know that he must have a page that wasn't properly requested.


In [65]:
copy.loc[copy[:] == '-1']

name
Leonardo DiCaprio    -1
Erika Eleniak        -1
Renée Zellweger      -1
Name: raw_ethnicity, dtype: object

In [72]:
queries = pd.DataFrame(copy.loc[copy[:] == '-1'])
queries.reset_index(level=0, inplace=True)
scrapeWebsite(queries, ethnDb)

Leonardo DiCaprio
<a href="http://www.nndb.com/people/813/000029726/">Leonardo Da Vinci</a>
<a href="http://www.nndb.com/people/922/000095637/">Leonardo Fibonacci</a>
<a href="http://www.nndb.com/people/253/000282404/">Leonardo Cimino</a>
<a href="http://www.nndb.com/people/901/000163412/">Bar Refaeli</a>
<a href="http://www.nndb.com/people/971/000044839/">Rocco DiSpirito</a>
<a href="http://www.nndb.com/people/869/000089602/">Leo von Caprivi</a>
<a href="http://www.nndb.com/people/149/000170636/">John H. Dasburg</a>
<a href="http://www.nndb.com/people/931/000174409/">John P. DesBarres</a>
<a href="http://www.nndb.com/people/514/000092238/">Camillo Benso di Cavour</a>
<a href="http://www.nndb.com/people/611/000093332/">Josquin Des Prez</a>
<a href="http://www.nndb.com/people/401/000403189/">Pierre de Coubertin</a>
<a href="http://www.nndb.com/people/072/000130679/">Michael Des Barres</a>
<a href="http://www.nndb.com/people/487/000097196/">Bonaventure des Périers</a>
<a href="http://com

### Step 7: last fixes (optional)
We can see that because scrapeWebsite only looks at and after the 3rd <a>, it misses it's rightful link. 
We can either look at all <a> tags or start from the 2nd one.
FIXED scrapeWebsite below.

You can run this function on the rest of the queries.

In [71]:
def scrapeWebsiteFixed(actors, ethnDb):
    for name in actors['name']:
        query = name
        query = query.replace(' ', '+')
        URL = f"http://search.nndb.com/search/nndb.cgi?nndb=1&omenu=unspecified&query={query}"
        print(name)
        resp = requests.get(URL,headers)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, "html.parser")
            # Usually 3rd link but if name DNE or we less than 3 results we don't want a list index error
            links[name] = '-1'
            for ind, link in enumerate(soup.findAll('a')[1:]):
                print(link) 
                if link.text == name :
                    links[name] = link.get('href')
                    break
            print(links[name])
            if links[name] != '-1':
                resp2 = requests.get(links[name])
                if resp2.status_code == 200:
                    soup = BeautifulSoup(resp2.content,'html.parser')
                    infoP = ''
                    for ind,text in enumerate(soup.findAll('p')):
                        if 'or Ethnicity' in text.text:
                            infoP = text.text
                            # print(infoP)
                            break
                    # we clean up this line of text later
                    actorInfo = (infoP)
                    ethnDb[name] = actorInfo
            else:
                ethnDb[name] = '-1'

        
        sleep(0.5)


Unnamed: 0,name,raw_ethnicity
0,Leonardo DiCaprio,-1
1,Erika Eleniak,-1
2,Renée Zellweger,-1


### Final Step: Add our edited dataframe

In [19]:
# export as csv in case work is lost
df2.to_csv('uncleanedData.csv')


NameError: name 'df2' is not defined