Say, for example, you wanted to analyze movies from IMDB. IMDB doesn't expose an official API for accessing that information. There are unofficial APIs that query IMDB's database, but let's see how to do it ourselves.

In [None]:
# requests for fetching html of website
import requests

#The docs for BeautifulSoup can be found on https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup

# Make the request to a url. The timeout stops the cell from hanging forever if the server never answers
r = requests.get('https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature', timeout=30)

# Fail loudly on an HTTP error (e.g. 403 or 404) instead of quietly parsing an error page
r.raise_for_status()

# Create soup from content of request
c = r.content

# Name the parser explicitly: without it BeautifulSoup emits a warning and uses whichever parser
# happens to be installed, so the same page can be parsed differently on different machines.
# "html.parser" ships with python, so it needs no extra install
soup = BeautifulSoup(c, "html.parser")

In [None]:
#contains the entire HTML content of the webpage, as parsed by BeautifulSoup
#a bare expression on the last line of a notebook cell is shown as the cell's output
soup

In [None]:
#find returns a single result: the first <span> with the class "text-primary"
ranks = soup.find("span", class_="text-primary")
ranks



Oops, this seems to give us only the first thing matching all criteria. What if we want all of them?

In [None]:
#find_all takes the same arguments as find but returns every match instead of only the first
ranks = soup.find_all("span", class_="text-primary")
print(ranks)

If you're more comfortable writing CSS selectors, BeautifulSoup provides `select`, a method that accepts arbitrarily complex CSS selectors.

In [None]:
#note that there is no space here. Adding a space ("span .text-primary") changes the meaning to "find all elements with the text-primary class that are nested anywhere inside a span"
ranks = soup.select("span.text-primary")
print(ranks)

In [None]:
def getTextFromElement(tag):
  """Return the ``string`` attribute of ``tag``, i.e. the text held by a scraped element."""
  text = tag.string
  return text

#map is lazy: it returns an iterator that only calls getTextFromElement when an item is requested
ranks_string = map(getTextFromElement,ranks)

#list() walks the iterator and collects every result so the notebook can display them
#(the iterator is used up afterwards, so calling list(ranks_string) a second time gives [])
list(ranks_string)

In [None]:
def getIntegerFromString(str):
  """Parse numeric text into an int, dropping anything after the decimal point.

  Parsing as a float first accepts text such as "1." or "7.9", which int() alone would reject.
  Note: the parameter name hides the builtin ``str`` inside this function.
  """
  as_float = float(str)
  return int(as_float)

#that isn't really what we want, let's make it numeric
def getIntegerFromElement(tag):
  """Return the text of ``tag`` converted to an int."""
  #tag.string gets us the text from the current element
  text = tag.string
  #the helper parses the text as a float (a numeric type which allows decimals)
  #and then turns that float into a number without a decimal point
  return getIntegerFromString(text)

#one int per rank element; a list comprehension builds the same list as list(map(...))
ranks_int = [getIntegerFromElement(rank_tag) for rank_tag in ranks]

#if this was too long, we could just print len(ranks_int) to make sure we have the right size
print(ranks_int)
print(type(ranks_int[0]))


Now let's move onto titles

In [None]:
#text of every link found inside a movie header
titles = [getTextFromElement(link) for link in soup.select(".lister-item-header a")]

print(len(titles))
print(titles)
print(type(titles[0]))

In [None]:
#"A + B" selects a B element that comes directly after a sibling A,
#so this picks the .text-muted element that immediately follows each ratings bar
descriptions = [
  getTextFromElement(element)
  for element in soup.select(".ratings-bar + .text-muted")
]

print(len(descriptions))
print(descriptions)

There are some extra characters in front of each description. Let's clean those up.

In [None]:
#if you want to see a visual explanation of what a regex does, use https://regex101.com/ and select the "python" flavor
import re


#We want to match newline characters. In a normal python string the backslash has to be doubled ("\\n")
#so that the regex engine receives the two characters \n, which it reads as "newline".
#Writing the raw string r"\n" gives the regex engine exactly the same pattern without the doubling.
newLineRegex = re.compile("\\n")

def replaceNewLine(tagValue):
  """Return ``tagValue`` with every newline character removed."""
  return re.sub(newLineRegex, "", tagValue)

In [None]:
#check which types the scraped values really have before cleaning them
#(tag.string hands back BeautifulSoup's NavigableString rather than the builtin str - see the bs4 docs)
print(type(descriptions[0]))
print(type(descriptions[0].string))
print(type(titles[0]))

def convertTagToString(tag):
  """Convert a scraped value (e.g. a BeautifulSoup NavigableString) into the python native type ``str``.

  str() already returns a native unicode string, so no round trip through UTF-8 bytes is needed.
  Note that a missing value (None) comes back as the text "None".
  """
  return str(tag)

#turn every scraped description into a plain python string
descriptionStrings = [convertTagToString(description) for description in descriptions]

#confirm the type changed and look at one converted value
print(type(descriptionStrings[0]))
print(descriptionStrings[0])

In [None]:
#strip the newline characters out of every description
updatedDescriptions = [replaceNewLine(text) for text in descriptionStrings]
print(updatedDescriptions)

In [None]:
extraSpaceRegex = re.compile(r"\s{2,}") #Advanced: look for a run of two or more whitespace characters

def replaceExtraSpaces(tagValue):
  """Collapse every run of two or more whitespace characters into one space and trim both ends.

  Substituting the empty string would glue neighbouring words together ("a  b" -> "ab"),
  so one space is kept and the leftover leading/trailing whitespace is stripped afterwards.
  """
  return extraSpaceRegex.sub(" ",tagValue).strip()

In [None]:
#apply the whitespace clean-up to every description
cleanDescriptions = [replaceExtraSpaces(text) for text in updatedDescriptions]
print(cleanDescriptions)

Let's grab runtimes now

In [None]:
#text of every .runtime element nested inside a .text-muted element
runtimeDataRaw = [getTextFromElement(tag) for tag in soup.select(".text-muted .runtime")]
print(runtimeDataRaw[0])
print(len(runtimeDataRaw))

In [None]:
#Advanced: find one or more whitespace characters followed by the text 'min'
#This must be a raw string: in a normal string literal "\s" is an invalid escape sequence, which python warns about
runtimeRegex=re.compile(r"\s+min")

def cleanRuntime(tag):
  """Return the runtime text with the " min" unit removed, e.g. "107 min" -> "107"."""
  return runtimeRegex.sub("",tag)

#raw scraped values -> plain strings -> strings without the unit
stringRuntimes = [convertTagToString(raw) for raw in runtimeDataRaw]
runtimes = [cleanRuntime(text) for text in stringRuntimes]

print(runtimes[0])
print(len(runtimes))

#finally turn the remaining digits into real numbers
numericRuntimes = [getIntegerFromString(text) for text in runtimes]
print(type(numericRuntimes[0]))

In [None]:
#text of every element with the genre class
rawGenres = [getTextFromElement(tag) for tag in soup.select(".genre")]
print(rawGenres[0])
print(len(rawGenres))

There's a world where we could create a column for each genre type so a movie could have multiple genres. For simplicity, let's take the first one

In [None]:
#same two clean-up steps as for the descriptions: convert to plain strings, then drop the newlines
genreStrings = [convertTagToString(raw) for raw in rawGenres]
genreNoNewLine = [replaceNewLine(text) for text in genreStrings]
print(genreNoNewLine)

In [None]:
#one or more whitespace characters (spaces, tabs, newlines)
#This must be a raw string: in a normal string literal "\s" is an invalid escape sequence, which python warns about
allSpaceRegex = re.compile(r"\s+")

def removeAllSpaces(str):
  """Return the text with every whitespace character removed.

  Note: the parameter name hides the builtin ``str`` inside this function.
  """
  return allSpaceRegex.sub("", str)
  
#remove the whitespace from every genre string
genreList = [removeAllSpaces(genre) for genre in genreNoNewLine]

print(genreList)

In [None]:
#Advanced: a period is a wildcard character in regex, so this pattern reads
#"a comma followed by 0 or more characters of any kind" (the wildcard stops at a newline)
firstElementRegex = re.compile(",.*")

def firstElementOnly(str):
  """Drop the first comma and everything after it, e.g. "Action,Adventure" -> "Action".

  Note: the parameter name hides the builtin ``str`` inside this function.
  """
  return re.sub(firstElementRegex, "", str)

#keep only the first genre of every movie
singleGenreList = [firstElementOnly(genres) for genres in genreList]
print(singleGenreList)

In [None]:
#now let's make it categorical
import pandas as pd

#dtype='category' asks pandas to store the genres as categorical data
genreSeries = pd.Series(singleGenreList, dtype='category')

#last expression in the cell, so the notebook displays the series
genreSeries

If you want to practice, try getting the rating data into numeric form (using CSS selector '.ratings-imdb-rating strong') and votes into numeric form (using CSS selector '.sort-num_votes-visible span:nth-child(2)').

You can also practice categorical data with actors ('.lister-item-content .ghost+ a') and directors ('.text-muted+ p a:nth-child(1)')

In [None]:
#Metascores
rawMetascores = list(map(getTextFromElement, soup.select('.metascore')))
print(rawMetascores)

#fun syntax trick: instead of using map, we can use a list comprehension in Python
#
#The structure is:
#  expression (e.g. what you want to do for each element)
#  for temporaryVariableName (representing an individual element)
#  in collection (your original list)
metaScoreStrings = [convertTagToString(rawScore) for rawScore in rawMetascores]

#you can make your expressions arbitrarily complex -> it gets harder to read but allows you to iterate over your dataset once
numericMetascores = [
  getIntegerFromString(removeAllSpaces(convertTagToString(rawScore)))
  for rawScore in rawMetascores
]
print(numericMetascores)
print(len(numericMetascores))

Uh oh, this list doesn't have the same length as the others. How did this happen? What can we do about it? What would happen if we attempted to make a dataset with this vector?

In [None]:
#text of every gross span, converted to plain python strings
rawGrossData = [
  convertTagToString(getTextFromElement(span))
  for span in soup.select(".ghost ~ .text-muted + span")
]

#Advanced: ^ signifies the start of the string and $ signifies the end of the string.
#A regex can match anywhere in the string; the anchors say it must match the entire string.
#This regex says the string must start with a "$", capture any number of characters, then end with an "M".
#findall returns a list in case the regex matches the string multiple times or has multiple groups,
#so we simply take the first (and in this case only) element
grossDataNumericString = [re.findall(r"^\$(.*)M$", text)[0] for text in rawGrossData]

#float is applied to every captured number string
grossDataNumeric = list(map(float, grossDataNumericString))
print(grossDataNumeric)
print(len(grossDataNumeric))

One way to fill the data is to go back, look at the source, and manually determine which elements are missing data. This obviously doesn't scale for large datasets, but let's do it as an example so that this list ends up the same length as the others while each value still lines up with the correct movie.

In [None]:
#these are the movie numbers from 1-100 (1-based positions on the page) that have no gross figure
indicesMissingGross = [20, 60, 61, 74, 80,84,87,97]

#everything is 0 indexed, so movie number i belongs at index i-1 in our list
#(e.g. the 36th movie lives at index 35).
#list.insert(i-1, None) puts None at index i-1 and shifts the element that was there, and everything
#after it, one place to the right. That is the same result as slicing the list in two at i-1,
#appending None to the first half and extending it with the second half.
#The numbers have to be handled in ascending order: every earlier insertion moves the later data
#into its final position. sorted() guarantees that order even if the list above is edited carelessly.
for i in sorted(indicesMissingGross):
  grossDataNumeric.insert(i-1, None)

print(grossDataNumeric)
print(len(grossDataNumeric))

In [None]:
#one named Series per column; axis=1 places them side by side as the columns of a single table
imdbData = pd.concat(
  [
    pd.Series(ranks_int, name='rank'),
    pd.Series(titles, name='title'),
    pd.Series(cleanDescriptions, name='description'),
    pd.Series(numericRuntimes, name='runtime'),
    pd.Series(genreSeries, name='genre'),
    pd.Series(grossDataNumeric, name='Gross'),
  ],
  axis=1,
)
imdbData

In [None]:
#try iterating over each movie element and pull only the relevant fields

movieTags = soup.select(".lister-item.mode-advanced")

movies=[]

for masterTag in movieTags:
  #the same selectors as before, but searched only inside the current movie's element
  rawRank = masterTag.select("span.text-primary")
  rawTitle = masterTag.select(".lister-item-header a")

  #select returns a list and we expect exactly one match per movie.
  #Indexing into an empty list raises an exception, so the length is checked first
  #and None is stored whenever there is not exactly one match
  rank = getIntegerFromElement(rawRank[0]) if len(rawRank) == 1 else None
  title = getTextFromElement(rawTitle[0]) if len(rawTitle) == 1 else None

  #we can keep adding in other elements as well. For clarity's sake, it's probably better to define a function for each data element we want and pass the tag in
  movies.append([rank, title])

print(len(movies))
print(movies)

You can check out cool visualizations from this dataset at https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/