Inspired by Kim's Convenience, a Canadian television sitcom about the misadventures of a Korean-Canadian family running a convenience store, I would like to create a data set of websites from Google on Kim's Convenience, in which would contain the followings:
-  websites' urls
-  images (urls) --max. amount of 5
-  sentences containing the main characters of the show --max. amount of 100
-  no. of outgoing links 
-  website last-modified (from header)


In [58]:
#Import packages
import json
import requests
from bs4 import BeautifulSoup
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import pandas as pd 
import numpy as np

In [29]:
#Use Google Search API
try: 
    from googlesearch import search 
except ImportError:  
    print("No module named 'google' found") 
  
# keyword to search
keyword = "Kim's Convenience"

#create an empty list for the urls
Kim_Convenience_urls = []

#Gather 20 urls and add them to the urls list
for j in search(keyword, tld="com", lang='en', num=20, stop=20, pause=2): 
    Kim_Convenience_urls.append(j)

#print the url list
print(Kim_Convenience_urls) 

['https://en.wikipedia.org/wiki/Kim%27s_Convenience', 'https://en.wikipedia.org/wiki/List_of_Kim%27s_Convenience_episodes', 'https://en.wikipedia.org/wiki/Kim%27s_Convenience_(play)', 'https://en.wikipedia.org/wiki/Andrea_Bang', 'https://en.wikipedia.org/wiki/Paul_Sun-Hyung_Lee', 'https://www.netflix.com/title/80199128', 'https://www.imdb.com/title/tt5912064/', 'https://www.cbc.ca/kimsconvenience/m_site/', 'https://twitter.com/KimsConvenience?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor', 'https://twitter.com/KimsConvenience/status/1313588471282401281?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet', 'https://twitter.com/KimsConvenience/status/1313196444917137408?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet', 'https://twitter.com/KimsConvenience/status/1312810724025159681?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet', 'https://www.youtube.com/watch?v=7_BT3jdkUPg', 'https://www.npr.org/2019/01/09/682888290/kim-s-convenience-is-a-sitcom-about-asian-immig

In [30]:
# Get image links from each of the urls above
image_urls = []

for i in Kim_Convenience_urls:
    source = requests.get(i).text
    bs = BeautifulSoup(source, 'html.parser')
    images = bs.find_all('img', {'src':re.compile('([-\w]+\.(?:jpg|jpeg|png))')})
    links = []
    for image in images: 
        links.append(image['src'])
    image_urls.append(links)
    
print(image_urls[:5])

[['//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Kim%27s_Convenience_logo.svg/250px-Kim%27s_Convenience_logo.svg.png', '//upload.wikimedia.org/wikipedia/commons/thumb/9/98/Kim%27s_convenience_toronto.jpg/332px-Kim%27s_convenience_toronto.jpg', '//upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Blank_television_set.svg/32px-Blank_television_set.svg.png', '//upload.wikimedia.org/wikipedia/en/thumb/c/cf/Flag_of_Canada.svg/32px-Flag_of_Canada.svg.png', '//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png', '/static/images/footer/wikimedia-button.png', '/static/images/footer/poweredby_mediawiki_88x31.png'], ['/static/images/footer/wikimedia-button.png', '/static/images/footer/poweredby_mediawiki_88x31.png'], ['/static/images/footer/wikimedia-button.png', '/static/images/footer/poweredby_mediawiki_88x31.png'], ['//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/10px-OOjs_UI_icon_edit-ltr-progressive.svg.png', '/

In [31]:
# Find sentences that contain any of the main characters of Kim's Convenience from each of the websites
sentences_list = []
keys = ['Mr. Kim', 'Appa', 'Mrs. Kim', 'Umma', 'Jung', 'Janet']

for i in Kim_Convenience_urls:
    source = requests.get(i)
    sentences = []
    text_blobs = re.findall("[a-zA-Z0-9 .,-]+", str(source.content))
    for text_blob in text_blobs:
        # sentences must have at least one space; filtering for sentences with one or more space
      if " " not in text_blob:
        continue
      sents = sent_tokenize(str(text_blob)) 
      if len(sents) < 2:
        continue 
      for sent in sents:
        k = [ w for w in keys if w in sent ]
        # only keeping the sentences that contain at least one of the keywords
        if len(k) > 0: 
            sentences.append(sent) 
    sentences_list.append(sentences)


print(sentences_list[:5])

[['Other characters include Jung', 'Mr. Kim is traditional, proud and stubborn, practical, opinionated and blunt.', 'He is 56 years old at the start of the series and estranged from his son Jung, a frayed relationship he gradually attempts to mend beginning in Season 2.', 'She is the family member in most frequent contact with Jung.', ' as Jung Kim, 24 at the start of the series.', 'Mr. and Mrs. Kim', 'Jung works at Handy Car Rental, where he is promoted to assistant manager at the beginning of the series but returns to his former job after an attempt at moving to another company is scuttled.', 'He was kicked out of the family home by Mr. Kim after stealing from him and remains estranged from his father, though he is still in contact with the rest of the family, and gradually reconnects with his father over the course of the series.', 'Jung', 'He is promoted to assistant manager after Jung initially leaves Handy Car Rental at the end of Season 2, now serving as his superior.', 'The man

In [32]:
# find number of outgoing links in each of the websites
outgoing_link_count = []

for i in Kim_Convenience_urls:
    source = requests.get(i).text
    bs = BeautifulSoup(source, 'html.parser')
    count = 0
    for link in bs.findAll('a', attrs={'href': re.compile("^http://")}):
        count += 1
        
    outgoing_link_count.append(count)
    
print(outgoing_link_count[:5])

[19, 19, 5, 4, 2]


In [33]:
# find the each of the websites' last-modified (from header)
last_modified_list = []
for i in Kim_Convenience_urls:
    r = requests.get(i, stream=True, allow_redirects=True)
    header = r.headers
    #check to see if there's the "last-Modified" key in the header
    if "Last-Modified" in header:
        url_time = r.headers["Last-Modified"]
    #we would mark it as "?" if there's no "last-Modified" in the header
    else:
        url_time = "?"
    last_modified_list.append(url_time)

print(last_modified_list)

['Mon, 05 Oct 2020 02:58:57 GMT', 'Mon, 05 Oct 2020 02:58:56 GMT', 'Wed, 23 Sep 2020 02:04:53 GMT', 'Wed, 30 Sep 2020 10:13:25 GMT', 'Sun, 04 Oct 2020 11:04:04 GMT', '?', '?', '?', 'Wed, 07 Oct 2020 06:36:46 GMT', 'Wed, 07 Oct 2020 06:36:46 GMT', 'Wed, 07 Oct 2020 06:36:46 GMT', 'Wed, 07 Oct 2020 06:36:46 GMT', '?', '?', '?', '?', '?', '?', '?', '?']


In [34]:
# create the initial JSON_list
JSON_list = []
for i in range(0,20):
    JSON = {
        "Images":image_urls[i],
        "Sentences":sentences_list[i],
        "num_outgoing_links":outgoing_link_count[i],
        "website_last_modified":last_modified_list[i]
    }
    
    JSON_list.append(JSON)
    
print(JSON_list[:2])

[{'Images': ['//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Kim%27s_Convenience_logo.svg/250px-Kim%27s_Convenience_logo.svg.png', '//upload.wikimedia.org/wikipedia/commons/thumb/9/98/Kim%27s_convenience_toronto.jpg/332px-Kim%27s_convenience_toronto.jpg', '//upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Blank_television_set.svg/32px-Blank_television_set.svg.png', '//upload.wikimedia.org/wikipedia/en/thumb/c/cf/Flag_of_Canada.svg/32px-Flag_of_Canada.svg.png', '//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png', '/static/images/footer/wikimedia-button.png', '/static/images/footer/poweredby_mediawiki_88x31.png'], 'Sentences': ['Other characters include Jung', 'Mr. Kim is traditional, proud and stubborn, practical, opinionated and blunt.', 'He is 56 years old at the start of the series and estranged from his son Jung, a frayed relationship he gradually attempts to mend beginning in Season 2.', 'She is the family member in most frequent conta

In [35]:
# will use this function to determine how many columns for sentences & images; 
# will need to downsize if they go above our desire limit (100 for sentences; 5 for images)
def find_max_list(list):
    list_len = [len(i) for i in list]
    print('No. of items: %d' % max(list_len))
    for x in list_len:
        if x == max(list_len):
            print('Index of this list:', list_len.index(x))

In [36]:
# take a look at the longest list in image_urls
find_max_list(image_urls)

No. of items: 65
Index of this list: 5


In [37]:
# take a look at the longest list in sentences_list
find_max_list(sentences_list)

No. of items: 90
Index of this list: 5


The longest list in image_urls has a length of 65; we would need to downsize at least 1 list in image part to our desired amount, 5.
<br> The longest list in sentences_list has a length of 90, which is within the our desired amount, 100; we would not need to perform downsizing on the sentence part.

In [38]:
# take a look at the list(s) in image_urls that exceed the length of 5
list_len = [len(i) for i in image_urls]
image_longer = []
print('List(s) in image_urls exceeding length of 5:')
for x in list_len:
    if x > 5:
            print('Index of this list:', list_len.index(x))
            print('Length of this list: %d \n' % x)
            image_longer.append(list_len.index(x))

List(s) in image_urls exceeding length of 5:
Index of this list: 0
Length of this list: 7 

Index of this list: 5
Length of this list: 65 

Index of this list: 6
Length of this list: 43 

Index of this list: 7
Length of this list: 11 

Index of this list: 13
Length of this list: 14 

Index of this list: 14
Length of this list: 10 

Index of this list: 19
Length of this list: 51 



In [39]:
#create a new downsized JSON object
JSON_list_downsized = JSON_list

# downsize the image lists that are over the length of 5
for i in range(0,20):
    if len(JSON_list_downsized[i]['Images']) >5:
        while len(JSON_list_downsized[i]['Images']) >5:
            a = len(JSON_list_downsized[i]['Images'])
            #randomly select the items to delete
            e = random.choice(range(0, a))
            del JSON_list_downsized[i]['Images'][e]

In [40]:
# append the image lists with "?" if their length is under 5
for i in range(0,20):
    if len(JSON_list_downsized[i]['Images'])  < 5:
        while len(JSON_list_downsized[i]['Images']) < 5:
            a = "?"
            JSON_list_downsized[i]['Images'].append(a)

In [41]:
#check if all 20 of the image list parts have been downsized/upsized to 5 (the length should now be 5)
ok_count = 0
for i in range(0,20):
    if len(JSON_list_downsized[i]['Images']) != 5:
        print('Index(s): \n')
        print(i)
    else:
        ok_count += 1
        
if ok_count == 20:
        print('All image list parts have a length of 5')

All image list parts have a length of 5


In [42]:
# flatten the downsized json object
tocsv = pd.json_normalize(JSON_list_downsized)
tocsv.head(3)

Unnamed: 0,Images,Sentences,num_outgoing_links,website_last_modified
0,[//upload.wikimedia.org/wikipedia/commons/thum...,"[Other characters include Jung, Mr. Kim is tra...",19,"Mon, 05 Oct 2020 02:58:57 GMT"
1,"[/static/images/footer/wikimedia-button.png, /...","[Additional characters include Jung, nAfter be...",19,"Mon, 05 Oct 2020 02:58:56 GMT"
2,"[/static/images/footer/wikimedia-button.png, /...",[Mr. Kim hopes the store will provide a future...,5,"Wed, 23 Sep 2020 02:04:53 GMT"


In [43]:
# flatten the json object in a better fashion (flat by sentences)
def flatten_json(json):
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 1
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(json)
    return out


In [44]:
a = 0
JSON_list_final = []
for i in JSON_list_downsized:
  for j in i['Sentences']:
      if(j == '?'):
        continue

      JSON = {
      "Sentence":j,
      "Images":JSON_list_downsized[a]['Images'],
      "num_outgoing_links":JSON_list_downsized[a]['num_outgoing_links'],
      "website_last_modified":JSON_list_downsized[a]['website_last_modified']
      }
      JSON = flatten_json(JSON)
      JSON_list_final.append(JSON)
  a = a + 1

print(JSON_list_final[:5])


[{'Sentence': 'Other characters include Jung', 'Images_1': '//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Kim%27s_Convenience_logo.svg/250px-Kim%27s_Convenience_logo.svg.png', 'Images_2': '//upload.wikimedia.org/wikipedia/commons/thumb/9/98/Kim%27s_convenience_toronto.jpg/332px-Kim%27s_convenience_toronto.jpg', 'Images_3': '//upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Blank_television_set.svg/32px-Blank_television_set.svg.png', 'Images_4': '//upload.wikimedia.org/wikipedia/en/thumb/c/cf/Flag_of_Canada.svg/32px-Flag_of_Canada.svg.png', 'Images_5': '//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png', 'num_outgoing_links': 19, 'website_last_modified': 'Mon, 05 Oct 2020 02:58:57 GMT'}, {'Sentence': 'Mr. Kim is traditional, proud and stubborn, practical, opinionated and blunt.', 'Images_1': '//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Kim%27s_Convenience_logo.svg/250px-Kim%27s_Convenience_logo.svg.png', 'Images_2': '//upload.wikim

In [45]:
len(JSON_list_final)

241

In [46]:
# covert the optimized json object "JSON_list_final" and export as csv
tocsv = pd.json_normalize(JSON_list_final)
tocsv.head(5)

Unnamed: 0,Sentence,Images_1,Images_2,Images_3,Images_4,Images_5,num_outgoing_links,website_last_modified
0,Other characters include Jung,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/en/thumb/c/cf...,//upload.wikimedia.org/wikipedia/en/thumb/4/4a...,19,"Mon, 05 Oct 2020 02:58:57 GMT"
1,"Mr. Kim is traditional, proud and stubborn, pr...",//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/en/thumb/c/cf...,//upload.wikimedia.org/wikipedia/en/thumb/4/4a...,19,"Mon, 05 Oct 2020 02:58:57 GMT"
2,He is 56 years old at the start of the series ...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/en/thumb/c/cf...,//upload.wikimedia.org/wikipedia/en/thumb/4/4a...,19,"Mon, 05 Oct 2020 02:58:57 GMT"
3,She is the family member in most frequent cont...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/en/thumb/c/cf...,//upload.wikimedia.org/wikipedia/en/thumb/4/4a...,19,"Mon, 05 Oct 2020 02:58:57 GMT"
4,"as Jung Kim, 24 at the start of the series.",//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/commons/thumb...,//upload.wikimedia.org/wikipedia/en/thumb/c/cf...,//upload.wikimedia.org/wikipedia/en/thumb/4/4a...,19,"Mon, 05 Oct 2020 02:58:57 GMT"


In [47]:
# export as csv
tocsv.to_csv('Web Scraping_KimsConvenience_Google.csv')