# Parsing meme site

1. Import all the necessary libraries.

In [43]:
import numpy as np
import pandas as pd
import time

In [31]:
from requests import get
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

2. Creating a function that get a list of meme links from one page

In [32]:
def getPageLinks(page_num):
    
    page_link = f'https://knowyourmeme.com/page/{page_num}'
    response = get(page_link, headers={'User-Agent': UserAgent().chrome})
    if not response.ok:
        return []
    
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    
    meme_links = soup.find_all('a', {'class': 'newsfeed-title'})
    meme_links = ['https://knowyourmeme.com/' + link.get('href') for link in meme_links]
    
    return meme_links
    

3. Creating a function that gets a statistics data for single meme<br>(views, videos, photos, comments)

In [33]:
def getStats(soup, stats):
    try:
        obj = soup.find('dd', attrs={'class': stats})
        obj = obj.find('a').text
        obj = int(obj.replace(',', ''))
    except:
        obj=None
        
    return obj

4. Creating a function that gets properties data for single meme<br>(name, status, type, origin_year, origin place, tags)

In [34]:
def getProperties(soup):
    
    meme_name = soup.find('section', attrs={'class':'info'}).find('h1').text.strip()
    
    properties = soup.find('aside', attrs={'class':'left'})
    meme_status = properties.find('dd')
    meme_status = '' if not meme_status else meme_status.text.strip()
    
    meme_type = properties.find('a', attrs={'class':'entry-type-link'})
    meme_type = '' if not meme_type else meme_type.text.strip()
    
    meme_origin_year = properties.find(text='\nYear\n')
    meme_origin_year = '' if not meme_origin_year else meme_origin_year.parent.find_next()
    meme_origin_year = meme_origin_year.text.strip()
    
    meme_origin_place = properties.find('dd', attrs={'class':'entry_origin_link'})
    meme_origin_place = '' if not meme_origin_place else meme_origin_place.text.strip()
    
    meme_tags = properties.find('dl', attrs={'id':'entry_tags'}).find('dd')
    meme_tags = '' if not meme_tags else meme_tags.text.strip()
    
    return meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags
    

5. Function for getting all the text elements of the single meme page

In [35]:
def getText(soup):
    
    body = soup.find('section', attrs={'class':'bodycopy'})
    meme_about = body.find('p')
    meme_about = '' if not meme_about else meme_about.text
    
    meme_origin = body.find(text='Origin') or body.find(text='History')
    meme_origin = '' if not meme_origin else meme_origin.parent.find_next().text
    
    if body.text:
        other_text = body.text.strip().split('\n')[4:]
        other_text = " ".join(other_text).strip()
    else:
        other_text = ""
    
    return meme_about, meme_origin, other_text
    

6. Combining getStats, getProperties, getText in one function

In [36]:
def getMemeData(meme_page):

    response = get(meme_page, headers={'User-Agent': UserAgent().chrome})
    
    if not response.ok:
        return response.status_code
    
    html = response.content
    soup = BeautifulSoup(html,'html.parser')

    views = getStats(soup=soup, stats='views')
    videos = getStats(soup=soup, stats='videos')
    photos = getStats(soup=soup, stats='photos')
    comments = getStats(soup=soup, stats='comments')

    date = soup.find('abbr', attrs={'class':'timeago'}).attrs['title']

    meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags =\
    getProperties(soup=soup)

    meme_about, meme_origin, other_text = getText(soup=soup)

    data_row = {"name":meme_name, "status":meme_status, 
                "type":meme_type, "origin_year":meme_origin_year, 
                "origin_place":meme_origin_place,
                "date_added":date, "views":views, 
                "videos":videos, "photos":photos, "comments":comments, "tags":meme_tags,
                "about":meme_about, "origin":meme_origin, "other_text":other_text}

    return data_row

7. Checking getMemeData function on a separate meme page

In [37]:
data_row = getMemeData('http://knowyourmeme.com/memes/doge')
data_row

{'name': 'Doge',
 'status': 'Confirmed',
 'type': 'Animal',
 'origin_year': '2010',
 'origin_place': 'Tumblr',
 'date_added': '2023-04-03T13:44:36-04:00',
 'views': 14042165,
 'videos': 104,
 'photos': 1793,
 'comments': 923,
 'tags': 'animal, dog, shiba inu, shibe, such doge, super shibe, japanese, tumblr, comic sans, photoshop meme, doges, dogges, reddit, bitcoin, dogecoin, canine, doge meme, atsuko sato, kabosu, doge memes, dogelore, kabosumama',
 'about': 'Doge (pronounced /ˈdoʊdʒ/ DOHJ) is a slang term for "dog" that is primarily associated with pictures of Shiba Inus (nicknamed "Shibe") and internal monologue captions on Tumblr. These photos may be photoshopped to change the dog\'s face or captioned with interior monologues in Comic Sans font. The primary meme and iconography associated with Doge is the Shiba Inu named Kabosu, whose photos taken by her owner Atsuko Sato in early 2010 went viral across the internet, spawning numerous memes and larger trends in the following decade

Let’s add received data in a dataframe.

In [38]:
final_df = pd.DataFrame(columns=['name', 'status', 'type', 'origin_year', 'origin_place',
                                 'date_added', 'views', 'videos', 'photos', 'comments', 
                                 'tags', 'about', 'origin', 'other_text'])

In [39]:
final_df = final_df.append(data_row, ignore_index=True)

In [40]:
final_df

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text
0,Doge,Confirmed,Animal,2010,Tumblr,2023-04-03T13:44:36-04:00,14042165,104,1793,923,"animal, dog, shiba inu, shibe, such doge, supe...",Doge (pronounced /ˈdoʊdʒ/ DOHJ) is a slang ter...,"The use of the misspelled word ""doge"" to refer...","Identity On February 13th, 2010, Japanese kind..."


Now we can test getPageLinks and getMemeData functions on several pages,<br>adding an appropriate information to the dataframe.

In [41]:
from tqdm import tqdm_notebook

final_df = pd.DataFrame(columns=['name', 'status', 'type', 'origin_year', 'origin_place',
                                 'date_added', 'views', 'videos', 'photos', 'comments', 
                                 'tags', 'about', 'origin', 'other_text'])

for page_number in tqdm_notebook(range(5), desc='Pages'):
    
    meme_links = getPageLinks(page_number)
    
    for meme_link in tqdm_notebook(meme_links, desc='Memes', leave=False):
        
        for i in range(5):
            try:
                data_row = getMemeData(meme_link)
                final_df = final_df.append(data_row, ignore_index=True)
                break
            except:
                print('Warning! Parsing once again:', meme_link)
                continue


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for page_number in tqdm_notebook(range(5), desc='Pages'):


Pages:   0%|          | 0/5 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for meme_link in tqdm_notebook(meme_links, desc='Memes', leave=False):


Memes:   0%|          | 0/5 [00:00<?, ?it/s]



Memes:   0%|          | 0/5 [00:00<?, ?it/s]



Memes:   0%|          | 0/5 [00:00<?, ?it/s]



Memes:   0%|          | 0/5 [00:00<?, ?it/s]



Memes:   0%|          | 0/5 [00:00<?, ?it/s]



In [42]:
final_df.head()

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text
0,Florp,Submission,Character,2022,Ultrakill,2023-04-07T16:48:40-04:00,128,1,11,0,"unltrakill, ultrakill meme, hampter, hampter u...",Florp is an easter egg item inspired by the ha...,Florp would not appear in Ultrakill until an u...,"On August 16th, 2022, Ultrakill update 11 was ..."
1,Little People Match Cuts,Submission,Parody,2021,TikTok,2023-04-07T17:22:46-04:00,54,17,0,0,"match cut, little people, little person, dwarf...",Little People Match Cuts refers to videos of p...,While the exact origin of the trend is unknown...,https://www.tiktok.com/embed/v2/69269944062727...
2,Old Filter (TikTok),Submission,Participatory Media,2023,TikTok,2023-04-07T15:17:11-04:00,22,3,0,0,"old filter, tiktok old filter, old face filter...","The Old Filter (TikTok), also known as the Coo...","On April 5th, 2023, TikToker[1] @thatgirlfromw...",@thatgirlfromwork Brb breaking up with him ♬ ...
3,2023 FIFA World Cup U-20 Indonesia Cancellation,Submission,Competition,2023,FIFA,2023-04-07T08:30:02-04:00,167,0,2,0,"fifa, football, world cup, u-20, under 20, ind...",2023 FIFA World Cup U-20 Indonesia Cancellatio...,,FIFA removes Indonesia as host of FIFA U-20 Wo...
4,Florp,Submission,Character,2022,Ultrakill,2023-04-07T16:48:40-04:00,127,1,11,0,"unltrakill, ultrakill meme, hampter, hampter u...",Florp is an easter egg item inspired by the ha...,Florp would not appear in Ultrakill until an u...,"On August 16th, 2022, Ultrakill update 11 was ..."
