# Extracting data from contributors in the DataCamp Blog

## Import packages

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
import time

## Creating functions

In [15]:
def find_last_page(article_type):

    """
    Function to automatically detect the last page of contents in DataCamp blog

    Loads the landing page of the requested section with the Selenium driver
    bound to the global name ``webdriver``, waits two seconds for the page to
    render, and reads the text of the last pagination link.

    Parameters:
    - article_type: either "blog" or "tutorial"

    Returns:
    - the number of the last page, as an int

    Raises:
    - ValueError: if article_type is neither "blog" nor "tutorial"
    - IndexError: if the loaded page contains no pagination link
    """

    # Validate first: previously an unknown type fell through both branches
    # and crashed with UnboundLocalError on the return statement.
    if article_type not in ('blog', 'tutorial'):
        raise ValueError(
            "article_type must be 'blog' or 'tutorial', got " + repr(article_type)
        )

    # Both sections share the same layout, so only the URL segment differs.
    url = "https://www.datacamp.com/" + article_type + "/"

    # NOTE: ``webdriver`` must already be rebound to a driver instance
    # (the run cell does this) -- the selenium module itself has no ``get``.
    webdriver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(webdriver.page_source, 'html.parser')

    pagination_links = soup.find_all('a', class_='css-1bpjjjp')
    if not pagination_links:
        # Same exception type as the original ``[-1]`` lookup, but with a
        # message that points at the likely cause.
        raise IndexError(
            "no pagination links (class 'css-1bpjjjp') found at " + url
        )

    # The last pagination link carries the highest page number.
    last_page = pagination_links[-1].text

    return int(last_page)


def datacamp_scraping(writer, article_type):

    """
    Scrape articles from a DataCamp contributor in DataCamp's Official Blog

    Walks every listing page of the requested section (from page 1 up to the
    page returned by ``find_last_page``) using the Selenium driver bound to
    the global name ``webdriver``, and keeps the article cards whose author
    text equals ``writer``.

    Arguments:
    - writer: name of the writer (as written in the Blog)
    - article_type: either "blog" or "tutorial"

    Returns:
    - a pandas DataFrame with the columns 'title', 'url', 'topic', 'type',
      'date' (converted with pd.to_datetime) and 'writer'

    Raises:
    - ValueError: if article_type is neither "blog" nor "tutorial"
    """

    # Seconds to wait after each page load; the values are the ones the two
    # original per-section branches used.
    page_load_wait = {'blog': 2, 'tutorial': 3}

    # Fail loudly instead of silently returning an empty table on a typo.
    if article_type not in ('blog', 'tutorial'):
        raise ValueError(
            "article_type must be 'blog' or 'tutorial', got " + repr(article_type)
        )

    url_base = 'https://www.datacamp.com'
    rows = []

    last_page = find_last_page(article_type=article_type)

    for page in range(1, last_page + 1):

        url = url_base + "/" + article_type + "/page/" + str(page)
        webdriver.get(url)
        time.sleep(page_load_wait[article_type])
        soup = BeautifulSoup(webdriver.page_source, 'html.parser')

        # Container holding the article cards, then the cards themselves.
        articles = soup.find('div', class_='css-119qrmy')
        list_articles = articles.find_all('div', class_='css-8p022j')

        for article in list_articles:
            author = article.find('p', class_='css-198tbf7').text.strip()
            if author != writer:
                continue

            rows.append((
                article.find('h2', class_='css-1yr1rb9').text,
                # hrefs are concatenated onto the site root to form the full URL
                url_base + article.find('a', class_='css-yhqmm5').attrs['href'],
                article.find('a', class_='css-xlmvza').text,
                article_type,
                article.find('p', class_='css-xj3esj').text,
            ))

    # Convert scraped data into DataFrame
    df = pd.DataFrame(rows, columns=['title', 'url', 'topic', 'type', 'date'])
    df['date'] = pd.to_datetime(df.date)
    df['url'] = df['url'].str.strip()
    df['writer'] = writer

    return df

## Run process

In [18]:
# Run function and concat blogs and tutorials
# The scraping functions read the driver from the global name ``webdriver``,
# so that name must point at the driver instance. Importing the module under
# an alias here keeps this cell re-runnable: ``webdriver.Chrome()`` would fail
# with AttributeError on a second run, once ``webdriver`` is already a driver.
from selenium import webdriver as selenium_webdriver

webdriver = selenium_webdriver.Chrome()
try:
    datacamp_blog = datacamp_scraping(writer = 'Javier Canales Luna', article_type = 'blog')
    datacamp_tutorial = datacamp_scraping(writer = 'Javier Canales Luna', article_type = 'tutorial')
finally:
    # Always close the browser session, even if scraping fails midway.
    webdriver.quit()

# Stack both result tables and order them from newest to oldest.
datacamp = pd.concat([datacamp_blog, datacamp_tutorial], ignore_index=True)
datacamp = datacamp.sort_values(by='date', ascending=False)

In [None]:
# Write the combined blog/tutorial table to a CSV file, leaving out the index column.
datacamp.to_csv('datacamp_javier_canales.csv', index=False)

In [21]:
datacamp

Unnamed: 0,title,url,topic,type,date,writer
0,Meta Learning: How Machines Learn to Learn,https://www.datacamp.com/blog/meta-learning,Artificial Intelligence,blog,2025-03-19,Javier Canales Luna
1,Excel Table: The Essential Tool for Organizing...,https://www.datacamp.com/tutorial/excel-table,Excel,tutorial,2025-04-10,Javier Canales Luna


In [None]:
import matplotlib.pyplot as plt

In [22]:
# Load the CSV file 'datacamp_javier_canales.csv' into a DataFrame.
# NOTE(review): read_csv is called without parse_dates, so the 'date' column
# is presumably loaded as plain strings -- confirm before doing date arithmetic.
df = pd.read_csv('datacamp_javier_canales.csv')

In [23]:
df

Unnamed: 0,title,url,topic,type,date
0,Excel Table: The Essential Tool for Organizing...,https://www.datacamp.com/tutorial/excel-table,Excel,tutorial,2025-04-10
1,10 Top Data Analytics Conferences for 2025,https://www.datacamp.com/blog/top-data-analyti...,Data Analysis,blog,2025-03-30
2,Managing PostgreSQL Databases in Python with p...,https://www.datacamp.com/tutorial/tutorial-pos...,Python,tutorial,2025-03-21
3,Meta Learning: How Machines Learn to Learn,https://www.datacamp.com/blog/meta-learning,Artificial Intelligence,blog,2025-03-19
4,"Cloud Engineer Salaries in 2025: AWS, Azure, G...",https://www.datacamp.com/blog/cloud-engineer-s...,Cloud,blog,2025-02-28
...,...,...,...,...,...
74,Python 2 vs 3: Everything You Need to Know,https://www.datacamp.com/blog/python-2-vs-3-ev...,Python,blog,2022-08-23
75,Using DataLab in the Classroom,https://www.datacamp.com/blog/using-datacamp-w...,DataLab,blog,2022-04-21
76,Top 10 Data Visualization Books,https://www.datacamp.com/blog/top-10-data-visu...,Data Visualization,blog,2022-03-25
77,Python Details on Correlation Tutorial,https://www.datacamp.com/tutorial/tutorial-dat...,Python,tutorial,2022-02-25


Matplotlib is building the font cache; this may take a moment.
