In [133]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import numpy as np

### `open_web_page`

The `open_web_page` function uses Selenium's Chrome driver to open the webpage at the given URL. It launches a new Google Chrome browser instance, loads the page, and returns the driver so that later steps can read from it.

In [134]:
def open_web_page(url):
    """Launch a Chrome browser through Selenium and load ``url``.

    Parameters
    ----------
    url : str
        Address passed to the driver's ``get`` method.

    Returns
    -------
    selenium.webdriver.Chrome
        The live driver, positioned on the requested page. The caller is
        responsible for calling ``quit()`` on it.

    Raises
    ------
    Exception
        Whatever ``driver.get`` raises is re-raised unchanged, after the
        browser that was just started has been shut down.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except BaseException:
        # The driver never reaches the caller on this path, so nobody else
        # could close it; quit here to avoid an orphaned Chrome process
        # (this includes a kernel interrupt during a slow page load).
        driver.quit()
        raise
    return driver

### `handling_with_missing_info`

This helper extracts the text of a web element while scraping with Selenium. It looks up the element by XPath, starting from the element it is given, and returns its text if the element exists. If not, it returns `NaN` so that missing information is handled gracefully instead of aborting the scrape.

In [None]:
def handling_with_missing_info(repos,xpath):
    """Return the text of the element found at ``xpath``, or NaN if unavailable.

    Parameters
    ----------
    repos : object exposing ``find_element(by, value)``
        The Selenium element (or driver) the search starts from.
    xpath : str
        XPath expression passed to ``find_element`` together with ``By.XPATH``.

    Returns
    -------
    str or float
        The ``.text`` of the matched element, or ``np.nan`` when the lookup
        (or reading the text) raises an ordinary exception.
    """
    try:
        return repos.find_element(By.XPATH, xpath).text
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate and a running scrape can be interrupted.
    except Exception:
        return np.nan

### `info_from_user`

This function scrapes a user's profile and repository information from a webpage using Selenium. It first extracts the user's full name and description, then collects details for each listed repository: name, URL, description, programming language, stars, forks, license, and last update time. The collected data is appended to two module-level lists, `user_info` and `repos_info`, and the browser is closed once scraping finishes.


In [135]:
# Module-level accumulators: every call to `info_from_user` appends to these.
repos_info = []
user_info = []

def info_from_user(driver):
    """Scrape the profile header and repository list from the loaded page.

    Appends ``[full_name, description]`` to the module-level ``user_info``
    list and one row per repository ``<li>`` to ``repos_info`` in the order
    ``[repo_name, repo_url, repo_desc, repo_lenguaje, stars, forks, license,
    last_update]``.

    Parameters
    ----------
    driver : Selenium WebDriver
        A driver already positioned on the page to scrape. It is always quit
        before this function returns or raises, so it cannot be reused.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Propagates errors from the mandatory lookups (full name, repository
        link, ``relative-time``); the browser is still closed in that case.
    """
    try:
        # Presumably the profile bio, which may be absent; use the same
        # NaN fallback as the optional repository fields instead of raising.
        description = handling_with_missing_info(driver,'/html/body/div[1]/div[4]/main/div[2]/div/div[1]/div/div[2]/div[3]/div[2]/div[1]/div')
        full_name = driver.find_element(By.XPATH,'//*/div/div[1]/div/div[2]/div[1]/div[2]/h1/span[1]').text
        user_info.append([full_name,description])

        repos = driver.find_elements(By.XPATH,'/html/body/div[1]/div[4]/main/div[2]/div/div[2]/turbo-frame/div/div[2]/ul/li')
        for repo in repos:
            # XPaths below are relative to the current repository <li>.
            repo_name = handling_with_missing_info(repo,'div[1]/div[1]/h3/a')
            repo_url = repo.find_element(By.XPATH,'div[1]/div[1]/h3/a').get_attribute('href')
            repo_desc = handling_with_missing_info(repo,'div[1]/div[2]/p')
            repo_lenguaje = handling_with_missing_info(repo,'div[1]//span[1]/span[2]')
            stars = handling_with_missing_info(repo,'div[1]/div[4]/a[1]')
            forks = handling_with_missing_info(repo,"div[1]/div[@class='f6 color-fg-muted mt-2']/a[2]")
            # Named `repo_license` so the builtin `license` is not shadowed.
            repo_license = handling_with_missing_info(repo,'div[1]/div[4]/span[2]')
            last_update = repo.find_element(By.XPATH,'div[1]//relative-time').get_attribute('title')
            repos_info.append([repo_name,repo_url,repo_desc,repo_lenguaje,stars,forks,repo_license,last_update])
    finally:
        # Close the browser even when a lookup above raises; otherwise the
        # Chrome process would be left running.
        driver.quit()

### Execution

In [136]:
# GitHub account whose repository list is scraped.
user = 'haroldeustaquio'
# `?tab=repositories` opens the profile directly on its repositories tab.
url = f'https://github.com/{user}?tab=repositories'
driver = open_web_page(url)
# Appends to the module-level `user_info` / `repos_info` lists. The function
# also quits the browser, so `driver` is not usable after this call.
info_from_user(driver)

In [137]:
# Column names, in the same order as the fields of each scraped row.
USER_INFO_COLUMNS = ['full_name','description']
REPOS_INFO_COLUMNS = [
    'repo_name',
    'repo_url',
    'repo_desc',
    'repo_lenguaje',
    'stars',
    'forks',
    'license',
    'last_update',
]

# One table for the profile rows, one for the repository rows.
df_info = pd.DataFrame(data=user_info, columns=USER_INFO_COLUMNS)
df_repos = pd.DataFrame(data=repos_info, columns=REPOS_INFO_COLUMNS)

In [140]:
# Write both tables to CSV files in the current working directory.
# `to_csv` is called with its defaults, so the row index is written too.
for _frame, _csv_name in ((df_info, 'user_info.csv'), (df_repos, 'repos_info.csv')):
    _frame.to_csv(_csv_name)

In [138]:
# Display the user-level table built from `user_info`.
df_info

Unnamed: 0,full_name,description
0,Harold Eustaquio,"Electronic and Computer Engineering student, p..."


In [139]:
# Preview the first rows of the repository table.
df_repos.head()

Unnamed: 0,repo_name,repo_url,repo_desc,repo_lenguaje,stars,forks,license,last_update
0,SQL-Coding-Challenges,https://github.com/haroldeustaquio/SQL-Coding-...,Repository dedicated to solving SQL problems f...,TSQL,5.0,1.0,MIT License,"Oct 13, 2024, 3:10 PM CST"
1,Python-Coding-Challenges,https://github.com/haroldeustaquio/Python-Codi...,Repository dedicated to solving Python problem...,Jupyter Notebook,6.0,,MIT License,"Oct 13, 2024, 5:00 AM CST"
2,Melody-Lyric-Generator-from-Sheet-Music,https://github.com/haroldeustaquio/Melody-Lyri...,,Python,,1.0,,"Oct 13, 2024, 2:31 AM CST"
3,Machine-Learning,https://github.com/haroldeustaquio/Machine-Lea...,This repository contains Machine Learning mini...,Jupyter Notebook,6.0,,MIT License,"Oct 13, 2024, 2:13 AM CST"
4,Web-Scraping-with-Python,https://github.com/haroldeustaquio/Web-Scrapin...,,Jupyter Notebook,,,,"Oct 12, 2024, 11:51 PM CST"
