In [11]:
!pip install requests beautifulsoup4 pandas



In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

In [15]:
# Download the necessary resources for sentence tokenization.
# 'punkt' is what older NLTK releases load; newer releases make sent_tokenize
# load 'punkt_tab' instead, so fetch both to keep the notebook working on
# either. nltk.download() reports an unknown package id instead of raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Function to extract data from a single page (div elements)
def extract_data_from_page_div(url, timeout=10):
    """Scrape sentences from the ``<p>`` tags inside the page's description divs.

    Fetches ``url``, selects every ``<div>`` whose class is
    ``"flex flex-col items-start gap-2"``, and splits the text of each ``<p>``
    inside those divs into sentences with ``sent_tokenize``. Paragraphs that
    mention an MBTI type or a zodiac sign are skipped entirely.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever.

    Returns:
        A list of sentence strings. The list is empty when the request fails,
        the server answers with a status other than 200, or nothing matches.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a non-200 answer (empty result), but say why.
        print(f"Request to {url} failed: {exc}")
        return []
    if response.status_code != 200:
        print(f"Request to {url} returned HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # All 16 MBTI types (the original list was missing "INTJ") and the 12
    # zodiac signs. Matching below is a case-sensitive substring test.
    exclude_phrases = (
        "INFJ", "INTJ", "INTP", "ISFJ", "ESFJ", "ENFP", "ENTJ", "ESTJ", "ISTJ", "ENTP", "ESTP", "ISTP", "ENFJ", "INFP", "ISFP", "ESFP",
        "Aries", "Sagittarius", "Scorpio", "Capricorn", "Virgo", "Aquarius", "Gemini", "Taurus", "Leo", "Pisces", "Cancer", "Libra",
    )

    # Extracting data from 'div' tags
    raw_data = []
    for div in soup.find_all('div', class_="flex flex-col items-start gap-2"):
        for paragraph in div.find_all('p'):
            text = paragraph.get_text(strip=True, separator=' ')
            if any(phrase in text for phrase in exclude_phrases):
                continue
            raw_data.extend(sent_tokenize(text))

    return raw_data

# Function to extract data from a single page (li elements)
def extract_data_from_page_li(url, timeout=10):
    """Scrape sentences from the ``<li>`` items of every ordered list on a page.

    Fetches ``url``, walks every ``<ol>`` element, and splits the text of each
    ``<li>`` found inside it into sentences with ``sent_tokenize``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever.

    Returns:
        A list of sentence strings. The list is empty when the request fails,
        the server answers with a status other than 200, or the page has no
        ordered lists.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a non-200 answer (empty result), but say why.
        print(f"Request to {url} failed: {exc}")
        return []
    if response.status_code != 200:
        print(f"Request to {url} returned HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting data from 'li' tags
    # NOTE(review): both find_all calls search all descendants, so if the page
    # nests one <ol> inside another, the inner items are presumably collected
    # more than once - confirm against the page; duplicates are dropped later
    # in the notebook.
    raw_data = []
    for ol_element in soup.find_all('ol'):
        for li in ol_element.find_all('li'):
            text = li.get_text(strip=True, separator=' ')
            raw_data.extend(sent_tokenize(text))

    return raw_data

# Extract and display data
base_url_div = 'https://boo.world/database/profile/20463/hinata-hyuga-personality-type'
base_url_li = 'https://greatcharacters.miraheze.org/wiki/Hinata_Hyuga'

all_data_div = extract_data_from_page_div(base_url_div)
all_data_li = extract_data_from_page_li(base_url_li)

# Both extractors return an empty list on a failed request, so an empty result
# would otherwise flow silently into the DataFrame and the saved files.
for _source_url, _sentences in ((base_url_div, all_data_div), (base_url_li, all_data_li)):
    if not _sentences:
        print(f"Warning: no sentences were extracted from {_source_url}")

# Combine the data (div-page sentences first, then list-page sentences)
all_data = all_data_div + all_data_li

# Display extracted data without headers
df_extracted = pd.DataFrame(all_data, columns=['Sentence'])
print(df_extracted.to_string(header=False))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0                                                                                                                                                                                                                                                                                                                                                                          Hinata Hyuga is a fictional character from the highly popular anime and manga series Naruto created by Masashi Kishimoto.
1                                                                                                                                                                                                                               She was initially introduced as an innocent, shy and timid young girl from the powerful Hyuga clan, which is renowned for their exceptional abilities in the use of the Byakugan - a distinctive technique that empowers them to see through anything close to them.
2                             

# Data Cleaning

In [16]:
# Count the missing entries in each column of the scraped frame
null_summary = df_extracted.isnull().sum()

# Show the per-column counts under a one-line heading
print("Summary of null values in each column:", null_summary, sep="\n")

Summary of null values in each column:
Sentence    0
dtype: int64


In [17]:
# Boolean mask over the rows: True where a row repeats an earlier one
# (all columns are compared)
duplicates = df_extracted.duplicated()

# Show only the repeated rows under a one-line heading
print("Duplicate Rows:", df_extracted[duplicates], sep="\n")

Duplicate Rows:
                                             Sentence
52  Also, the scene where Toneri possesses Hinata ...
55  Coincidently, both characters has have a crush...


In [22]:
# Remove repeated sentences and renumber the remaining rows from 0 in a single
# chained expression (the old index is discarded, not kept as a column).
df_extracted = df_extracted.drop_duplicates().reset_index(drop=True)

# Bare last expression so the notebook renders the cleaned frame
df_extracted

Unnamed: 0,Sentence
0,Hinata Hyuga is a fictional character from the...
1,"She was initially introduced as an innocent, s..."
2,Hinata’s character is a member of Team Kurenai...
3,Hinata’s character is highly intricately woven...
4,Originally depicted as a passive and fragile g...
5,Hinata’s story is an inspiring one and resonat...
6,Hinata’s character becomes even more interesti...
7,"Her character development series, notably Naru..."
8,These events solidify her as a well-loved char...
9,With her exceptional fighting skills backed by...


In [24]:
# Write the cleaned sentences to disk in two formats.

# 1) CSV with neither the index column nor the header row.
df_extracted.to_csv(path_or_buf='hinata_personality.csv', header=False, index=False)

# 2) Tab-separated .txt written straight from the DataFrame (the CSV above is
#    not read back).
# NOTE(review): unlike the CSV, this export keeps the default index column and
# the 'Sentence' header row - confirm that is intended for the .txt consumer.
df_extracted.to_csv(path_or_buf='hinata_personality.txt', sep='\t')