In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests


In [4]:
def scrape_and_save_data():
    url = "https://github.com/MarcusGrum/AIBAS/blob/main/README.md"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Scraping example: extracting text from the website
    table = soup.find('table')  # Assuming a table structure in the HTML
    if table:
        rows = table.find_all('tr')
        data = [[cell.text.strip() for cell in row.find_all(['td', 'th'])] for row in rows]
    else:
        print("No table found on the webpage.")
        return

    if not data or len(data) < 2:  # Ensure at least one row of data and headers exist
        print("Scraped data is empty or invalid.")
        return

    # Convert the data into a DataFrame
    try:
        df = pd.DataFrame(data[1:], columns=data[0])  # Skip headers for rows, use first row as columns
        df.to_csv("scraped_data.csv", index=False)
        print("Data scraped and saved successfully.")
    except Exception as e:
        print(f"Error creating DataFrame: {e}")

In [9]:
data = pd.read_csv('scraped_data.csv')
data.head()

Unnamed: 0,x,y
0,10.611376985560522,-0.258198
1,4.386447954473255,0.007454
2,40.93916393492107,48.647258
3,47.51486916723017,47.497798
4,2.4025396488257247,1.414437


In [10]:
# Remove empty rows and columns
data = data.dropna(how='all').dropna(axis=1, how='all')

# Convert numeric columns to appropriate types
for col in data.columns:
    try:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    except ValueError:
        pass

# Handle missing values by filling numeric columns with their median
data = data.fillna(data.median(numeric_only=True))

# Display cleaned data
data.head()


Unnamed: 0,x,y
0,10.611377,-0.258198
1,4.386448,0.007454
2,40.939164,48.647258
3,47.514869,47.497798
4,2.40254,1.414437


In [11]:
# Calculate Z-scores for numeric columns to detect outliers
numeric_cols = data.select_dtypes(include=[np.number])
z_scores = (numeric_cols - numeric_cols.mean()) / numeric_cols.std()

# Remove rows where any numeric column has a Z-score > 3
data = data[(np.abs(z_scores) < 3).all(axis=1)]

# Display data after outlier removal
data.head()


  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,x,y
0,10.611377,-0.258198
1,4.386448,0.007454
2,40.939164,48.647258
3,47.514869,47.497798
4,2.40254,1.414437


In [12]:
# Normalize numeric columns using Min-Max scaling
for col in numeric_cols.columns:
    data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())

# Display normalized data
data.head()


Unnamed: 0,x,y
0,0.211976,6.694553e-159
1,0.087438,6.960205000000001e-159
2,0.818725,5.560001e-158
3,0.950282,5.445055e-158
4,0.047747,8.367187000000001e-159


In [13]:
# Save the cleaned and processed data to a CSV file
output_file = "UE_06_dataset04_joint_scraped_data.csv"
data.to_csv(output_file, index=False)

print(f"Data saved successfully to {output_file}")


Data saved successfully to UE_06_dataset04_joint_scraped_data.csv
