#### QUESTION 1: Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'


In [2]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import re

def get_text_from_url(url):
    # Fetch the content of the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        return response.text
    else:
        # Print an error message if the request was not successful
        print(f"Failed to fetch content from {url}. Status code: {response.status_code}")
        return None

def clean_and_tokenize_text(text):
    # Use BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()

    # Remove non-alphabetic characters and convert to lowercase
    clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text)
    clean_text = clean_text.lower()

    # Tokenize the text into words
    tokens = clean_text.split()

    return tokens

def find_most_frequent_words(tokens, num_words=10):
    # Use Counter to count the occurrences of each word
    word_counter = Counter(tokens)

    # Get the most common words
    most_common_words = word_counter.most_common(num_words)

    return most_common_words

# Updated URL of Romeo and Juliet text from Project Gutenberg
romeo_and_juliet_url = 'http://www.gutenberg.org/cache/epub/1112/pg1112.txt'

# Get text from the URL
text_from_url = get_text_from_url(romeo_and_juliet_url)

if text_from_url:
    # Clean and tokenize the text
    tokens = clean_and_tokenize_text(text_from_url)

    # Find the 10 most frequent words
    most_frequent_words = find_most_frequent_words(tokens, num_words=10)

    # Print the results
    print("Ten Most Frequent Words:")
    for word, count in most_frequent_words:
        print(f'{count}: {word}')


Ten Most Frequent Words:
849: the
761: and
630: to
597: i
529: a
507: of
376: in
374: my
363: you
362: is


#### QUESTION 2: Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find : 1, the min, max, mean, median, standard deviation of cats' weight in metric units. ii, the min, max, mean, median, standard deviation of cats' lifespan in years. iii, Create a frequency table of country and breed of cats

In [4]:
import requests
import pandas as pd
import numpy as np

# Cat API URL
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Fetch data from the Cat API
response = requests.get(cats_api)

# Check if the request was successful
if response.status_code == 200:
    # Parse JSON data
    cat_data = response.json()

    # Create a DataFrame from the JSON data
    df = pd.DataFrame(cat_data)

    # Convert weight to metric units (assuming the weight is in grams)
    df['weight.metric'] = df['weight'].apply(lambda x: x['metric'])

    # Extract numerical values for weight and lifespan
    df['weight.metric'] = pd.to_numeric(df['weight.metric'], errors='coerce')
    df['life_span.years'] = df['life_span'].str.extract('(\d+)').astype(float)

    # Display summary statistics for weight and lifespan
    weight_stats = df['weight.metric'].describe()
    lifespan_stats = df['life_span.years'].describe()

    # Create a frequency table of country and breed
    frequency_table = pd.crosstab(df['origin'], df['name'])

    print("\nSummary Statistics for Weight:")
    print(weight_stats)

    print("\nSummary Statistics for Lifespan:")
    print(lifespan_stats)

    print("\nFrequency Table of Country and Breed:")
    print(frequency_table)
else:
    print(f"Failed to fetch data from the Cat API. Status code: {response.status_code}")



Summary Statistics for Weight:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: weight.metric, dtype: float64

Summary Statistics for Lifespan:
count    67.000000
mean     12.074627
std       1.828341
min       8.000000
25%      11.000000
50%      12.000000
75%      12.500000
max      18.000000
Name: life_span.years, dtype: float64

Frequency Table of Country and Breed:
name                  Abyssinian  Aegean  American Bobtail  American Curl  \
origin                                                                      
Australia                      0       0                 0              0   
Burma                          0       0                 0              0   
Canada                         0       0                 0              0   
China                          0       0                 0              0   
Cyprus                         0       0                 0              0   
Egypt                     

#### QUESTION 3: Read the countries API and find i, the 10 largest countries. ii, the 10 most spoken languages. iii, the total number of languages in the countries API

In [6]:
import requests

# Countries API URL
countries_api = 'https://restcountries.com/v3.1/all'

# Fetch data from the Countries API
response = requests.get(countries_api)

# Check if the request was successful
if response.status_code == 200:
    # Parse JSON data
    countries_data = response.json()

    # Create a DataFrame from the JSON data
    countries_df = pd.DataFrame(countries_data)

    # Find the 10 largest countries
    largest_countries = countries_df.nlargest(10, 'area')

    # Find the 10 most spoken languages
    languages_count = countries_df['languages'].explode().value_counts()
    most_spoken_languages = languages_count.nlargest(10)

    # Calculate the total number of languages
    total_languages = languages_count.size
    
    # Print column names
    print("Column Names:", largest_countries.columns)
    
    # Print 10 Largest Countries
    print("\n10 Largest Countries:")
    print(largest_countries)
    
    # Print most spokrn languages
    print("\n10 Most Spoken Languages:")
    print(most_spoken_languages)
    
    # print total number of labguages
    print(f"\nTotal Number of Languages: {total_languages}")

else:
    print(f"Failed to fetch data from the Countries API. Status code: {response.status_code}")


Column Names: Index(['name', 'tld', 'cca2', 'ccn3', 'cca3', 'independent', 'status',
       'unMember', 'currencies', 'idd', 'capital', 'altSpellings', 'region',
       'subregion', 'languages', 'translations', 'latlng', 'landlocked',
       'area', 'demonyms', 'flag', 'maps', 'population', 'car', 'timezones',
       'continents', 'flags', 'coatOfArms', 'startOfWeek', 'capitalInfo',
       'postalCode', 'cioc', 'borders', 'fifa', 'gini'],
      dtype='object')

10 Largest Countries:
                                                  name  \
218  {'common': 'Russia', 'official': 'Russian Fede...   
120  {'common': 'Antarctica', 'official': 'Antarcti...   
237  {'common': 'Canada', 'official': 'Canada', 'na...   
84   {'common': 'China', 'official': 'People's Repu...   
231  {'common': 'United States', 'official': 'Unite...   
68   {'common': 'Brazil', 'official': 'Federative R...   
222  {'common': 'Australia', 'official': 'Commonwea...   
100  {'common': 'India', 'official': 'Republic o

#### QUESTION 4: UCI is one of the most common places to get data sets for data science and machine learning. Read the content of UCL (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4


In [7]:
import requests
from bs4 import BeautifulSoup

# URL of the UCI Machine Learning Repository
uci_url = "https://archive.ics.uci.edu/ml/datasets.php"

# Make a request to the URL
response = requests.get(uci_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract and print the text content of the page
    print(soup.get_text())
else:
    print(f"Failed to fetch content. Status code: {response.status_code}")


Failed to fetch content. Status code: 404
