In [2]:
import requests
import json

def fetch_countries_data(url="https://restcountries.com/v3.1/all", timeout=30):
    """Download country records from the REST Countries API.

    Args:
        url: Endpoint to query. Defaults to the "all countries" endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with anything other than
            HTTP 200 (4xx/5xx via ``raise_for_status``, other codes explicitly).
        requests.RequestException: On connection problems or timeouts.
    """
    # A Session is used as a context manager so the underlying connection is
    # released as soon as the request is finished.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()

        # raise_for_status() only covers 4xx/5xx. Any other non-200 answer has
        # no usable payload, so fail loudly instead of returning an error
        # string that a caller could mistake for data.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Error: {response.status_code}", response=response
            )
        return response.json()

# Fetch the data; the result is kept in `countries_data` for later cells.
countries_data = fetch_countries_data()
# print(type(countries_data))


In [4]:
import os
# Local-development defaults for the MinIO credentials. setdefault() leaves any
# value already present in the environment untouched, so credentials supplied
# from outside the notebook are not overwritten by these placeholders.
os.environ.setdefault("MINIO_KEY", "minio")
os.environ.setdefault("MINIO_SECRET", "minio123")

In [12]:
!pip install minio

Collecting minio
  Downloading minio-7.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting pycryptodome (from minio)
  Downloading pycryptodome-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading minio-7.2.0-py3-none-any.whl (83 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 83.5/83.5 kB 2.1 MB/s eta 0:00:00
Downloading pycryptodome-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 2.8 MB/s eta 0:00:00
Installing collected packages: pycryptodome, minio
Successfully installed minio-7.2.0 pycryptodome-3.19.0


In [13]:
!pip install s3fs

Collecting s3fs
  Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)
Collecting fsspec==2023.12.2 (from s3fs)
  Downloading fsspec-2023.12.2-py3-none-any.whl.metadata (6.8 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from s3fs)
  Downloading aiohttp-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting botocore<1.33.14,>=1.33.2 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.33.13-py3-none-any.whl.metadata (6.1 kB)
Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp!=4.0.

In [6]:
from minio import Minio
import pandas as pd
import s3fs

# S3-compatible filesystem handle pointing at the MinIO endpoint. Credentials
# are read from the MINIO_KEY / MINIO_SECRET environment variables.
fs = s3fs.S3FileSystem(
    key=os.environ["MINIO_KEY"],
    secret=os.environ["MINIO_SECRET"],
    # Plain HTTP endpoint; switch to True if MinIO is served over SSL.
    use_ssl=False,
    client_kwargs=dict(endpoint_url="http://minio1:9000"),
)

In [7]:
# Serialise the fetched records and store them as a single JSON document
# in the bucket.
with fs.open('mybucket/data.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(countries_data))

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F

# Imported here as well so this cell still works when it is the first one
# executed on a fresh kernel.
import os

# Spark session wired to the MinIO endpoint through the S3A connector.
# Credentials come from the same MINIO_KEY / MINIO_SECRET environment
# variables used for the s3fs client; the literals are only fallbacks that
# keep the previous behaviour when the variables are not set.
# NOTE(review): hadoop-aws 3.3.4 is, as far as I know, built against
# aws-java-sdk-bundle 1.12.x; confirm the 1.11.1026 pin is intentional.
spark = (
    SparkSession.builder
    .appName("MinIO Test")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio1:9000")
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_KEY", "minio"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET", "minio123"))
    # MinIO serves buckets under the path, not as a sub-domain.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .enableHiveSupport()
    .getOrCreate()
)

In [10]:
# The JSON reader infers the schema from the data on its own; no "inferSchema"
# option (a CSV reader setting) is needed.
df = spark.read.json("s3a://mybucket/data.json")

In [15]:
# df.printSchema()

In [12]:
# Sanity check: number of rows Spark read from the JSON file.
df.count()

250

In [14]:
# df.show(2,truncate = False)

In [16]:
# Persist the untouched records as Parquet, replacing any previous copy.
df.write.parquet("s3a://mybucket/raw_data.parquet", mode="overwrite")

In [2]:
# Read the raw Parquet copy back from the bucket and display its row count.
raw_data = spark.read.format("parquet").load("s3a://mybucket/raw_data.parquet")
raw_data.count()

250

In [9]:
# Keep a flat subset of the raw columns under project-specific names.
# Note: `nr_timezones` carries the array of timezone strings itself (see the
# printed schema), not a count.
transformed_data = raw_data.select(
    F.col("name.common").alias("cntry_name"),
    F.col("area").alias("cntry_area"),
    F.col("borders").alias("border_cntry"),
    F.col("capital").alias("capital_cities"),
    F.col("continents").alias("cntry_continent"),
    F.col("landlocked").alias("is_landlocked"),
    F.col("population"),
    F.col("startOfWeek"),
    F.col("timezones").alias("nr_timezones"),
    F.col("unMember").alias("is_unmember"),
)
transformed_data.printSchema()

root
 |-- cntry_name: string (nullable = true)
 |-- cntry_area: double (nullable = true)
 |-- border_cntry: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- capital_cities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cntry_continent: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_landlocked: boolean (nullable = true)
 |-- population: long (nullable = true)
 |-- startOfWeek: string (nullable = true)
 |-- nr_timezones: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_unmember: boolean (nullable = true)



In [10]:
# Preview the first ten transformed rows without cutting off long values.
transformed_data.show(n=10, truncate=False)

+----------------+----------+-------------------------+------------------+---------------+-------------+----------+-----------+---------------------------------+-----------+
|cntry_name      |cntry_area|border_cntry             |capital_cities    |cntry_continent|is_landlocked|population|startOfWeek|nr_timezones                     |is_unmember|
+----------------+----------+-------------------------+------------------+---------------+-------------+----------+-----------+---------------------------------+-----------+
|Christmas Island|135.0     |NULL                     |[Flying Fish Cove]|[Asia]         |false        |2072      |monday     |[UTC+07:00]                      |false      |
|Eritrea         |117600.0  |[DJI, ETH, SDN]          |[Asmara]          |[Africa]       |false        |5352000   |monday     |[UTC+03:00]                      |true       |
|Samoa           |2842.0    |NULL                     |[Apia]            |[Oceania]      |false        |198410    |monday     |[UT

In [11]:
# Persist the transformed records as Parquet, replacing any previous copy.
transformed_data.write.parquet("s3a://mybucket/trnsfm_data.parquet", mode="overwrite")

In [12]:
# Register the transformed Parquet files as an external table.
# IF NOT EXISTS keeps the cell re-runnable: without it a second execution fails
# because the table is already registered in the metastore.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS country_data (
    cntry_name STRING,
    cntry_area DOUBLE,
    border_cntry ARRAY<STRING>,
    capital_cities ARRAY<STRING>,
    cntry_continent ARRAY<STRING>,
    is_landlocked BOOLEAN,
    population BIGINT,
    startOfWeek STRING,
    nr_timezones ARRAY<STRING>,
    is_unmember BOOLEAN
)
STORED AS PARQUET
LOCATION 's3a://mybucket/trnsfm_data.parquet'
""").show()

++
||
++
++



In [13]:
# List the databases visible to this Spark session.
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [18]:
# Print the table's columns and extended metadata: up to 100 rows, untruncated.
spark.sql("DESCRIBE EXTENDED default.country_data").show(n=100, truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|cntry_name                  |string                                                        |NULL   |
|cntry_area                  |double                                                        |NULL   |
|border_cntry                |array<string>                                                 |NULL   |
|capital_cities              |array<string>                                                 |NULL   |
|cntry_continent             |array<string>                                                 |NULL   |
|is_landlocked               |boolean                                                       |NULL   |
|population                  |bigint                                              

In [21]:
# raw_data.printSchema()

In [8]:
# import os

# # Retrieve all environment variables
# environment_variables = os.environ

# # Print each environment variable
# for key, value in environment_variables.items():
#     print(f"{key}: {value}")