## Spark Session

In [1]:
from pyspark.sql import SparkSession


# Spark Session & context 

spark = (SparkSession
         .builder
         .master("local")
            .appName("load-postgres")
        
            #Add Postgres Jar 
            .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/postgresql-42.4.0.jar")
                .getOrCreate())

sc = spark.sparkContext

## Read CSV Data

In [2]:
# Reading Movies Data 

df_movies_csv = (spark.read
                    .format("csv")
                        .option("header", True)
                            .load("/home/jovyan/work/data/movies.csv"))

In [3]:
# Reading Ratings Data 

df_ratings_csv = (spark.read
                    .format("csv")
                      .option("header", True)
                        .load("/home/jovyan/work/data/ratings.csv")
                              .withColumnRenamed("timestamp", "timestamp_epoch"))

In [4]:
# Convert epoch to timestamp and rating to DoubleType
from pyspark.sql.functions import from_unixtime, col, to_timestamp
from pyspark.sql.types import DoubleType

df_ratings_csv_fmt = (
    df_ratings_csv
    .withColumn('rating', col("rating").cast(DoubleType()))
    .withColumn('timestamp', to_timestamp(from_unixtime(col("timestamp_epoch"))))
)

## Load Data to Postgres

In [5]:
(df_movies_csv.write
 .format("jdbc")
 .option("url", "jdbc:postgresql://postgres/test")
 .option("dbtable", "public.movies")
 .option("user", "test")
 .option("password", "postgres")
 .mode("overwrite")
 .save())

In [6]:
(df_ratings_csv_fmt
 .select([c for c in df_ratings_csv_fmt.columns if c != "timestamp_epoch"])
 .write
 .format("jdbc")
 .option("url", "jdbc:postgresql://postgres/test")
 .option("dbtable", "public.ratings")
 .option("user", "test")
 .option("password", "postgres")
 .mode("overwrite")
 .save())

## ETL Process

In [8]:
pip install ipython-sql

Collecting ipython-sql
  Downloading ipython_sql-0.4.1-py3-none-any.whl (21 kB)
Collecting sqlparse
  Downloading sqlparse-0.4.2-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m599.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting prettytable<1
  Downloading prettytable-0.7.2.zip (28 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: prettytable
  Building wheel for prettytable (setup.py) ... [?25ldone
[?25h  Created wheel for prettytable: filename=prettytable-0.7.2-py3-none-any.whl size=13695 sha256=3256440cd07c4a0e85c8ced6c4fc08449152def3fc3010339407fee31cbe7166
  Stored in directory: /home/jovyan/.cache/pip/wheels/25/4b/07/18c5d92824315576e478206ea69df34a9e31958f6143eb0e31
Successfully built prettytable
Installing collected packages: prettytable, sqlparse, ipython-sql
Successfully installed ipython-sql-0.4.1 prettytable-0.7.2 sqlparse-0.4.2
Note: you may need to re

In [10]:
pip install sqlalchemy

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.20.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install pyscopg2

[31mERROR: Could not find a version that satisfies the requirement pyscopg2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pyscopg2[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [14]:
pip install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import pandas as pd

from dotenv import dotenv_values
from sqlalchemy import create_engine, inspect

CONFIG = dotenv_values('.env')
if not CONFIG:
    CONFIG = os.environ

connection_uri = "postgresql+psycopg2://{}:{}@{}:{}".format(
    CONFIG["airflow"],
    CONFIG["airflow"],
    CONFIG['database'],
    CONFIG["5432"],
)

KeyError: 'airflow'