### Conneting to database and importing libaries

In [4]:
# Import required libraries
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
%load_ext sql
from IPython.display import Image, display

# Load environment variables
load_dotenv()

# Configure pandas display format
pd.options.display.float_format = '{:.2f}'.format

# Get database credentials from environment variables
DB_PASSWORD = os.getenv('DB_PASSWORD')

# Set the DATABASE_URL environment variable explicitly
os.environ['DATABASE_URL'] = f"postgresql://postgres:{DB_PASSWORD}@localhost:5432/contoso_100k"

# Connect using the environment variable
%sql ${DATABASE_URL}

# Enable automatic conversion of SQL results to pandas DataFrames
%config SqlMagic.autopandas = True

# Disable named parameters for SQL magic
%config SqlMagic.named_parameters = "disabled"

# Test the connection with a simple query
%sql SELECT version();

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


Unnamed: 0,version
0,"PostgreSQL 17.4 on x86_64-windows, compiled by..."


#### Exploring the contoso database for tables

In [3]:
%%sql
SELECT table_name 
FROM information_schema.tables 
WHERE table_schema = 'public'
ORDER BY table_name;

UsageError: No active connection.

To fix it:

Pass a valid connection string:
    Example: %sql postgresql://username:password@hostname/dbname

OR

Set the environment variable $DATABASE_URL

For more details, see: https://jupysql.ploomber.io/en/latest/connecting.html


#### Exploring the columns in sales table

In [8]:
%%sql
SELECT *
from information_schema.columns
Where table_name = 'sales'

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,contoso_100k,public,sales,exchangerate,13,,YES,double precision,,,...,NO,,,,,,NO,NEVER,,YES
1,contoso_100k,public,sales,linenumber,2,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
2,contoso_100k,public,sales,orderdate,3,,YES,date,,,...,NO,,,,,,NO,NEVER,,YES
3,contoso_100k,public,sales,deliverydate,4,,YES,date,,,...,NO,,,,,,NO,NEVER,,YES
4,contoso_100k,public,sales,customerkey,5,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
5,contoso_100k,public,sales,storekey,6,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
6,contoso_100k,public,sales,productkey,7,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
7,contoso_100k,public,sales,quantity,8,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
8,contoso_100k,public,sales,unitprice,9,,YES,double precision,,,...,NO,,,,,,NO,NEVER,,YES
9,contoso_100k,public,sales,netprice,10,,YES,double precision,,,...,NO,,,,,,NO,NEVER,,YES


In [None]:
import plotly.express as px
import calendar

# Create months list for 2024
months = [f"2023-{calendar.month_abbr[i]}" for i in range(1, 15)]

# Create the line chart
px.line(
    _, 
    x=_.order_month.apply(lambda x: f"2023-{calendar.month_abbr[int(x.split('-')[1])]}"),
    y='running_total_sales',
    markers=True,
    title='Running Total Sales Over Time',
    labels={'x': 'Month', 'running_total_sales': 'Running Total Sales ($)'}
).update_layout(
    xaxis={
        'categoryorder': 'array',
        'categoryarray': months
    }
).show()

IndexError: list index out of range

#### Isolating only columns

In [9]:
%%sql
SELECT column_name
from information_schema.columns
Where table_name = 'sales'

Unnamed: 0,column_name
0,exchangerate
1,linenumber
2,orderdate
3,deliverydate
4,customerkey
5,storekey
6,productkey
7,quantity
8,unitprice
9,netprice


#### Isolating the columns in products table

In [10]:
%%sql
SELECT column_name
from information_schema.columns
Where table_name = 'product'

Unnamed: 0,column_name
0,productkey
1,productcode
2,weight
3,cost
4,price
5,categorykey
6,subcategorykey
7,categoryname
8,subcategoryname
9,productname


#### Isolating the columns in customer table

In [11]:
%%sql
SELECT column_name
from information_schema.columns
Where table_name = 'customer'

Unnamed: 0,column_name
0,customerkey
1,geoareakey
2,startdt
3,enddt
4,birthday
5,age
6,latitude
7,longitude
8,middleinitial
9,surname
