### Import Libraries

In [1]:
#import needed libraries
from sqlalchemy import create_engine
import pandas as pd
import pyodbc
import os

In [2]:
#get password from environmnet var
pwd = os.environ['PGPASS']
uid = os.environ['PGUID']
driver = "{SQL Server Native Client 11.0}"
server = "localhost"
database = "AdventureWorksDW2017;"

### Establish Connection

In [3]:
# Source connection: sql server
src_conn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + '\SQLEXPRESS' + ';DATABASE=' + database + ';UID=' + uid + ';PWD=' + pwd)

In [4]:
# Destination: Postgres
engine = create_engine(f'postgresql://{uid}:{pwd}@{server}:5432/AdventureWorks')

### Read Data

In [5]:
source = pd.read_sql_query(""" SELECT top 10
CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
FROM dbo.DimCustomer; """, src_conn)
source

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
0,11000,26,AW00011000,,Jon,V,Yang,False,1971-10-06,M
1,11001,37,AW00011001,,Eugene,L,Huang,False,1976-05-10,S
2,11002,31,AW00011002,,Ruben,,Torres,False,1971-02-09,M
3,11003,11,AW00011003,,Christy,,Zhu,False,1973-08-14,S
4,11004,19,AW00011004,,Elizabeth,,Johnson,False,1979-08-05,S
5,11005,22,AW00011005,,Julio,,Ruiz,False,1976-08-01,S
6,11006,8,AW00011006,,Janet,G,Alvarez,False,1976-12-02,S
7,11007,40,AW00011007,,Marco,,Mehta,False,1969-11-06,M
8,11008,32,AW00011008,,Rob,,Verhoff,False,1975-07-04,S
9,11009,25,AW00011009,,Shannon,C,Carlson,False,1969-09-29,S


### Load initial Data to Target | Read Target Data

In [7]:
# Save the data to destination as the intial load. On the first run we load all data.
tbl_name = "stg_IncrementalLoadTest"
source.to_sql(tbl_name, engine, if_exists='replace', index=False)

In [8]:
# Read Target data into a dataframe
target = pd.read_sql('Select * from public."stg_IncrementalLoadTest"', engine)
target

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
0,11000,26,AW00011000,,Jon,V,Yang,False,1971-10-06,M
1,11001,37,AW00011001,,Eugene,L,Huang,False,1976-05-10,S
2,11002,31,AW00011002,,Ruben,,Torres,False,1971-02-09,M
3,11003,11,AW00011003,,Christy,,Zhu,False,1973-08-14,S
4,11004,19,AW00011004,,Elizabeth,,Johnson,False,1979-08-05,S
5,11005,22,AW00011005,,Julio,,Ruiz,False,1976-08-01,S
6,11006,8,AW00011006,,Janet,G,Alvarez,False,1976-12-02,S
7,11007,40,AW00011007,,Marco,,Mehta,False,1969-11-06,M
8,11008,32,AW00011008,,Rob,,Verhoff,False,1975-07-04,S
9,11009,25,AW00011009,,Shannon,C,Carlson,False,1969-09-29,S


### Read Update Source Data

In [9]:
# Let's select two additional rows from the source. We have two new records
source = pd.read_sql_query(""" SELECT top 12
CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
FROM dbo.DimCustomer; """, src_conn)
source

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
0,11000,26,AW00011000,,Jon,V,Yang,False,1971-10-06,M
1,11001,37,AW00011001,,Eugene,L,Huang,False,1976-05-10,S
2,11002,31,AW00011002,,Ruben,,Torres,False,1971-02-09,M
3,11003,11,AW00011003,,Christy,,Zhu,False,1973-08-14,S
4,11004,19,AW00011004,,Elizabeth,,Johnson,False,1979-08-05,S
5,11005,22,AW00011005,,Julio,,Ruiz,False,1976-08-01,S
6,11006,8,AW00011006,,Janet,G,Alvarez,False,1976-12-02,S
7,11007,40,AW00011007,,Marco,,Mehta,False,1969-11-06,M
8,11008,32,AW00011008,,Rob,,Verhoff,False,1975-07-04,S
9,11009,25,AW00011009,,Shannon,C,Carlson,False,1969-09-29,S


### Update a Source Record. Serve as a modified row

In [10]:
# Also update a record. I will update the middle name for customerkey: 11006
source.loc[source.MiddleName =='G', ['MiddleName']] = 'Gina'
source

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
0,11000,26,AW00011000,,Jon,V,Yang,False,1971-10-06,M
1,11001,37,AW00011001,,Eugene,L,Huang,False,1976-05-10,S
2,11002,31,AW00011002,,Ruben,,Torres,False,1971-02-09,M
3,11003,11,AW00011003,,Christy,,Zhu,False,1973-08-14,S
4,11004,19,AW00011004,,Elizabeth,,Johnson,False,1979-08-05,S
5,11005,22,AW00011005,,Julio,,Ruiz,False,1976-08-01,S
6,11006,8,AW00011006,,Janet,Gina,Alvarez,False,1976-12-02,S
7,11007,40,AW00011007,,Marco,,Mehta,False,1969-11-06,M
8,11008,32,AW00011008,,Rob,,Verhoff,False,1975-07-04,S
9,11009,25,AW00011009,,Shannon,C,Carlson,False,1969-09-29,S


### Detect Changes in data by comparing source and target

In [12]:
target.apply(tuple,1)

0    (11000, 26, AW00011000, None, Jon, V, Yang, Fa...
1    (11001, 37, AW00011001, None, Eugene, L, Huang...
2    (11002, 31, AW00011002, None, Ruben, None, Tor...
3    (11003, 11, AW00011003, None, Christy, None, Z...
4    (11004, 19, AW00011004, None, Elizabeth, None,...
5    (11005, 22, AW00011005, None, Julio, None, Rui...
6    (11006, 8, AW00011006, None, Janet, G, Alvarez...
7    (11007, 40, AW00011007, None, Marco, None, Meh...
8    (11008, 32, AW00011008, None, Rob, None, Verho...
9    (11009, 25, AW00011009, None, Shannon, C, Carl...
dtype: object

In [14]:
source.apply(tuple,1).isin(target.apply(tuple,1))

0      True
1      True
2      True
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10    False
11    False
dtype: bool

In [15]:
# detech changes. Get rows that are not present in the target.
changes = source[~source.apply(tuple,1).isin(target.apply(tuple,1))]
changes

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
6,11006,8,AW00011006,,Janet,Gina,Alvarez,False,1976-12-02,S
10,11010,22,AW00011010,,Jacquelyn,C,Suarez,False,1969-08-05,S
11,11011,22,AW00011011,,Curtis,,Lu,False,1969-05-03,M


In [16]:
# Get modified rows
modified = changes[changes.CustomerKey.isin(target.CustomerKey)]
modified

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
6,11006,8,AW00011006,,Janet,Gina,Alvarez,False,1976-12-02,S


In [17]:
# Get new records
inserts = changes[~changes.CustomerKey.isin(target.CustomerKey)]
inserts

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
10,11010,22,AW00011010,,Jacquelyn,C,Suarez,False,1969-08-05,S
11,11011,22,AW00011011,,Curtis,,Lu,False,1969-05-03,M


### Upsert data to target table

In [21]:
def update_to_sql(df, table_name, key_name):
    a = []
    table = table_name
    primary_key = key_name
    temp_table = f"{table_name}_temporary_table"
    for col in df.columns:
        if col == primary_key:
            continue
        a.append(f'"{col}"=s."{col}"')
    df.to_sql(temp_table, engine, if_exists='replace', index=False)
    update_stmt_1 = f'UPDATE public."{table}" f '
    update_stmt_2 = "SET "
    update_stmt_3 = ", ".join(a)
    update_stmt_4 = f' FROM public."{table}" t '
    update_stmt_5 = f' INNER JOIN (SELECT * FROM public."{temp_table}") AS s ON s."{primary_key}"=t."{primary_key}" '
    update_stmt_6 = f' Where f."{primary_key}"=s."{primary_key}" '
    update_stmt_7 = update_stmt_1 + update_stmt_2 + update_stmt_3 + update_stmt_4 + update_stmt_5 +  update_stmt_6 +";"
    print(update_stmt_7)
    with engine.begin() as cnx:
        cnx.execute(update_stmt_7)

In [24]:
# Call update function
update_to_sql(modified, "stg_IncrementalLoadTest", "CustomerKey")

UPDATE public."stg_IncrementalLoadTest" f SET "GeographyKey"=s."GeographyKey", "CustomerAlternateKey"=s."CustomerAlternateKey", "Title"=s."Title", "FirstName"=s."FirstName", "MiddleName"=s."MiddleName", "LastName"=s."LastName", "NameStyle"=s."NameStyle", "BirthDate"=s."BirthDate", "MaritalStatus"=s."MaritalStatus" FROM public."stg_IncrementalLoadTest" t  INNER JOIN (SELECT * FROM public."stg_IncrementalLoadTest_temporary_table") AS s ON s."CustomerKey"=t."CustomerKey"  Where f."CustomerKey"=s."CustomerKey" ;


In [25]:
target = pd.read_sql('Select * from public."stg_IncrementalLoadTest"', engine)
target

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus
0,11000,26,AW00011000,,Jon,V,Yang,False,1971-10-06,M
1,11001,37,AW00011001,,Eugene,L,Huang,False,1976-05-10,S
2,11002,31,AW00011002,,Ruben,,Torres,False,1971-02-09,M
3,11003,11,AW00011003,,Christy,,Zhu,False,1973-08-14,S
4,11004,19,AW00011004,,Elizabeth,,Johnson,False,1979-08-05,S
5,11005,22,AW00011005,,Julio,,Ruiz,False,1976-08-01,S
6,11007,40,AW00011007,,Marco,,Mehta,False,1969-11-06,M
7,11008,32,AW00011008,,Rob,,Verhoff,False,1975-07-04,S
8,11009,25,AW00011009,,Shannon,C,Carlson,False,1969-09-29,S
9,11010,22,AW00011010,,Jacquelyn,C,Suarez,False,1969-08-05,S
