# Join Tables from Two Data Sources

This notebook fetches data tables from 2 different sources and join them on a key column.

### Table of Contents

1.  [Step 1: Fetch data from SAP Datasphere using the SAP HANA Connector](#fetch_data_from_dsp)

1.  [Step 2: Fetch data from Db2 through watsonx.data using the Presto Connector](#fetch_data_from_db2)

1.  [Step 3: Join the data columns in Pandas](#join_data)

1.  [Step 4: Save the data as CSV file](#save_data)

<a id="fetch_data_from_dsp"></a>
## Step 1: Fetch data from SAP Datasphere using the SAP HANA connector

In [1]:
import itc_utils.flight_service as itcfs

readClient = itcfs.get_flight_client()

# NOTE:
#  A limit of 90000 rows has been applied to the request to enable sample previewing.
#  Adjust the display message as needed by editing the following lines:
from IPython.display import display, HTML
display(HTML("A row limit of 90000 has been applied to the query to enable sample previewing. If the data set is larger, only the first 90000 rows will be loaded."))
#  Edit select_statement to change or disable the row limit.
#
data_request_1 = {
    'connected_data_name': """epp_primary_dataset""",
    'interaction_properties': {
        'row_limit': 90000
    }
}

flightInfo = itcfs.get_flight_info(readClient, nb_data_request=data_request_1)

data_df_1 = itcfs.read_pandas_and_concat(readClient, flightInfo, timeout=240)
#data_df_1.columns = data_df_1.columns.str.upper()

data_df_1.head(5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,any_awards_won,is_promoted
0,36904,Sales & Marketing,region_15,Bachelor's,m,other,1,29,3.0,2,0,0
1,32877,Sales & Marketing,region_2,Bachelor's,f,other,1,40,3.0,12,0,0
2,63026,Sales & Marketing,region_26,Bachelor's,m,other,1,30,3.0,2,0,0
3,24675,Sales & Marketing,region_24,Bachelor's,m,other,1,36,3.0,10,0,0
4,18119,Sales & Marketing,region_15,Bachelor's,m,other,1,42,3.0,12,0,0


<a id="fetch_data_from_db2"></a>
## Step 2: Fetch data from Db2 through watsonx.data using the Presto connector

In [2]:
import itc_utils.flight_service as itcfs

readClient = itcfs.get_flight_client()

# NOTE:
#  A limit of 90000 rows has been applied to the request to enable sample previewing.
#  Adjust the display message as needed by editing the following lines:
from IPython.display import display, HTML
display(HTML("A row limit of 90000 has been applied to the query to enable sample previewing. If the data set is larger, only the first 90000 rows will be loaded."))
#  Edit select_statement to change or disable the row limit.
#
data_request_2 = {
    'connected_data_name': """epp_extra_dataset""",
    'interaction_properties': {
        'row_limit': 90000
    }
}

flightInfo = itcfs.get_flight_info(readClient, nb_data_request=data_request_2)

data_df_2 = itcfs.read_pandas_and_concat(readClient, flightInfo, timeout=240)
#data_df_2.columns = data_df_2.columns.str.upper()

data_df_2.head(5)

Unnamed: 0,employee_id,kpis_met_above_80_percent,avg_training_score
0,65438,1,49
1,65141,0,60
2,7513,0,50
3,2542,0,50
4,48945,0,73


<a id="join_data"></a>
## Step 3: Join the data columns in Pandas

In [3]:
import pandas as pd

merged_data = pd.merge(data_df_1, data_df_2, on=['employee_id'])
merged_data.head(5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,any_awards_won,is_promoted,kpis_met_above_80_percent,avg_training_score
0,36904,Sales & Marketing,region_15,Bachelor's,m,other,1,29,3.0,2,0,0,0,51
1,32877,Sales & Marketing,region_2,Bachelor's,f,other,1,40,3.0,12,0,0,0,50
2,63026,Sales & Marketing,region_26,Bachelor's,m,other,1,30,3.0,2,0,0,0,50
3,24675,Sales & Marketing,region_24,Bachelor's,m,other,1,36,3.0,10,0,0,0,50
4,18119,Sales & Marketing,region_15,Bachelor's,m,other,1,42,3.0,12,0,0,0,50


In [4]:
merged_data.shape

(54808, 14)

In [5]:
merged_data.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'any_awards_won', 'is_promoted',
       'kpis_met_above_80_percent', 'avg_training_score'],
      dtype='object')

<a id="save_data"></a>
## Step 4: Save the data as CSV file

In [6]:
from ibm_watson_studio_lib import access_project_or_space
wslib = access_project_or_space()

In [7]:
from io import BytesIO

# write the dataframe to a buffer
buffer = BytesIO()
merged_data.to_csv(buffer)

# reset for subsequent reading
buffer.seek(0)

# Save the data to project
assetname="merged_data.csv"
wslib.save_data(assetname, data=buffer.read(), overwrite=True)

{'name': 'merged_data.csv',
 'asset_type': 'data_asset',
 'asset_id': '97037ea1-7550-4d01-8c10-332f1b8f58f2',
 'attachment_id': '0bbb3dc0-33b6-4efb-8553-8cf677420340',
 'filepath': 'data_asset/merged_data.csv',
 'data_size': 4123960,
 'mime': 'text/csv',
 'summary': ['looked up asset',
  'selected attachment',
  'overwritten file',
  'updated attachment'],
 'access_count': 1}