<header style="padding:10px;background:#f9f9f9;border-top:3px solid #00b2b1"><img id="Teradata-logo" src="https://www.teradata.com/Teradata/Images/Rebrand/Teradata_logo-two_color.png" alt="Teradata" width="220" align="right" />

## NewSQL Analytic Functions 
### Load, Prepare and Analyze Data At Scale
</header>

**Contents**
1. Connect to the Vantage environment. Replace any hosts, schemas, usernames, etc. as necessary.
2. Create and Load Tables from source data sets.
3. Basic Data Discovery - metadata and feature analysis.
4. Working with Data - aggregations, joins, and basic transformations.
5. Advanced Data Preparation - column assignments using complex functions.
6. Visualizing results.

## Architecture of the Teradataml package
Teradataml provides easy-to-use interfaces for working with data that resides in a Teradata Vantage system.  Python developers typically rely on common, powerful data management functions from libraries such as pandas.  Teradataml extends these same functions to the Teradata ecosystem, letting users apply straightforward yet powerful analytics and data manipulation functions that leverage the full power and scale of Vantage - without moving data, without being limited by client resources, and without writing complex SQL.

**References**
* Python Package User Guide: https://docs.teradata.com/r/1YKutX2ODdO9ppo_fnguTA/root
* Teradataml Python Reference: https://docs.teradata.com/r/xLnbN80h9C6037gi3ildag/root


# Section 1 - Connecting to the Database
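With Teradataml, we would create a connection context with parameters controlling default schemas, the temporary database for creating views, etc.  This notebook runs its cells as SQL, so the connection is opened with the `%connect` magic below instead.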

In [None]:
%connect local, hidewarnings=true

# Section 2 - Create and Load Tables

## Load Sample Data from plain Files

In [None]:
-- Call-center call records; populated from CALL_CENTER_CALLS.csv by the
-- %dataload cell that follows.
CREATE TABLE CALL_CENTER_CALLS (
    CALL_ID      VARCHAR(20)
  , CUSTOMER_ID  DECIMAL(18,0)
  , CC_REP_ID    DECIMAL(18,0)
  , CC_CALL_DT   TIMESTAMP(6)
  , CALL_TYPE    VARCHAR(50)
);

In [None]:
%dataload DATABASE=demo_user, TABLE=CALL_CENTER_CALLS, skiprows=1, FILEPATH=new-sql/data/CALL_CENTER_CALLS.csv

In [None]:
-- Store visit records; populated from STORE_VISIT.csv by the %dataload cell
-- that follows.
CREATE TABLE STORE_VISIT (
    STORE_ID     DECIMAL(18,0)
  , CUSTOMER_ID  DECIMAL(18,0)
  , VISIT_DT     TIMESTAMP(6)
  , ACTION       VARCHAR(50)
);

In [None]:
%dataload DATABASE=demo_user, TABLE=STORE_VISIT, skiprows=1, FILEPATH=new-sql/data/STORE_VISIT.csv

In [None]:
-- Web browsing events, stored as a Primary Time Index (PTI) table.
-- No timestamp column is declared in the column list; later queries in this
-- notebook read the event time from this table's TD_TIMECODE column.
CREATE TABLE WEB (
    CUSTOMER_ID  DECIMAL(18,0) NOT NULL
  , SERVER_ID    VARCHAR(5)    NOT NULL
  , PAGE         VARCHAR(50)
  , BROWSE_ID    VARCHAR(20)
)
PRIMARY TIME INDEX (
    TIMESTAMP(6)
  , DATE '2016-01-01'
  , MINUTES(1)
  , COLUMNS (SERVER_ID)
  , NONSEQUENCED
);

In [None]:
%dataload DATABASE=demo_user, TABLE=WEB, skiprows=1, FILEPATH=new-sql/data/WEB.csv

In [None]:
-- Customer master data, including the CHURN_FLAG used later to select the
-- customers whose event paths are analysed. Populated from CUSTOMER.csv.
CREATE MULTISET TABLE CUSTOMER (
    CUSTOMER_ID  DECIMAL(18,0) NOT NULL
  , F_NAME       VARCHAR(30)
  , L_NAME       VARCHAR(30)
  , CUST_ZIP     VARCHAR(5)
  , VALIDITY     VARCHAR(50)
  , GENDER       CHAR(1)
  , CHURN_FLAG   VARCHAR(1)
  , CUS_LONG     FLOAT
  , CUS_LAT      FLOAT
);

In [None]:
%dataload DATABASE=demo_user, TABLE=CUSTOMER, skiprows=1, FILEPATH=new-sql/data/CUSTOMER.csv

In [None]:
-- Store master data; STORE_LOCATION is an ST_GEOMETRY value and an index is
-- declared on it. Populated from STORE.csv.
CREATE TABLE STORE (
    STORE_ID        DECIMAL(18,0) NOT NULL
  , STORE_DESC      VARCHAR(80)
  , STORE_ZIP       VARCHAR(5)
  , STORE_LOCATION  ST_GEOMETRY
)
INDEX (STORE_LOCATION);

In [None]:
%dataload DATABASE=demo_user, TABLE=STORE, skiprows=1, FILEPATH=new-sql/data/STORE.csv

In [None]:
-- Web server reference data; SERVER_LOCATION is an ST_GEOMETRY value and an
-- index is declared on it. Populated from SERVER.csv.
CREATE MULTISET TABLE SERVER (
    SERVER_ID        VARCHAR(5) NOT NULL
  , SERVER_ZIP       VARCHAR(5)
  , SERVER_LOCATION  ST_GEOMETRY
)
INDEX (SERVER_LOCATION);

In [None]:
%dataload DATABASE=demo_user, TABLE=SERVER, skiprows=1, FILEPATH=new-sql/data/SERVER.csv

# Section 3 - Data Discovery

### Look at table statistics, sample data, simple exploration

In [None]:
-- Peek at a 10-row sample of the CUSTOMER table.
SELECT *
FROM CUSTOMER
SAMPLE 10;

In [None]:
-- Column-level summary of a mix of text and numeric CUSTOMER columns.
SELECT *
FROM TD_ColumnSummary (
    ON CUSTOMER AS InputTable
    USING
        TargetColumns ('L_NAME', 'GENDER', 'CHURN_FLAG', 'CUS_LONG', 'CUS_LAT')
) AS col_summary;

In [None]:
-- Categorical summary of the GENDER and CHURN_FLAG columns, sorted by the
-- first two output columns of the function.
SELECT *
FROM TD_CategoricalSummary (
    ON CUSTOMER AS InputTable
    USING
        TargetColumns ('GENDER', 'CHURN_FLAG')
) AS cat_summary
ORDER BY 1, 2;

In [None]:
-- Univariate statistics for the numeric CUS_LONG and CUS_LAT columns, sorted
-- by the first two output columns of the function.
SELECT *
FROM TD_UnivariateStatistics (
    ON CUSTOMER AS InputTable
    USING
        TargetColumns ('CUS_LONG', 'CUS_LAT')
) AS uni_stats
ORDER BY 1, 2;

# Section 4 - Data Analysis

### Data Aggregations and Plotting

In [None]:
-- Number of calls per call type, most frequent first.
-- The output column names CALL_TYPE and N are the ones the %chart cells plot.
SELECT
    CALL_TYPE,
    COUNT(1) AS N
FROM CALL_CENTER_CALLS
GROUP BY CALL_TYPE
ORDER BY N DESC;

In [None]:
%chart CALL_TYPE, N

In [None]:
%chart CALL_TYPE, N, Title= FREQ BY CALL TYPE, Height=400, Width=800

### Building the Customer Events Table (Customer 360)

In [None]:
-- Customer 360 event table: stacks call-center calls, store visits and web
-- page views into a single (CUSTOMER_ID, DATESTAMP, EVENT, CHURN_FLAG) stream.
--
-- MULTISET is stated explicitly. Without it the table kind depends on the
-- session mode, and a SET table would silently discard duplicate event rows
-- that UNION ALL deliberately keeps, skewing the event counts taken below.
-- It also matches the explicit MULTISET on the tables derived from this one.
--
-- Each branch uses an explicit INNER JOIN so the join condition cannot be
-- confused with (or lost among) row filters.
CREATE MULTISET TABLE RETAIL_COMPNEW
(CUSTOMER_ID, DATESTAMP, EVENT, CHURN_FLAG) AS
(
    -- Call-center calls: the call type is the event.
    SELECT
        CU.CUSTOMER_ID AS CUSTOMER_ID
      , CC.CC_CALL_DT  AS DATESTAMP
      , CC.CALL_TYPE   AS EVENT
      , CU.CHURN_FLAG  AS CHURN_FLAG
    FROM CUSTOMER AS CU
    INNER JOIN CALL_CENTER_CALLS AS CC
        ON CU.CUSTOMER_ID = CC.CUSTOMER_ID

    UNION ALL

    -- Store visits: the visit action is the event.
    SELECT
        CU.CUSTOMER_ID AS CUSTOMER_ID
      , ST.VISIT_DT    AS DATESTAMP
      , ST.ACTION      AS EVENT
      , CU.CHURN_FLAG  AS CHURN_FLAG
    FROM CUSTOMER AS CU
    INNER JOIN STORE_VISIT AS ST
        ON CU.CUSTOMER_ID = ST.CUSTOMER_ID

    UNION ALL

    -- Web browsing: the visited page is the event; the event time comes from
    -- the WEB table's TD_TIMECODE column.
    SELECT
        CU.CUSTOMER_ID AS CUSTOMER_ID
      , WB.TD_TIMECODE AS DATESTAMP
      , WB.PAGE        AS EVENT
      , CU.CHURN_FLAG  AS CHURN_FLAG
    FROM CUSTOMER AS CU
    INNER JOIN WEB AS WB
        ON CU.CUSTOMER_ID = WB.CUSTOMER_ID
)
WITH DATA;

In [None]:
-- All events of one example customer, in chronological order.
SELECT *
FROM RETAIL_COMPNEW
WHERE CUSTOMER_ID = 64497
ORDER BY DATESTAMP;

In [None]:
-- Size of the event table: distinct customers (NCLI) and total rows (N).
SELECT
    COUNT(DISTINCT CUSTOMER_ID) AS NCLI,
    COUNT(1) AS N
FROM RETAIL_COMPNEW;

In [None]:
-- Number of rows per event type, most frequent first.
SELECT
    EVENT,
    COUNT(1) AS N
FROM RETAIL_COMPNEW
GROUP BY EVENT
ORDER BY N DESC;

### Sessionization

In [None]:
-- Sessionize the event stream of churned customers (CHURN_FLAG = 'Y').
-- Rows are partitioned per customer and ordered by DATESTAMP.
-- NOTE(review): TIMEOUT 604800 equals 7 * 24 * 3600, i.e. seven days if the
-- unit is seconds, even though the table name says "DAILY" -- confirm intent.
CREATE MULTISET TABLE DAILY_SESSIONS_CHURN AS (
    SELECT *
    FROM SESSIONIZE (
        ON (
            SELECT *
            FROM RETAIL_COMPNEW
            WHERE CHURN_FLAG = 'Y'
        )
        PARTITION BY CUSTOMER_ID
        ORDER BY DATESTAMP
        USING
            TIMECOLUMN('DATESTAMP')
            TIMEOUT(604800)
    ) AS DT
)
WITH DATA;

In [None]:
-- Sessionized events of one example customer, oldest first.
-- WHERE is placed before ORDER BY (standard clause order; the original had
-- them reversed), and the sort key is named rather than positional: the
-- second column of DAILY_SESSIONS_CHURN is DATESTAMP, inherited from
-- RETAIL_COMPNEW's column list.
SELECT *
FROM DAILY_SESSIONS_CHURN
WHERE CUSTOMER_ID = 64497
ORDER BY DATESTAMP ASC;

### nPath Analysis
Building Event Sequences leading to Product Return

In [None]:
-- nPath over the sessionized events of churned customers.
-- Within each (CUSTOMER_ID, SESSIONID) partition, ordered by DATESTAMP, the
-- pattern matches one to four non-return events (symbol E) followed by a
-- 'Product Return' event (symbol C). Each match yields one row with the
-- customer, the first and last timestamps of the match, the number of E
-- events, and the accumulated event path.
CREATE MULTISET TABLE NPATH_CHURN AS (
    SELECT *
    FROM NPATH (
        ON DAILY_SESSIONS_CHURN
        PARTITION BY CUSTOMER_ID, SESSIONID
        ORDER BY DATESTAMP
        USING
            MODE(NONOVERLAPPING)
            PATTERN('E{1,4}.C')
            SYMBOLS (
                EVENT = 'Product Return'  AS C,
                EVENT <> 'Product Return' AS E
            )
            RESULT (
                FIRST(CUSTOMER_ID OF ANY(E,C)) AS CUSTOMER_ID,
                FIRST(DATESTAMP OF ANY(E,C))   AS DS_START,
                LAST(DATESTAMP OF ANY(E,C))    AS DS_END,
                COUNT(* OF E)                  AS EVENT_CNT,
                ACCUMULATE(EVENT OF ANY(E,C))  AS PATH
            )
    ) AS DT
)
WITH DATA;

In [None]:
-- Matched paths for one example customer.
SELECT *
FROM NPATH_CHURN
WHERE CUSTOMER_ID = 64497;

### Most Frequent Routes to Product Return

In [None]:
-- Keep the 100 most frequent event paths together with their occurrence count.
CREATE TABLE PATH_TAB AS (
    SELECT TOP 100
        path,
        COUNT(*) AS cnt
    FROM NPATH_CHURN
    GROUP BY path
    ORDER BY cnt DESC
)
WITH DATA;

In [None]:
-- Show the ten most frequent paths.
SELECT TOP 10 *
FROM PATH_TAB
ORDER BY cnt DESC;

### Finish the Demo

In [None]:
-- Cleanup: the cells from here on drop the tables used by this demo, one
-- table per cell.
DROP TABLE CALL_CENTER_CALLS;

In [None]:
DROP TABLE STORE_VISIT;

In [None]:
DROP TABLE WEB;

In [None]:
DROP TABLE STORE;

In [None]:
DROP TABLE CUSTOMER;

In [None]:
-- NOTE(review): no visible cell in this notebook creates CUST_COMMENT, so this
-- statement presumably fails unless the table already exists -- confirm.
DROP TABLE CUST_COMMENT;

In [None]:
DROP TABLE SERVER;

In [None]:
DROP TABLE RETAIL_COMPNEW;

In [None]:
DROP TABLE DAILY_SESSIONS_CHURN;

In [None]:
DROP TABLE NPATH_CHURN;

In [None]:
DROP TABLE PATH_TAB;