# 0.0 Imports 

In [1]:
# Manipulation
import pandas as pd
import numpy as np 
import csv
import inflection
import datetime

#Read dataset
from pyxlsb import open_workbook as open_xlsb
from openpyxl import Workbook
import xlwt
import xlrd
import xlwt

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

#Possible Displays
from IPython.display import Image
from IPython.core.display import HTML

# In case of ML (See in the process)
import random
import warnings
import requests


## 0.1 Helper Functions
In this section, we define some helper functions that will be used throughout the project.

In [2]:
# Notebook-wide plotting and display configuration.

# Use seaborn's "darkgrid" style and "muted" color palette for all plots.
sns.set_style("darkgrid")
sns.set_palette("muted")

# IPython magic: render matplotlib figures inline in the notebook.
# NOTE(review): this line is only valid inside IPython/Jupyter, not in a plain .py script.
%matplotlib inline

# Display every column of a DataFrame (None removes the column limit).
pd.options.display.max_columns = None

# setting the title and axis labels
def set_plots_features(ax, title, xlabel, ylabel):
    """Apply the title and both axis labels to a plot axis.

    Args:
        ax: Object exposing ``set_title``, ``set_xlabel`` and ``set_ylabel``
            (each called with the text and a ``fontsize`` keyword).
        title: Text for the plot title (font size 18).
        xlabel: Text for the x-axis label (font size 14).
        ylabel: Text for the y-axis label (font size 14).
    """
    # (setter name, text, font size), applied in the order title -> x -> y.
    label_specs = (
        ("set_title", title, 18),
        ("set_xlabel", xlabel, 14),
        ("set_ylabel", ylabel, 14),
    )
    for setter_name, text, size in label_specs:
        getattr(ax, setter_name)(text, fontsize=size)
    
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries in use — confirm that is intended.
warnings.filterwarnings('ignore')

# Descriptive Statistics

def get_descriptive_statistics(data_set):
    """Build a per-column summary table of descriptive statistics.

    Every statistic is computed with the pandas reducers, which all skip
    missing values. This keeps ``min``, ``max`` and ``range`` consistent with
    each other and stops ``min``/``max``/``median`` from turning into NaN (or
    depending on row order) when a column contains NaN.

    Args:
        data_set: DataFrame whose columns are numerical attributes.

    Returns:
        DataFrame with one row per input column and the columns
        ``attributes, min, max, range, mean, median, std, skewness,
        kurtosis``. ``std`` is the population standard deviation (ddof=0),
        the same convention as ``np.std``.
    """
    # Computed once and reused so that range is exactly max - min.
    min_value = data_set.min()
    max_value = data_set.max()

    # One Series per statistic, all indexed by the original column names.
    summary_stats = pd.DataFrame({
        'min': min_value,
        'max': max_value,
        'range': max_value - min_value,
        'mean': data_set.mean(),
        'median': data_set.median(),
        # ddof=0 keeps the population std; pandas would default to ddof=1.
        'std': data_set.std(ddof=0),
        'skewness': data_set.skew(),
        'kurtosis': data_set.kurtosis(),
    }).reset_index()

    # Rename the index column produced by reset_index to 'attributes'.
    summary_stats.columns = ['attributes', 'min', 'max',
                             'range', 'mean', 'median', 'std', 'skewness', 'kurtosis']
    return summary_stats

## 0.2  Business Problem

### 0.2.1 -- Business Case – Guidelines

    You are overseeing Data Analytics for a contact center project. You arrive at the office one day, and the contact center manager calls you about recent results. The client believes CSAT is one of the most important metrics (customers are everything to them) and has been complaining because, in 2018, we were not able to maintain a good CSAT – in fact, CSAT kept decreasing, reaching its minimum in December – and wants to know what happened and which actions we will take in the future. 
    
      As a Data Analytics expert, your job is to figure out why we couldn’t achieve good results (the problem was previously poorly diagnosed, as no action plan worked) and to propose an action plan that can prevent such results from happening again. 
      
 ![image.png](../img/01_csat.png)
 
      As a guideline, here follows the main metrics, targets and formulas:
      
#### • Answer rate - 92% - #AnsweredCalls/ OfferedCalls
#### • CSAT - 85% - #Surveys8to10/#Surveys
#### • DSAT - 8% - #Surveys1to3/#Surveys
#### • SLA email - 85% - #AnsweredLess24H/ #Answered 
      
      Additionally, propose a report template to ensure visibility over key metrics of the project to support stakeholders’ future control & decision.

## 0.3 Understanding the Challenge

### 0.3.1 -- Why ? 
   #### -- What is the type of business of Teleperformance? - 
     Outsourcing and Technology. Their mission is to provide customer experience excellence at each interaction opportunity.
     
   ![image.png](../img/02_htech-htouch.png)
   
   #### -- What are the focuses of the company?
     The company has three well-defined focuses: customer, innovation and efficiency. 
   #### -- Offer ? (Solutions for Customers)
           -- E-mail 
           -- Calls 
           
   #### -- Environment - Call Center
   #### -- Target - Increase Service Level Agreement (SLA)
   #### -- Goals - Find deviations in the procedures used to resolve customer issues.
           1) Deliverables 
           a) Exploratory Data Analysis 
           b) Insights for a better customer experience
           c) Power BI Dashboard
           d) ML algorithm to predict call occurrences in the next month.
  
    

## 0.4 Understanding the Dataset
This part of the problem is essential!
Try to understand the nature of the columns (What they mean) and then establish the granularity, and know what is essential or not, based on the metrics previously established in 0.2

                                    The first step is to UNDERSTAND some important ACRONYMS,
                                    although most of the columns are self-explanatory,
                                    in order to understand the importance of each one and its values.
                                


### TABS

#### Sheet 1 - FACT HSPLIT
    -- Columns 
          -> DIM CALENDAR DATE - 
          -> DIM TIME HOUR
          -> DIM TIME MINUTES
          -> DIM TIME. TIME ID
          -> AfterCall Time - HSPLIT
###### a)  After-call work refers to the tasks that an outsourced call center representative completes after they have finished interacting with a customer.
###### b) Average after-call work time is measured by adding the total time spent by a specific representative (or team) over a set period and dividing the sum by the total number of calls over the same timeframe.
          -> AnswerTime - HSPLIT    
###### a) ASA - call center metric for the average amount of time it takes for calls to be answered in a call center during a specific time period.
###### b) Average Speed of Answer = Total Waiting Time for Answered Calls/Total Number of Answered Calls
          -> CALLS ABANDONED - HSPLIT - AUTO EXPLAINED
          -> Answered Calls  AUTO EXPLAINED
          -> Offered Calls - AUTO EXPLAINED
          -> Handle Time - HSPLIT   -AHT 
###### a) AHT is the average time it takes to handle a call or transaction from start to finish – from call initiation, to hold time, to talk time, and all the way through to any related tasks an agent must perform post-phone call to resolve that call.
###### b) It's not a success metric when analyzed alone, because rushing agents to close tickets, rather than resolve issues, would shorten your AHT but would not help CSAT, for example.
###### c) Formula -  AHT, add your total talk time + total hold time + total after-call tasks, and then divide by the number of total calls.
          -> HoldCall - HSSPLIT 
###### a) Hold time is the average time it takes for an operator to answer a call. It's the amount of time a customer waits in the queue before getting a response from an agent.          
          -> I Aux time - HSPLIT -
###### a)Auxiliary time is the time an agent spends in unavailable status in the call center system, or during which the agent cannot accept new calls because they are on a break.
###### b) Aux codes are used to keep track of call center agent time that has been voluntarily used to not accept calls. More specifically, it's used to manage non-call activities of agents.
          -> IDLE TIME - HSPLIT
###### a) Idle time is paid time that an employee, or machine, is unproductive that is a result of factors that can either be controlled or uncontrolled by management. ... Idle time may have serious implications for employers
###### b)Abnormal idle time arises due to power failure, breakdown of machinery, non supply of raw material in time, delay in previous process, strikes and accidents etc. 
###### c) In cost accounting, the cost of such idle time is included as either direct labor or manufacturing overhead and is part of the total product cost.
          -> MAX DELAY - HSPLIT
###### Maximum time a customer in a queue waits before being connected to an operator or hanging up.
          -> STAFF TIME - HSPLIT 
###### a) It's simply the number of FORECAST CALLS for an hour multiplied by the average handle time of a call. The average handle time (AHT) is made up of two components: actual conversation or talk time plus any after call wrap-up time associated with the call.
          -> TALK TIME - HSPLIT
###### Average Talk Time (ATT) is, quite simply, the amount of time an agent spends talking to customer
          -> TIME 
          -> Type 
          -> SKill ID 

#### Sheet 2 - FACT HAGENT

          -> Short Login 
          -> DIM CALENDAR.DATE.1	
          -> CALLS ABANDONED	
          -> CALLS ANSWERED - AUTO EXPLAINED	
          -> CALLS CONFERENCED 
##### a) A conference call is a telephone call in which someone talks to several people at the same time. The conference call may be designed to allow the called party to participate during the call or set up so that the called party merely listens into the call and cannot speak.
            1)Dial the first call, and wait for the recipient to pick up. ...
            2)Tap the Add Call button.
            3)Dial the second number.
            4)Touch the Merge or Merge Call button.
            5) Repeat steps 2 and 3 to add more callers to the conference.
            
            
          -> CALLS RETURN TO Q DUE TO TIMEOUT	- Auto Explained
          -> CALLS TRANSFERRED	- Auto explained
          -> CONTACT HOLDTIME	- apply Definition above
          -> CONTACT TALK TIME	-apply Definition above
          -> IDLE TIME	- apply Definition above
          -> LOGIN DURATION	 - Mean time for the agent login
          -> POST CALL PROCESSING TIME	
##### a) After-call work includes all tasks which require completion once your contact center’s agents’ phone conversations with customers are finished.
          -> RING TIME - Auto Explained
          -> SCHEDULED TIME
##### a) In simple words, call center agent scheduling includes any and every task to manage agents and their varied chores in such a way that the call center can reach its optimum performance
          -> TOTAL STAFFED TIME
###### a) It's simply the number of forecast calls for an hour multiplied by the average handle time of a call. The average handle time (AHT) is made up of two components: actual conversation or talk time plus any after call wrap-up time associated with the call.
          -> WAIT TIME	
          -> WORKED TIME	
          -> Type	
          -> Skill ID	
          -> LOB
###### Line of business (LOB) is a general term which refers to a product or a set of related products that serve a particular customer transaction or business need.

#### Sheet 3 - FACT SERVREQ
          -> DIM AGENT.LOGIN	
          -> DIM CALENDAR.DATE.1	
          -> Resolution Name	
          -> DIM TIME.HOUR	
          -> DIM TIME.MINUTES	
          -> DIM TIME.TIME ID	
          -> Incidents Created	
          -> Incidents Updated	
          -> Time	
          -> Tier - In the next steps we will run queries to understand these tiers
##### Tier 1: This is the organization's “first line of defense,”. ... Tier 2: When a customer issue is beyond the skill of the Tier 1 staff to resolve, the issue escalates to Tier 2. Tier 2 staff have the knowledge base and skills to handle more complex customer issues and will often use remote control tools

#### Sheet 4 - FACT EMAIL

          -> Agent Login	
          -> Closed Reason	
          -> Avg. Time Allocated	- AHT 
##### a) Average handle time (AHT) is a metric that's commonly used as a key performance indicator (KPI) for call centers. It represents the average length of contact for a customer on a call.
          -> AVG_TIME_ARRIVE_TO_CLOSE	
          -> SLA	
##### a) A service-level agreement (SLA) defines the level of service you expect from a vendor, laying out the metrics by which service is measured, as well as remedies or penalties should agreed-on service levels not be achieved.
          -> COUNT ARRIVAL	
          -> COUNT CLOSED	
          -> COUNT FIRST OPENED	
          -> COUNT OPENED	
          -> TIME SINCE ARRIVED TO CLOSE	
          -> Date	
          -> Emails within SLA
##### a) A service level agreement (SLA) is a part of a contract where the level of service is formally defined. ... It is common to append an SLA to your contract with your email service provider, regardless of whether it's a hosted service or an in-house solution

#### Sheet 5 - CSAT 

    CSAT is short for Customer Satisfaction, which is a commonly-used key performance indicator used to track how satisfied customers are with your organization’s products and/or services
    CSAT is measured by one or more variations of this question that usually appears at the end of a customer feedback survey:

    “How would you rate your overall satisfaction with the [goods/service] you received?”

    Respondents use the following 1 to 5 scale:
1. Very unsatisfied
2. Unsatisfied
3. Neutral
4. Satisfied
5. Very satisfied

Calculating CSAT
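To do this, only the top responses are counted as satisfied. On the generic 1-to-5 scale above these are 4 (satisfied) and 5 (very satisfied); in this project the target formula is #Surveys8to10/#Surveys, so responses of 8 to 10 are the ones included in the calculation, as it has been shown that using the highest values on feedback surveys is the most accurate predictor of customer retention.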

(Number of satisfied customers (8 to 10) / Number of survey responses) x 100 = % of satisfied customers

## 0.5 Loading Data
We have an .xlsb file with 5 tabs to be analyzed.
The `sheet_name` argument makes it possible to extract each tab.
Then we save the tabs to individual CSV files so that each one can be manipulated separately; after this process we can select the ones we need.

In [3]:
# Open the workbook once and parse every tab from the same handle, instead of
# re-opening and re-reading the .xlsb file for each sheet.
with pd.ExcelFile('../data/Case_Study_Data.xlsb', engine='pyxlsb') as xlsb_file:
    # With no sheet name, parse() reads the first sheet of the workbook,
    # which is what read_excel does when sheet_name is omitted.
    df_raw = xlsb_file.parse()

    # Extract Tab 1 - FACT HSPLIT
    df_raw_tab1 = xlsb_file.parse(sheet_name='FACT HSPLIT')
    # Extract Tab 2 - FACT HAGENT
    df_raw_tab2 = xlsb_file.parse(sheet_name='FACT HAGENT')
    # Extract Tab 3 - FACT SERVREQ
    df_raw_tab3 = xlsb_file.parse(sheet_name='FACT SERVREQ')
    # Extract Tab 4 - FACT EMAIL
    df_raw_tab4 = xlsb_file.parse(sheet_name='FACT EMAIL')
    # Extract Tab 5 - CSAT
    df_raw_tab5 = xlsb_file.parse(sheet_name='CSAT')

In [7]:
# TO CSV
# Write the FACT HSPLIT tab (df_raw_tab1) to 'FACT_HSPLIT.csv' (relative path).
# NOTE(review): only tab 1 is exported here; tabs 2-5 are not written by this cell.
df_raw_tab1.to_csv('FACT_HSPLIT.csv')

KeyError: 'FACT HSPLIT'

### 0.5.1 Dealing with Sheets 
At this point we already have the tabs in separate CSVs. To consume less memory, we now read the workbook in a single call and work from one object holding all the sheets.

In [8]:
# Read the five tabs in a single call; the result is indexed by sheet name below.
df_r = pd.read_excel(
    '../data/Case_Study_Data.xlsb',
    engine='pyxlsb',
    sheet_name=['FACT HSPLIT', 'FACT HAGENT', 'FACT SERVREQ', 'FACT EMAIL', 'CSAT'],
)

# Bind each tab to its own name:
# tb1 = FACT HSPLIT, tb2 = FACT HAGENT, tb3 = FACT SERVREQ, tb4 = FACT EMAIL, tb5 = CSAT
df_raw_tb1, df_raw_tb2, df_raw_tb3, df_raw_tb4, df_raw_tb5 = (
    df_r[tab_name]
    for tab_name in ('FACT HSPLIT', 'FACT HAGENT', 'FACT SERVREQ', 'FACT EMAIL', 'CSAT')
)

# 1.0 DATA DESCRIPTION & MANIPULATION


In [None]:
# TODO: for each tab of this dataset,
# split the tabs into separate DataFrames

## 1.1 Renaming Columns

## 1.2 Checking Data Dimensions

## 1.3 Checking Data Types

## 1.4 Checking NaN Values

## 1.5 Filling out NaN Values

## 1.6 Changing Data Types

## 1.7 Descriptive Statistics

### 1.7.1 Numerical Attributes

### 1.7.2 Categorical Attributes

# 2.0 FEATURE ENGINEERING

### 2.0.1 Project Checkpoint

# 2.1 Hypothesis MindMap 

## 2.2 Creating Hypothesis

### New Features