# Load library

In [1]:
### visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import matplotlib as mat
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.models import Range1d, LinearAxis, NumeralTickFormatter, ColumnDataSource, ranges, LabelSet, Legend, SingleIntervalTicker, LinearAxis
from bokeh.layouts import row
from bokeh.charts import Bar
from bokeh.palettes import brewer
py.init_notebook_mode(connected=True)



The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.




In [2]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import re
import jieba
from pandas.io.json import json_normalize
from functools import partial

In [3]:
# other library
import datetime
import calendar

In [4]:
# ignore sys warning
import warnings
import os
import glob
warnings.filterwarnings('ignore')

# Load Questionnaire data from API

有些國外問卷服務網站有提供API讓你獲取問卷資料，因此我們第一步便是透過API取得所需的資訊。

In [5]:
def nps_api_1993(result=None, offset=0):
    """Download the questionnaire responses from the survey API.

    Each response's ``answers`` dict is extended with the ``email``, ``date``
    (taken from the hidden ``first`` field) and ``phone`` values found in the
    response's ``hidden`` section, then collected into ``result``.

    Parameters
    ----------
    result : list of dict, optional
        List to append the rows to (mutated in place). A new list is created
        when omitted.
    offset : int, optional
        Paging offset, advanced by 1000 after every page.

    Returns
    -------
    list of dict
        One dict per response.

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an HTTP error status.
    """
    import requests

    # A fresh list per call: with the former mutable default (``result=[]``)
    # rows from an earlier call were still present on the next call, so the
    # length check below could never be satisfied again.
    if result is None:
        result = []

    # NOTE(review): placeholder URL. ``offset`` is tracked below but is not
    # part of the request - it has to be built into the real URL / query
    # string, otherwise every request returns the same page.
    page = 'your api page'

    while True:
        try:
            # The timeout keeps a stalled connection from hanging the notebook;
            # raise_for_status() turns HTTP error codes into HTTPError.
            r = requests.get(page, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            print('page failed')
            raise

        json_data = r.json()
        responses = json_data['responses']
        for response in responses:
            data = response['answers']
            hidden = response['hidden']
            data['email'] = hidden['email']
            data['date'] = hidden['first']
            data['phone'] = hidden['phone']
            result.append(data)

        total = json_data['stats']['responses']['completed']
        # Stop once everything is collected. Also stop when a page comes back
        # empty: no progress is possible and asking again would loop forever.
        if len(result) >= total or not responses:
            return result
        offset += 1000

接著將取得的 JSON 資料轉換成 DataFrame

In [6]:
def nps_df_1993():
    """Fetch the questionnaire responses and return them as a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``age``, ``email``, ``phone``, ``exp``, ``score``
        and ``comment``, one row per response returned by ``nps_api_1993``.
    """
    import pandas as pd
    try:
        from pandas import json_normalize          # pandas >= 1.0
    except ImportError:
        from pandas.io.json import json_normalize  # older pandas

    ordered = ['date', 'age', 'email', 'phone', 'exp', 'score', 'comment']

    result = nps_api_1993()
    if not result:
        # Nothing downloaded: return an empty frame with the expected schema
        # instead of failing on the column rename below.
        return pd.DataFrame(columns=ordered)

    # Flatten all records in one call; appending one single-row frame per
    # record is quadratic and DataFrame.append no longer exists in pandas 2.
    df = json_normalize(result)

    # The rename below is positional, so pin the column order explicitly
    # rather than relying on how a given pandas version orders dict keys.
    # NOTE(review): assumes the raw field names sort alphabetically into
    # date, email, age, exp, score, phone, comment - confirm against the API.
    df = df[sorted(df.columns)]
    df.columns = ['date', 'email', 'age', 'exp', 'score', 'phone', 'comment']
    return df[ordered]

In [7]:
# Download all responses and build the working DataFrame.
# This call performs the API requests, so it is the slow step of the notebook.
df = nps_df_1993()
#print(df.shape)
#df.head()

# Adjust time format

由於問卷歷經兩次改版，導致時間格式部分有些異常，因此我們在此先做預處理以便後續分析

In [8]:
def dtype(rows):
    """Normalise the ``date`` field of one row to a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    rows : mapping
        A row (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        holding a ``'date'`` entry.

    Returns
    -------
    str
        ``'no_record'`` when the date is missing (``None``/NaN) or equals the
        placeholder ``'xxxxx'``; otherwise the first 10 characters of the
        value, with ``'01'`` appended when only 8 characters are available.
    """
    raw = rows['date']
    # Missing values show up as None or as NaN (a float) depending on how the
    # frame was built; slicing a NaN would raise, so treat both as no record.
    if pd.isnull(raw) or raw == 'xxxxx':
        return 'no_record'

    val = raw[:10]
    # Presumably an older form version stored only 'YYYY-MM-' (8 characters);
    # such values are pinned to the first day of the month.
    if len(val) == 8:
        val = val + '01'
    return val

In [9]:
# Normalise the raw date strings; rows without a usable date become 'no_record'.
df['date'] = df.apply(dtype, axis=1)
# .copy() so the assignments below write to an independent frame instead of
# a slice of the unfiltered one (avoids SettingWithCopyWarning).
df = df.loc[df.date != 'no_record'].copy()
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
# First day of each response's month, computed directly on the datetimes
# rather than by round-tripping through strings.
df['month'] = df['date'] - pd.to_timedelta(df['date'].dt.day - 1, unit='D')

# Assign nps group

根據NPS的算法，我們第一步是將顧客根據評分做分群，這邊我們定義一個函式來執行

In [10]:
def nps_diff(rows):
    """Return the NPS group of one row based on its ``score``.

    The score is converted with ``int``: 9 and above gives ``'pro'``,
    6 and below gives ``'det'``, anything in between gives ``'neu'``.
    """
    rating = int(rows['score'])
    if rating <= 6:
        return 'det'
    if rating >= 9:
        return 'pro'
    return 'neu'

In [11]:
# Label every response as 'pro', 'neu' or 'det' by applying nps_diff row by row.
df['class'] = df.apply(nps_diff, axis=1)

In [12]:
#df.head()

# Calculate nps score

分好族群後，我們根據NPS的公式來計算

In [13]:
# count() only counts non-null 'email' values, so rows without an email are not included.
df_group = df.groupby(['month', 'class'])['email'].count()  # number of respondents in each class, per month

In [14]:
# Turn the (month, class) index levels back into ordinary columns.
df_group = df_group.reset_index()

In [15]:
# Positional rename: the count column (previously 'email') becomes 'num'.
df_group.columns = ['month', 'class', 'num']

In [16]:
# Total number of respondents (non-null emails) per month, across all classes.
df_all = df.groupby(['month'])['email'].count()

In [17]:
# Make 'month' a regular column so it can serve as the merge key.
df_all = df_all.reset_index()

In [18]:
# Attach each month's total to every (month, class) row.
df_gather = pd.merge(left=df_group, right=df_all, how='left', on = ['month'])

In [19]:
# Positional rename: the monthly total (the 'email' column coming from df_all) becomes 'all'.
df_gather.columns = ['month', 'class', 'num', 'all']

In [20]:
df_gather['per'] = df_gather['num'] / df_gather['all']  # share of each class within its month

In [21]:
# Running share of the classes within each month; only used to position the
# count labels on the stacked bars. Selecting 'per' before cumsum() keeps the
# non-numeric 'class' column out of the aggregation.
df_gather['cumper'] = np.round(df_gather.groupby('month')['per'].cumsum(), 2)
# Comparing a datetime column with the string 'NaT' filters nothing reliably;
# test for missing months explicitly.
df_gather = df_gather.loc[df_gather['month'].notnull()]
# Drop the most recent month - presumably the still-incomplete current one.
# Selecting it by value instead of slicing off the last 3 rows stays correct
# when that month does not contain all three classes.
latest_month = df_gather['month'].max()
df_gather = df_gather.loc[df_gather['month'] != latest_month].copy()
df_gather['month'] = df_gather['month'].astype(str)
df_gather.tail()

Unnamed: 0,month,class,num,all,per,cumper
49,2017-08-01,neu,114,259,0.440154,0.81
50,2017-08-01,pro,49,259,0.189189,1.0
51,2017-09-01,det,68,201,0.338308,0.34
52,2017-09-01,neu,93,201,0.462687,0.8
53,2017-09-01,pro,40,201,0.199005,1.0


In [22]:
# Passing values as a list yields two-level column labels ('per', <class>);
# the top level is dropped again in a later cell.
df_spread = pd.pivot_table(df_gather, index=['month'], columns=['class'], values=['per']) # one column per class

In [23]:
# Move 'month' from the index back into a regular column.
df_spread = df_spread.reset_index()

In [24]:
# Drop the first column level left over from the pivot, then name the columns explicitly.
df_spread.columns = df_spread.columns.droplevel()
# Positional rename: requires exactly one month column plus the three class columns.
df_spread.columns = ['month', 'det', 'neu', 'pro']
df_spread['nps'] = np.round((df_spread['pro'] - df_spread['det']) * 100, 2)  # NPS = (share of 'pro' - share of 'det') * 100
df_spread['month'] = df_spread['month'].astype(str)

In [25]:
df_spread.tail()

Unnamed: 0,month,det,neu,pro,nps
13,2017-05-01,0.320794,0.480669,0.198537,-12.23
14,2017-06-01,0.331226,0.46397,0.204804,-12.64
15,2017-07-01,0.29386,0.528509,0.177632,-11.62
16,2017-08-01,0.370656,0.440154,0.189189,-18.15
17,2017-09-01,0.338308,0.462687,0.199005,-13.93


# Show NPS  

這邊我們畫出NPS，方便我們快速判斷上個月的客戶體驗是否變差

In [26]:
output_notebook()

# Label source: one entry per (month, class) bar segment, positioned at the
# segment's cumulative share and showing the number of respondents.
source2 = ColumnDataSource(dict(x=df_gather['month'].tolist(), 
                               y=df_gather['cumper'].tolist(),
                               val=df_gather['num'].tolist()))

# Stacked bars: share of each class per month.
p = Bar(df_gather, label='month', values='per', agg='mean', stack='class',
        title="NPS BY MONTH", legend='top_right', ylabel='percent', color=brewer['Set1'][3])
p.yaxis[0].formatter = NumeralTickFormatter(format="0.0%")

# Secondary axis for the NPS line. Keep the -30..0 window as a baseline but
# widen it when a month falls outside, so no point is drawn off the chart.
nps_values = df_spread['nps'].dropna()
nps_low, nps_high = -30, 0
if len(nps_values):
    nps_low = min(nps_low, float(nps_values.min()) - 5)
    nps_high = max(nps_high, float(nps_values.max()) + 5)
p.extra_y_ranges = {"percent": Range1d(start=nps_low, end=nps_high)}
p.line(x=df_spread['month'] ,y=df_spread['nps'], color="black", y_range_name="percent", line_width=2)
p.circle(df_spread['month'], df_spread['nps'], fill_color="black", y_range_name="percent", size=8)

# Respondent counts written inside the bar segments.
labels = LabelSet(x='x', y='y', text='val', 
                  level='glyph', 
                  x_offset=-10, y_offset=-20, 
                  text_font_size='10pt',
                  text_font_style='bold',
                  text_color='white',
                  source=source2, 
                  render_mode='canvas')
p.add_layout(labels)
p.add_layout(LinearAxis(y_range_name="percent"), 'right')
p.legend.location = "bottom_right"
show(p, notebook_handle=True)

W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Chart(id='d9d39ba2-ff94-42ae-a3b6-f000d257d52d', ...)


![NPS EXAMPLE](NPS.png)