<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/quarterly_survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survey data preparation

## Importing libraries

In [None]:
# Upgrading Plotly to the latest available release (no version is pinned).
# NOTE(review): presumably needed so the `text_auto` argument used by the px charts below is available - confirm.
!pip install plotly --upgrade

In [None]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Importing visualization libraries
import plotly.express as px
import seaborn as sns
from ipywidgets import interact

## Loading Data

In [None]:
# Loading data from local drive
# `files.upload()` comes from google.colab, so this cell only runs inside Colab.
# The returned object is indexed by file name in the next cell (uploaded1['2022_q3_data.xlsx']).
from google.colab import files
uploaded1 = files.upload()

In [None]:
# Wrapping the uploaded workbook bytes in a file-like buffer and reading it into a pandas dataframe
import io
df1 = pd.read_excel(
    io.BytesIO(uploaded1['2022_q3_data.xlsx'])
)

In [None]:
# Changing column names to lower case (df1 is modified in place)
df1.columns = [name.lower() for name in df1.columns]

In [None]:
# Removing load/questionnaire bookkeeping columns that the analysis does not use (in place)
df1.drop(
    columns=[
        'questionnairename',
        'questionnaireid',
        'load_process',
        'load_dts',
    ],
    inplace=True,
)

In [None]:
# Trimming leading/trailing white space from the q4 text answers
df1['q4'] = df1['q4'].str.strip()

In [None]:
# Summarising column dtypes and non-null counts after the initial tidy-up
df1.info()

In [None]:
# Previewing the last rows, skipping the first five columns
df1.iloc[:,5:].tail()

In [None]:
# Asserting the loaded data has the correct columns
def check_columns():
  """Assert that the global df1 has exactly the expected survey columns, in this order.

  Raises AssertionError listing the expected columns when they differ;
  prints a confirmation message otherwise.
  """
  expected = [
      'answer_form_id', 'is_anonymous', 'cust_domain_id', 'mh_customer_id', 'mh_context',
      'channel_name', 'broadcast_id', 'completed', 'answered_timestamp',
      'Q1', 'Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9',
      'Q3', 'Q4_A1', 'Q4_A2', 'Q5',
  ]
  actual = df1.columns.to_list()
  assert actual == expected, f'Columns should be: {expected}'
  print('The file contains all correct columns')

#check_columns() #=> Columns subject to change due to migration

## Data cleaning

In [None]:
# Creating function to clean up the dataframe
def df1_clean_up(df):
  """Return a cleaned copy of the survey dataframe with calendar columns added.

  The input frame is not modified. Steps, in order:
    1. make sure 'answereddatetime' is a datetime column and sort rows by it,
    2. derive year, quarter ('Q1'..'Q4'), month, month_name, weeknum (ISO week),
       day (weekday name), hour and period ('<year>-<quarter>'),
    3. repair UTF-8 text that was decoded with a single-byte code page
       (e.g. 'Ã¥' -> 'å') in every text cell,
    4. turn the 'X' marks of the tick-box columns into the number 1.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'answereddatetime', 'q2_a1'..'q2_a9' and 'q4'.

  Returns
  -------
  pandas.DataFrame
      New frame, re-indexed 0..n-1 in chronological order.
  """
  # `.dt` below needs a datetime column; to_datetime leaves an existing datetime column unchanged
  df = df.assign(answereddatetime = pd.to_datetime(df['answereddatetime']))
  # Sorting chronologically
  df = df.sort_values(by='answereddatetime').reset_index(drop=True)
  # Assigning categories from datetime
  stamp = df['answereddatetime']
  quarter_dict = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}
  df = df.assign(year = stamp.dt.year,
                 # Mapping here (instead of a chained in-place replace on df['quarter'])
                 # keeps working when pandas Copy-on-Write is enabled
                 quarter = stamp.dt.quarter.map(quarter_dict),
                 month = stamp.dt.month,
                 month_name = stamp.dt.month_name(),
                 weeknum = stamp.dt.isocalendar().week,
                 day = stamp.dt.day_name(),
                 hour = stamp.dt.hour
                 )
  # Creating year-quarter category
  df = df.assign(period = df['year'].astype(str) + "-" + df['quarter'])
  # Repairing mis-decoded Norwegian letters. Each pattern is the two-character
  # sequence a UTF-8 letter turns into when read as cp1252 / latin-1; the
  # character classes cover both code pages for the upper-case letters.
  mojibake = [(r'Ã¥', 'å'), (r'Ã¦', 'æ'), (r'Ã¸', 'ø'),
              (r'Ã[…\x85]', 'Å'), (r'Ã[†\x86]', 'Æ'), (r'Ã[˜\x98]', 'Ø')]
  for pattern, letter in mojibake:
    df = df.replace(to_replace=pattern, value=letter, regex=True)
  # Replacing X values with 1
  x_cols = ['q2_a1', 'q2_a2', 'q2_a3', 'q2_a4', 'q2_a5', 'q2_a6', 'q2_a7', 'q2_a8', 'q2_a9', 'q4']
  df[x_cols] = df[x_cols].replace('X', value=1)
  return df

In [None]:
# Cleaning up the dataframe
# df1 is rebound to the frame returned by df1_clean_up (rows sorted by answer time, calendar/period columns added)
df1 = df1_clean_up(df1)

In [None]:
# Reshaping to long format: one row per (respondent, question) pair that has an answer
melt1_cols = ['answerformid', 'domainkey', 'answereddatetime', 'year', 'quarter',
              'period', 'month', 'month_name', 'weeknum', 'day']
melt2_cols = ['q1', 'q2_a1', 'q2_a2', 'q2_a3', 'q2_a4', 'q2_a5', 'q2_a6',
              'q2_a7', 'q2_a8', 'q2_a9', 'q3', 'q4', 'q5']
df1_melt = (
    pd.melt(df1, id_vars=melt1_cols, value_vars=melt2_cols,
            var_name='question', value_name='answer')
    .dropna(subset=['answer'])
)

In [None]:
# Splitting the long table by question group:
#   df1_a -> q2_a1..q2_a9, df1_b -> q4, df1_c -> q1, q3 and q5
df1_a = df1_melt.loc[
    df1_melt['question'].isin([f'q2_a{n}' for n in range(1, 10)])
].reset_index(drop=True)
df1_b = df1_melt.loc[df1_melt['question'].isin(['q4'])].reset_index(drop=True)
df1_c = df1_melt.loc[df1_melt['question'].isin(['q1', 'q3', 'q5'])].reset_index(drop=True)

In [None]:
# Creating summary table for multiple choice questions
def summary_table_1a(df):
  """Summarise the multiple-choice answers as a question x period table.

  Parameters
  ----------
  df : pandas.DataFrame
      Long-format answers with 'question', 'period' and 'answer' columns.

  Returns
  -------
  pandas.DataFrame
      One row per question, a 'question' column holding the readable label
      (codes without a label are left as they are) and one column per period
      holding the sum of 'answer'.
  """
  dict_1a = {'q2_a1':'Bedre_tilbud', 'q2_a2':'Tvangsavvikling', 'q2_a3':'Privat_abo', 'q2_a4':'Talekvalitet/Dekning',
             'q2_a5':'Surfekvalitet', 'q2_a6':'Streamingkvalitet', 'q2_a7':'Utlandstjenesten', 'q2_a8':'Kundeservice', 'q2_a9':'Annet'}
  pv = df.pivot_table(index='question', columns=['period'], values='answer', aggfunc='sum').reset_index()
  # Assigning the result back: an in-place replace on the selected column is a
  # chained operation and does not reach `pv` when Copy-on-Write is active
  pv['question'] = pv['question'].replace(to_replace=dict_1a)
  return pv
  
# Building the multiple-choice summary from the q2 answers and displaying it
tb_1a = summary_table_1a(df1_a)
tb_1a

In [None]:
# Creating summary table for true/false question
def summary_table_1b(df):
  """Count how often each answer text occurs per period.

  Expects long-format rows with 'answer', 'period' and 'question' columns.
  The answer text ends up in a column named 'question' so the result has the
  same layout as the multiple-choice summary table.
  """
  counts = df.pivot_table(index='answer', columns=['period'], values='question', aggfunc='count')
  counts = counts.reset_index()
  return counts.rename(columns={'answer': 'question'})

# Building the q4 answer-count table and displaying it
tb_1b = summary_table_1b(df1_b)
tb_1b

In [None]:
# Creating function to compute totals for unique survey IDs in the period
def compute_total_by_quarter(df):
  """Count distinct 'answerformid' values per period.

  Returns a one-row dataframe: the row is labelled 'total' and there is one
  column per period, in order of first appearance in `df`.
  """
  counts = {}
  for label in df['period'].unique().tolist():
    ids_in_period = df.loc[df['period'] == label, 'answerformid']
    # dropna=False so a missing id counts as one value, like len(unique())
    counts[label] = ids_in_period.nunique(dropna=False)
  totals = pd.DataFrame(list(counts.items()), columns=['period', 'total'])
  return totals.set_index('period').T

# Respondent totals per period, computed from the q2 rows (df1_a), then displayed
tb_1t = compute_total_by_quarter(df1_a)
tb_1t

In [None]:
# Creating melted table for true/false question
def melt_table_1t(df):
  """Transpose the table and move the resulting index into a regular column."""
  transposed = df.T
  return transposed.reset_index()
  
# Long version of the period totals (one row per period); used as the denominator table when computing percentages
tb_1t_m = melt_table_1t(tb_1t)
tb_1t_m.tail()

In [None]:
# Creating melted table with totals for multiple choice questions
def melt_table_w_totals(df, totals=None):
  """Melt a question x period summary table and attach per-period percentages.

  Parameters
  ----------
  df : pandas.DataFrame
      Wide table with a 'question' column and one column per period.
  totals : pandas.DataFrame, optional
      Table with 'period' and 'total' columns. Defaults to the notebook-level
      `tb_1t_m`, which keeps existing one-argument calls working.

  Returns
  -------
  pandas.DataFrame
      Columns: question, period, answer, total, perc (answer / total, rounded
      to two decimals). Periods missing from `totals` are dropped (inner join).
  """
  if totals is None:
    totals = tb_1t_m
  period_cols = [col for col in df.columns if col != 'question']
  # Naming the variable column explicitly instead of relying on df.columns.name
  # happening to be 'period'
  df_m = pd.melt(df, id_vars='question', value_vars=period_cols, var_name='period', value_name='answer')
  df_m = pd.merge(df_m, totals, how='inner', on='period')
  df_m = df_m.assign(perc = round(df_m['answer']/df_m['total'], 2))
  return df_m
  
# Long multiple-choice table with percentages, plus a 'dekning' column that is
# 'Dekning' for the three quality/coverage labels listed below and 'Annet' otherwise
tb_1a_m = melt_table_w_totals(tb_1a)
tb_1a_m = tb_1a_m.assign(
    dekning = np.where(
        tb_1a_m['question'].isin(['Talekvalitet/Dekning', 'Surfekvalitet', 'Streamingkvalitet']),
        'Dekning',
        'Annet',
    )
)
tb_1a_m.tail()

In [None]:
# Creating melted table with totals for true/false question
# NOTE(review): the percentages use the per-period totals in tb_1t_m, which were
# computed from df1_a (the q2 answers), not from df1_b - confirm this is intended.
tb_1b_m = melt_table_w_totals(tb_1b)
tb_1b_m.tail()

## Free text parsing

In [None]:
# Listing stop words
# Words in this list are removed by parse_free_text before the remaining words are categorised.
# The empty-string entry removes empty tokens, if any, left over from splitting the text.
stop_words = ['annen','jeg','på','og','ikke','å','har','til','det','i','som','med','dere','for','fra','er','en','var','at','ice','om','ble',
              'vi','så','sa','hos','et','meg','da','de','fikk','enn','av','kan','men','mye','når','bare','etter','våre', 'ved','hatt','',
              'kunne', 'hadde', 'andre', 'også', 'sett', 'før', 'over', 'den','der', 'deres', 'disse', 'seg', 'noe']

In [None]:
# Stemming words of interest to single root
def word_stemming(df, period, question, top):
  """Placeholder for stemming the free-text answers of Q1, Q3 and Q5.

  Currently it only selects the answers for the given question and period and
  returns None; `top` is not used.
  """
  matches_question = df['question'] == question
  matches_period = df['period'] == period
  tx_list = df[matches_question & matches_period]['answer']
  return None

In [None]:
# Creating function to parse free text for analysis
def parse_free_text(df, period, question, top, stopwords=None):
  """Split the free-text answers of one question/period into categorised words.

  Parameters
  ----------
  df : pandas.DataFrame
      Long-format answers with 'question', 'period' and 'answer' columns.
  period : str
      Period label to select, e.g. '2022-Q3'.
  question : str
      Question code to select, e.g. 'q1'.
  top : int
      Currently unused (no truncation of the frequency table is applied);
      kept so existing calls keep working.
  stopwords : list of str, optional
      Words to discard. Defaults to the notebook-level `stop_words` list.

  Returns
  -------
  (tx_col, tx_top) : tuple of pandas.DataFrame
      tx_col has one row per remaining word with its 'category';
      tx_top has one row per (word, category) with 'period', 'counter' and
      'perc' (share of all remaining words, rounded to two decimals).
  """
  if stopwords is None:
    stopwords = stop_words
  answers = df[(df['question']==question) & (df['period']==period)]['answer']
  cleaned = answers.str.lower().replace(r'[!.,"]', '', regex=True)
  # Joining with a space keeps the last word of one answer apart from the first
  # word of the next; split() without arguments also copes with repeated spaces
  words = cleaned.str.cat(sep=' ').split()
  tx_list = pd.DataFrame(data=words, columns=['word'])
  tx_list = tx_list[~tx_list['word'].isin(stopwords)]
  # A word gets the category of the first fragment (in this order) it contains
  fragments_by_category = {
      'dekning': ['dek', 'data', 'nett', 'signal', 'samtale', 'ring', 'gb'],
      'pris/tilbud': ['pris', 'tilbud', 'kr', 'billig', 'nok', 'fri', 'betale'],
      'kundeservice': ['kunde', 'servi'],
  }
  cond_list, choice_list = [], []
  for category, fragments in fragments_by_category.items():
    for fragment in fragments:
      cond_list.append(tx_list['word'].str.contains(fragment))
      choice_list.append(category)
  tx_col = tx_list.assign(category = np.select(cond_list, choice_list, 'annet')).reset_index(drop=True)
  # reset_index(name=...) names the count column regardless of the name pandas
  # gives the value_counts result
  tx_top = tx_col.value_counts().reset_index(name='counter')
  tx_top = tx_top.assign(perc = round(tx_top['counter']/tx_top['counter'].sum(), 2))
  tx_top.insert(0, 'period', period)
  return (tx_col, tx_top)

In [None]:
# Testing new text parsing algorithm
import collections
#x = df1_c[(df1_c['question']=='Q5') & (df1_c['period']=='2022-Q2')][['mh_context','period','answer']].reset_index(drop=True).copy()

#dek_list = ['dekningen','dekning?','data','datapakker','nett','nettverk','nettet','signal','samtale','mobildekning','ring','ringe','gb','områder','hastighet','bredbånd',
#            'kvalitet','telenordekning','telenor-nett']
#dek_dict = {f'({i})': 'dekning' for i in dek_list}

#pris_list = ['betale', 'billig', 'billigere', 'datamengde', 'datamengder', 'datavolum', 'dyr', 'dyrt', 'ekstra', 'fri', 'gebyr', 'kr', 'lavere','mobildata','nok',
#             'pakke', 'pakken', 'pris','priser', 'tilbud', 'tilbudet', 'totalpakken', 'totalpris']
#pris_dict = {f'({p})': 'pris_tilbud' for p in pris_list}

#x = x.assign(answer = x['answer'].str.lower().str.strip())
#x = x.assign(answer = x['answer'].replace(r'[!.,"?]', '', regex=True))
#x = x.assign(answer = x['answer'].replace(dek_dict, regex=True))
#x = x.assign(answer = x['answer'].replace(pris_dict, regex=True))
#x.head(15)

#=> First replace every word by its root, then clean, split and Counter

In [None]:
# Parsing free text from question 1 for period 2022-Q3
# df1_d: one row per word with its category; df1_e: word frequencies.
# The last argument (`top`) is accepted by parse_free_text but currently has no effect.
(df1_d, df1_e) = parse_free_text(df1_c, '2022-Q3', 'q1', 20)

In [None]:
# Text dataframe from question 1 in long format (one row per word) for word cloud
df1_d.tail()

In [None]:
# Text dataframe from question 1 aggregated by frequency (one row per word/category pair)
df1_e.head()

In [None]:
# Parsing free text from question Q5 for period 2022-Q3
# The last argument (`top`) is accepted by parse_free_text but currently has no effect.
(df1_f, df1_g) = parse_free_text(df1_c, '2022-Q3', 'q5', 1)

In [None]:
# Text dataframe from Q5 in long format (one row per word) for word cloud
df1_f.tail()

In [None]:
# Text dataframe from Q5 aggregated by frequency (second value returned by parse_free_text)
df1_g

## Exporting results to local drive

In [None]:
# Exporting files to excel
# Every table goes to its own sheet of one workbook; row indexes are not written.
with pd.ExcelWriter('2022_q3_cleaned_survey.xlsx', engine='openpyxl') as writer:
  df1.to_excel(writer, sheet_name='00_clean_survey', index=False)
  df1_a.to_excel(writer, sheet_name='01_num_data_q2', index=False)
  tb_1a.to_excel(writer, sheet_name='02_table_q2', index=False)
  df1_b.to_excel(writer, sheet_name='03_num_data_q4', index=False)
  tb_1b.to_excel(writer, sheet_name='04_table_q4', index=False)
  # index=False means the 'total' row label of tb_1t is not written
  tb_1t.to_excel(writer, sheet_name='05_period_totals', index=False)
  df1_c.to_excel(writer, sheet_name='06_text_data', index=False)
  df1_d.to_excel(writer, sheet_name='07_text_ind', index=False)
  # NOTE(review): df1_e was parsed from question 'q1' above, yet the sheet is named '08_q3' - confirm the name
  df1_e.to_excel(writer, sheet_name='08_q3', index=False)
  # NOTE(review): df1_f (the per-word table for q5) is not exported - confirm that is intended
  df1_g.to_excel(writer, sheet_name='09_q5', index=False)
  
# Sending the workbook written above to the browser (uses google.colab's `files`, imported in the loading section)
files.download('2022_q3_cleaned_survey.xlsx')

## Data Visualization

In [None]:
# Lists of distinct years and periods found in df1
# period_list feeds the `Period` selector of the @interact plots below; year_list is not used in the cells shown here
year_list = df1['year'].unique().tolist()
period_list = df1['period'].unique().tolist()

In [None]:
# Plotting figure 1
@interact(Period = period_list)
def plot_fig_1(Period):
  """Horizontal bar chart of the multiple-choice answers for one period.

  Each bar is the value in tb_1a for `Period` divided by that period's total
  in tb_1t, rounded to two decimals and sorted from largest to smallest.
  """
  # tb_1t has a single row; read it by position. Plain `[0]` on a label-indexed
  # Series relies on a positional fallback that pandas has deprecated.
  total = tb_1t[Period].iloc[0]
  df = tb_1a[['question', Period]].copy()
  df = df.assign(perc = round(df[Period]/total, 2))
  df = df.sort_values(by='perc', ascending=False)
  fig_1 = px.bar(df, x='perc', y='question', title=f'{Period}',
                       text_auto='1%', width=400, height=500)
  fig_1.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, yaxis=dict(autorange="reversed"), title_x=0.5, xaxis_tickformat='1%')
  fig_1.update_traces(marker_color='#ecc541')
  fig_1.show()

In [None]:
# Plotting figure 2
@interact(Period = period_list)
def plot_fig_2(Period):
  """Stacked bar of the q4 answer shares for one period.

  Each segment is the count in tb_1b for `Period` divided by that period's
  total in tb_1t, rounded to two decimals.
  """
  # tb_1t has a single row; read it by position. Plain `[0]` on a label-indexed
  # Series relies on a positional fallback that pandas has deprecated.
  total = tb_1t[Period].iloc[0]
  df = tb_1b[['question', Period]].copy()
  df = df.assign(perc = round(df[Period]/total, 2),
                 kontaktmodus = Period)
  df = df.sort_values(by='perc', ascending=False)
  colors_fig_2 = {'Vi ble kontaktet av en selger': '#ecc541', 'Vi tok initiativ selv': '#929291'}
  fig_2 = px.bar(df, x='kontaktmodus', y='perc', color='question', title=f'{Period}', barmode='stack',
                       text_auto='1%', width=400, height=500, color_discrete_map=colors_fig_2)
  fig_2.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_2.show()

In [None]:
# Plotting figure 3
def plot_fig_3():
  """Bar chart of `perc` per period from tb_1a_m, coloured by question label."""
  # Order in which the question labels are stacked / listed in the legend
  label_order = ['Talekvalitet/Dekning', 'Bedre_tilbud', 'Surfekvalitet', 'Annet', 'Kundeservice',
                 'Streamingkvalitet', 'Utlandstjenesten', 'Privat_abo', 'Tvangsavvikling']
  # Palette kept for reference only: it is not passed to px.bar below
  colors_fig_3 = {'Dekning':'#ecc541', 'Bedre_tilbud':'#929291', 'Data/surfekvalitet':'#0080ff', 'Annet':'#be3e84',
                  'Kundeservice':'#987d27', 'Utlandstjenesten':'#444e60', 'Privat_abo':'#00ffff',
                  'Tvangsavvikling':'#1c1915'}
  bar_options = dict(x='period', y='perc', color='question',
                     category_orders={'question': label_order},
                     text_auto='1%', title='Avgangsundersøkelse over tid',
                     width=800, height=500)
  fig_3 = px.bar(tb_1a_m, **bar_options)
  fig_3.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_3.show()

plot_fig_3()

In [None]:
# Plotting figure 4
@interact(Period = period_list)
def plot_fig_4(Period):
  """Stacked bar of summed `perc` for 'Dekning' versus 'Annet' in the selected period."""
  palette = {'Dekning': '#ecc541', 'Annet': '#929291'}
  in_period = tb_1a_m['period'] == Period
  picked = tb_1a_m.loc[in_period, ['period', 'dekning', 'perc']].copy()
  grouped = (
      picked
      .groupby(by=['period', 'dekning'])
      .agg(perc_total = ('perc', 'sum'))
      .reset_index()
  )
  fig_4 = px.bar(grouped, x='period', y='perc_total', color='dekning', title=f'{Period}',
                 barmode='stack', text_auto='1%', width=400, height=500,
                 color_discrete_map=palette)
  fig_4.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_4.show()

In [None]:
# Plotting figure 5
def plot_fig_5():
  """Bar chart of the q4 answer shares (`perc` in tb_1b_m) per period."""
  palette = {
      'Vi ble kontaktet av en selger': '#ecc541',
      'Vi tok initiativ selv': '#929291',
  }
  fig_5 = px.bar(
      tb_1b_m, x='period', y='perc', color='question',
      title='Kontakttype over tid', text_auto='1%',
      width=800, height=500, color_discrete_map=palette,
  )
  fig_5.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_5.show()

plot_fig_5()

In [None]:
# Plotting figure 6
def plot_fig_6():
  """Grouped bars of summed `perc` for 'Dekning' versus 'Annet', one group per period."""
  palette = {'Dekning': '#ecc541', 'Annet': '#929291'}
  grouped = (
      tb_1a_m.copy()
      .groupby(by=['period', 'dekning'])
      .agg(dekning_samlet = ('perc', 'sum'))
      .reset_index()
  )
  fig_6 = px.bar(
      grouped, x='period', y='dekning_samlet', color='dekning',
      title='Dekning kontra andre årsaker over tid', barmode='group',
      text_auto='1%', width=800, height=500, color_discrete_map=palette,
  )
  fig_6.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_6.show()

plot_fig_6()

In [None]:
# Plotting figure 7
def plot_fig_7():
  """Histogram (18 bins) of the number of responses over answer time, from df1."""
  per_timestamp = (
      df1[['year', 'answereddatetime', 'month']].copy()
      .groupby(by=['answereddatetime'])
      .agg(responses = ('year', 'count'))
      .reset_index()
      .rename(columns={'answereddatetime': 'date'})
  )
  fig_7 = px.histogram(
      per_timestamp, x='date', y='responses', text_auto=True, nbins=18,
      title='Antall svar over tid', width=800, height=500,
  )
  fig_7.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_7.update_traces(marker_color='#ecc541')
  fig_7.show()

plot_fig_7()

In [None]:
# Plotting figure 8
@interact(Period = period_list)
def plot_fig_8(Period):
  """Histogram of the number of responses per weekday for the selected period."""
  weekday_order = {'day': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']}
  picked = df1.loc[df1['period']==Period, ['year','period','day']].copy()
  per_day = picked.groupby(by=['year', 'day']).agg(responses = ('year', 'count')).reset_index()
  # Shortening the weekday names to three letters so they match weekday_order
  per_day['day'] = per_day['day'].str[:3]
  fig_8 = px.histogram(
      per_day, x='day', y='responses', text_auto=True, category_orders=weekday_order,
      title=f'{Period} - Svarer etter ukedager', width=400, height=500,
  )
  fig_8.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_8.update_traces(marker_color='#ecc541')
  fig_8.show()

In [None]:
# Plotting figure 9
@interact(Period = period_list)
def plot_fig_9(Period):
  """Histogram (12 bins) of the share of responses per hour of day for the selected period."""
  picked = df1.loc[df1['period']==Period, ['year', 'period', 'hour']].copy()
  per_hour = picked.groupby(by=['year', 'hour']).agg(responses = ('year', 'count')).reset_index()
  all_responses = per_hour['responses'].sum()
  per_hour = per_hour.assign(perc = round(per_hour['responses']/all_responses, 3))
  fig_9 = px.histogram(
      per_hour, x='hour', y='perc', text_auto='1%', nbins=12,
      title=f'{Period} - Svarer etter timer', width=600, height=500,
  )
  fig_9.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_9.update_traces(marker_color='#ecc541')
  fig_9.update_yaxes(tickformat='1%')
  fig_9.show()

In [None]:
# Plotting figure 10
def plot_fig_10():
  """Bar chart of the summed word share (`perc`) per category in df1_e."""
  palette = {'dekning':'#ecc541', 'pris/tilbud':'#b2b2b2', 'kundeservice':'#ff6973', 'annet':'#ff8700'}
  # The period label for the title is read from the row labelled 0 of df1_e
  period = df1_e.loc[0, 'period']
  per_category = (
      df1_e
      .groupby(by=['category'])
      .agg(perc_total = ('perc', 'sum'))
      .reset_index()
      .copy()
  )
  fig_10 = px.bar(
      per_category, x='category', y='perc_total', color='category',
      title=f'{period} Hva gjorde at dere valgte å forlate oss akkurat nå?',
      text_auto='1%', width=400, height=500, color_discrete_map=palette,
  )
  fig_10.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=12, title_x=0.5, yaxis_tickformat='1%')
  fig_10.show()

plot_fig_10()

In [None]:
# Plotting figure 11
# Hand-entered category shares for figure 11 (not computed in this notebook)
x_11 = ['dekning', 'pris/tilbud', 'kundeservice', 'annet']
y_11 = [0.64,0.21,0.04,0.10]
df_11 = pd.DataFrame({'category': x_11, 'perc': y_11})

def plot_fig_11():
  """Bar chart of the hard-coded category shares in df_11."""
  palette = {'dekning':'#ecc541', 'pris/tilbud':'#b2b2b2', 'kundeservice':'#ff6973', 'annet':'#ff8700'}
  # Alternative title kept from the original cell:
  #   '2022-Q3 Hva gjorde at dere valgte den nye operatøren?'
  fig_11 = px.bar(
      df_11, x='category', y='perc', color='category',
      title='2022-Q3 Hva gjorde at dere valgte å forlate oss akkurat nå?',
      text_auto='1%', width=400, height=500, color_discrete_map=palette,
  )
  fig_11.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=12, title_x=0.5, yaxis_tickformat='1%')
  fig_11.show()

plot_fig_11()

In [None]:
# Plotting figure 18
def plot_fig_18():
  """Small-multiple area charts of `perc` over periods, one facet per question label (from tb_1a_m)."""
  # Facet order. The labels must match the ones produced by summary_table_1a;
  # the previous list still used older names ('Dekning', 'Data/surfekvalitet')
  # that no longer occur in tb_1a_m, so the same order as plot_fig_3 is used.
  cat_ord_18 = {'question': ['Talekvalitet/Dekning', 'Bedre_tilbud', 'Surfekvalitet', 'Annet', 'Kundeservice',
                             'Streamingkvalitet', 'Utlandstjenesten', 'Privat_abo', 'Tvangsavvikling']}
  fig_18 = px.area(tb_1a_m, x='period', y='perc', facet_col='question', color='question', facet_col_wrap=2, category_orders=cat_ord_18,
                  text='perc', facet_col_spacing=0.1, title=f'Trend etter årsak', width=900, height=500)
  fig_18.update_layout(title_font_size=16, title_x=0.5)
  fig_18.update_traces(textfont_size=10, textposition='top left', texttemplate='%{y:1%}')
  fig_18.update_yaxes(tickformat='1%', tickfont = dict(size=10))
  fig_18.update_xaxes(tickfont = dict(size=10))
  fig_18.show()

plot_fig_18()