# Functions for survey data

### Package and data import 

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

# Load the Excel workbook into a DataFrame; every later cell works on `df`.
df = pd.read_excel("cleaned_tech_test_data.xlsx")

### Cleaning

##### dropping redundant columns

In [4]:
# Keep only the columns from position 19 onwards; the first 19 columns are dropped.
df = df.iloc[:,19:]

##### making columns binary/categorical

In [5]:
def make_columns_binary(df):
    """Turn every text answer in *df* into the integer 1.

    Each column is run through a regex replace with the pattern '.*', which
    matches any string (including the empty string), so every string cell
    becomes 1. Cells that are not strings (numbers, missing values) are not
    matched by the regex replace and keep their value.

    The columns are reassigned on *df* itself, and the same object is
    returned for convenience.
    """
    for column_name in df.columns:
        flagged = df[column_name].replace('.*', 1, regex=True)
        df[column_name] = flagged

    return df

##### replace NAs with 0

In [6]:
# Replace every missing value in the frame with 0.
df = df.fillna(0)

##### categorising variables

In [7]:
def age_bracket(age):
    """Return the age-group label for *age*.

    *age* is converted with ``int()`` first, so numeric strings are accepted
    and floats are truncated towards zero. The labels are:

        below 30 -> '<30'
        30 to 39 -> '30-39'
        40 to 49 -> '40-49'
        50 to 59 -> '50-59'
        60 and up -> '60+'
    """
    years = int(age)

    # Test from the oldest bracket down so each check needs only a lower bound.
    if years >= 60:
        return '60+'
    if years >= 50:
        return '50-59'
    if years >= 40:
        return '40-49'
    if years >= 30:
        return '30-39'
    return '<30'

# Can be adjusted easily for any variable.

##### cutting a numeric variable into equal-width categorical groups

In [21]:
# Bin the sample values into three equal-width intervals, label them
# low/medium/high, and wrap the result in a one-column DataFrame.
groups = pd.DataFrame(
    pd.cut(
        np.array([1, 7, 5, 4, 6, 3]),
        bins=3,
        labels=["low", "medium", "high"],
    )
)

Unnamed: 0,0
0,low
1,high
2,medium
3,medium
4,high
5,low


### Exploration

##### piping into subgroups

In [None]:
# Count the non-null entries of the first two columns within each group.
# A query or a sort step could be chained in directly after `df` if wanted.
groupings = (
    df
    .iloc[:, :2]              # columns to count: the first two, by position
    .groupby(df.iloc[:, 11])  # group key: column at position 11 (it does not have to be one of the selected
                              # columns; negative positions in iloc count back from the last column)
    .count()
)

groupings

##### Crosstabs and heatmaps for one hot encoded questions

In [None]:
def make_crosstab_percentage(df, groupby_col, independant_cols):
    """Per-group percentage of non-null answers for each examined column.

    Parameters
    ----------
    df : DataFrame holding both the group-by column (looked up through
        ``groupby_col.name``) and every column of *independant_cols*.
    groupby_col : Series used as the grouping key.
    independant_cols : DataFrame whose column names are the columns to examine.

    Returns
    -------
    DataFrame with one row per group. Each examined column is replaced by a
    '<name> (%)' column: its non-null count divided by the count in the
    first column of the grouped table (the group-by column is selected
    first), times 100. That first count column is left in the result.

    NOTE(review): ``count()`` tallies non-null cells, so unanswered options
    need to be NaN rather than 0 for the percentages to be meaningful —
    confirm against how the input was cleaned.
    """
    question_names = list(independant_cols.columns)
    selected = df[[groupby_col.name] + question_names]
    counts = selected.groupby(groupby_col).count()

    # The denominator is always the first column of the counted table.
    group_sizes = counts.iloc[:, 0]

    for name in question_names:
        counts['{} (%)'.format(name)] = counts[name] / group_sizes * 100
        counts = counts.drop(columns=[name])

    return counts

# Plot crosstabs.
# Inputs are the dataframe, the column you want to group by, and a DataFrame of the columns you want to examine.
# This works specifically on one-hot encoded columns.

# Percentage crosstab of the `yearly_changes` columns, grouped by the column
# at position 5 of `main_demographics`.
# NOTE(review): `main_demographics` and `yearly_changes` are not defined in the
# visible cells — presumably column subsets of `df`; confirm.
crosstab = make_crosstab_percentage(df, main_demographics.iloc[:,5], yearly_changes)

# Drop the first column (the group-by column's own count, not a percentage)
# so that only the percentage columns end up in the heatmap.
crosstab = crosstab.iloc[:,1:]

# NOTE(review): `plt` is not imported in the visible cells — presumably
# matplotlib.pyplot; confirm it is in scope before running.
fig, ax = plt.subplots(figsize=(10,10)) 
sns.heatmap(crosstab, cmap="Reds", annot=True, cbar=False, fmt='g')
ax.set_title("Question-title")

In [None]:
##### Crosstabs and heatmaps for multi choice questions

In [12]:
# This function takes a variable (var), crosstabs it against a survey question (question) and outputs a heatmap.
# Works specifically on multiple-choice questions where all answers are in one column.

def heatmap_multi_choice(var, question, question_title):
    """Draw a heatmap of *question* answers broken down by *var*.

    *var* and *question* are cross-tabulated with row normalisation, so each
    row (one value of *var*) adds up to 100 before rounding. The values are
    percentages rounded to two decimals.

    Parameters
    ----------
    var : values forming the rows of the crosstab.
    question : values forming the columns of the crosstab.
    question_title : title placed on the axes.

    Returns
    -------
    (ax, crosstab) : the axes created for the plot and the rounded percentage
        table that was plotted.

    NOTE(review): relies on `plt` being in scope — presumably
    matplotlib.pyplot; it is not imported in the visible cells.
    """
    row_shares = pd.crosstab(var, question, normalize="index")
    crosstab = (row_shares * 100).round(2)  # percentages

    # A fresh 10x10 figure; the heatmap is drawn on its (current) axes.
    _, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(crosstab, annot=True, fmt='g', cmap="Reds", cbar=False)
    ax.set_title(question_title)

    return ax, crosstab

heatmap_multi_choice(var, question, "Title")[0] #[0] returns the heatmap axes, [1] returns the rounded percentage crosstab.
