<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/correlation_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survey Correlation Matrix

## Importing libraries

In [None]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Data

In [None]:
# Loading data from local drive
# The result of `files.upload()` is indexed by filename in the next cell, so the
# uploaded workbook has to keep the exact name that cell expects.
from google.colab import files
uploaded1 = files.upload()

In [None]:
# Parse the uploaded B2B workbook into a pandas DataFrame
import io

# Raw bytes of the upload, looked up by its exact filename
b2b_workbook = uploaded1['ICE_raw_data_B2B_label_0.1_af.xlsx']
# skiprows=1 drops the first spreadsheet row, so the header is read from the second one
df1 = pd.read_excel(io.BytesIO(b2b_workbook), skiprows=1)

In [None]:
# Checking the dataframe: preview the first rows to confirm the workbook was parsed as expected
df1.head()

## Data Cleaning

In [None]:
# Set first column as index.
# Guarded so the cell can be re-run safely: once 'Survey subject ID' is the index
# it is no longer a column, and an unconditional set_index would raise KeyError.
# A genuinely missing column (e.g. wrong workbook) still raises on the first run.
if df1.index.name != 'Survey subject ID':
  # Rebind instead of inplace=True so the step is an explicit assignment.
  df1 = df1.set_index('Survey subject ID')

In [None]:
# Replace every missing value with zero (0), then display how many nulls remain
# NOTE(review): the zeros are treated as real answers by the correlation computed
# later on df1 -- confirm that this is the intended handling of unanswered questions.
df1 = df1.fillna(value=0)
int(df1.isnull().sum().sum())

0

In [None]:
# Build a lookup table from short codes (q1..qN) to the original question texts.
# Has to run while df1.columns still holds the question texts, i.e. before the
# columns are replaced by the short codes.
question_texts = df1.columns.to_list()
df1_col = pd.DataFrame(question_texts, columns=['Questions'],
                       index=[f"q{pos}" for pos in range(1, len(question_texts) + 1)])
df1_col

In [None]:
# Swap the long question texts for the short codes q1..qN (same order as df1_col's index)
df1.columns = [f"q{pos}" for pos in range(1, df1.shape[1] + 1)]
df1.columns

## Correlation Matrix generation

In [None]:
# Calculate correlation matrix between variables, rounded to 2 decimals.
# numeric_only=True keeps the computation to numeric columns. Older pandas dropped
# non-numeric columns silently, whereas pandas >= 2.0 raises on them by default,
# so stating it explicitly gives the same matrix on both.
# DataFrame.round already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
corr_mx1 = df1.corr(numeric_only=True).round(2)
corr_mx1

In [None]:
# Canvas for the heatmap: a single axes on a (20, 20) figure
fig1, ax1 = plt.subplots(figsize=(20, 20))

# Boolean mask that is True on and above the diagonal, so only the lower
# triangle of the symmetric correlation matrix is drawn
mask1 = np.triu(np.ones(corr_mx1.shape, dtype=bool))

# Diverging colormap running between hues 220 and 20
cmap1 = sns.diverging_palette(220, 20, as_cmap=True)

# Draw the masked heatmap on ax1 with the colour scale pinned to [-1, 1]
sns.heatmap(
    data=corr_mx1,
    mask=mask1,
    cmap=cmap1,
    vmin=-1.0,
    vmax=1.0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax1,
)
plt.show()

## Exporting results to local drive

In [None]:
# Write the question lookup and the correlation matrix to one workbook (one sheet
# each, index included), then trigger the download.
# `files` is the google.colab module imported in the data-loading cell.
with pd.ExcelWriter('b2b_corr_mx.xlsx', engine='openpyxl') as writer:
  for sheet_name, frame in (('1_questions', df1_col), ('2_b2b_corr', corr_mx1)):
    frame.to_excel(writer, sheet_name=sheet_name, index=True)
files.download('b2b_corr_mx.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Survey data preparation

## Importing libraries

In [1]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading Data

In [None]:
# Loading data from local drive
# The result of `files.upload()` is indexed by filename in the next cell, so the
# uploaded workbook has to keep the exact name that cell expects.
from google.colab import files
uploaded2 = files.upload()

In [3]:
# Parse the uploaded survey workbook into a pandas DataFrame
import io

# Raw bytes of the upload, looked up by its exact filename
survey_workbook = uploaded2['raw_data_label_0.1_af.xlsx']
# skiprows=1 drops the first spreadsheet row, so the header is read from the second one
df2 = pd.read_excel(io.BytesIO(survey_workbook), skiprows=1)

In [None]:
# Checking the dataframe: preview the first two rows to confirm the workbook was parsed as expected
df2.head(2)

## Data cleaning

In [5]:
# Replace every missing value with zero (0), then display how many nulls remain
# NOTE(review): after melting, these zeros show up as an answer value of 0 in the
# histograms -- confirm that 0 is not also a legitimate answer code.
df2 = df2.fillna(value=0)
int(df2.isnull().sum().sum())

0

In [5]:
# Build a lookup table from short codes (q1..qN) to the original column labels.
# Has to run while df2.columns still holds the question texts, i.e. before the
# columns are replaced by the short codes.
survey_labels = df2.columns.to_list()
df2_col = pd.DataFrame(survey_labels, columns=['question'],
                       index=[f"q{pos}" for pos in range(1, len(survey_labels) + 1)])
df2_col.tail(2)

Unnamed: 0,question
q74,"Utstyr (mobiltelefon, sim-kort eller lignende)..."
q75,Vurderer dere å bytte mobilselskap det nærmest...


In [None]:
# Swap the long question texts for the short codes q1..qN ...
df2.columns = [f"q{pos}" for pos in range(1, df2.shape[1] + 1)]
# ... then give the first column its original label back (looked up in df2_col),
# so the respondent identifier keeps a readable name
df2 = df2.rename(columns={'q1': df2_col.loc['q1', 'question']})
df2.columns

In [33]:
# Original label of the first column, as stored in the question lookup table
df2_col.loc['q1', 'question']

'Survey subject ID'

## Melting DataFrame

In [7]:
# Reshape from wide to long: the first column stays as the identifier and every
# other column becomes (question, answer) rows
df2_melted = df2.melt(
    id_vars=df2.columns[0],
    value_vars=df2.columns[1:].to_list(),
    var_name='question',
    value_name='answer',
)

In [8]:
# Reassigning column order (question first) and checking the melted DataFrame
df2_melted = df2_melted.loc[:, ['question', 'Survey subject ID', 'answer']]
df2_melted.tail()

Unnamed: 0,question,Survey subject ID,answer
14573,q75,3419,2.0
14574,q75,3421,2.0
14575,q75,3453,2.0
14576,q75,3584,2.0
14577,q75,3592,2.0


## Exporting results to local drive

In [19]:
# Write the melted survey (without its index) and the question lookup (with its
# q-code index) to one workbook, then trigger the download.
# `files` is the google.colab module imported in the data-loading cell.
with pd.ExcelWriter('melted_survey.xlsx', engine='openpyxl') as writer:
  for sheet_name, frame, keep_index in (('01_melted_survey', df2_melted, False),
                                        ('02_questions', df2_col, True)):
    frame.to_excel(writer, sheet_name=sheet_name, index=keep_index)
files.download('melted_survey.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

In [9]:
# Importing libraries
import plotly.express as px
from ipywidgets import interact

In [10]:
# Distinct question codes, in order of first appearance; used as the dropdown options below
df2_questions = pd.unique(df2_melted['question'])

In [None]:
# Static matplotlib histogram of the answers to a single question (q2)
df2_melted.loc[df2_melted['question'] == 'q2', 'answer'].plot.hist()

In [11]:
# Interactive plotly express histogram: one dropdown entry per question code
@interact(Questions=df2_questions)
def plot_melted_survey(Questions):
  """Display a histogram of the answers given to one survey question.

  Questions: question code chosen in the widget (one of `df2_questions`).
  It selects the matching rows of `df2_melted` and is used to look up the
  full question text in `df2_col` for the figure title.

  The figure is shown directly; nothing is returned.
  """
  # Boolean-mask selection already yields a new frame, so no extra copy is taken
  answers = df2_melted.loc[df2_melted['question'] == Questions]
  question_text = df2_col.loc[Questions, 'question']
  histogram = px.histogram(
      answers,
      x='answer',
      title=f'{Questions}: {question_text}',
      text_auto=True,
      width=800,
      height=500,
  )
  # Question texts can be long, so the title font is reduced
  histogram.update_layout(title_font_size=12)
  histogram.show()

interactive(children=(Dropdown(description='Questions', options=('q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9…