# EDAV Project - Data Exploration

In [1]:
import pandas as pd

# Load the cleaned survey responses and discard the 'Unnamed: 0' column
# in the same expression, so `df` is bound only once.
df = pd.read_csv('responses_clean.csv').drop('Unnamed: 0', axis=1)

## Column names

In [2]:
# Show every column label of the survey frame as a plain Python list.
df.columns.tolist()

['Are you on the waiting list?',
 'Program',
 'Programming and Analytical Experiences [R, data manipulation and modeling]',
 'What code/text editor do you use most?',
 'Programming and Analytical Experiences [R, graphic basics (base, lattice, grid etc. )]',
 'Programming and Analytical Experiences [R, advanced (multivariate data analysis, e.g. spatiotemporal data, visualization and modeling)]',
 'Programming and Analytical Experiences [Reproducible documentation with R (e.g. R Markdown)]',
 'Programming and Analytical Experiences [Matlab, data manipulation, analysis, visualization and modeling]',
 'Programming and Analytical Experiences [Github]',
 'R',
 'Excel',
 'SQL',
 'RStudio',
 'ggplot2',
 'Python',
 'Stata',
 'dropbox',
 'google drive (formerly docs)',
 'regular expressions (grep)',
 'Github',
 'shell (terminal / command line)',
 'LaTeX',
 'Sweave/knitr',
 'XML',
 'Web: html css js',
 'C/C++',
 'Matlab',
 'SPSS',
 'lattice']

## Number of responses for each program

In [3]:
# Count respondents per program; the column is cast to a categorical first.
(
    df.loc[:, 'Program']
    .astype('category')
    .value_counts()
)

IDSE (master)                 56
Data Science Certification    22
Statistics (master)           17
Other masters                 11
QMSS (master)                  3
Ph.D.                          2
PhD Biomedical Informatics     1
Data Science                   1
Applied Math                   1
dtype: int64

## Mean self-reported confidence level for each program

In [4]:
# Pick the grouping key plus every self-rated experience column by NAME rather
# than by hard-coded position, so the cell keeps selecting the right columns
# even if the CSV's column order changes. All experience columns share the
# same label prefix; the 'What code/text editor ...' column does not, so it
# is excluded just as it was by the positional selection.
experience_prefix = 'Programming and Analytical Experiences'
experience_columns = [
    label for label in df.columns if label.startswith(experience_prefix)
]
skill_level = df[['Program'] + experience_columns]

# Mean rating per program, rounded to three decimals for display.
skill_level.groupby('Program').mean().round(3)

Unnamed: 0_level_0,"Programming and Analytical Experiences [R, data manipulation and modeling]","Programming and Analytical Experiences [R, graphic basics (base, lattice, grid etc. )]","Programming and Analytical Experiences [R, advanced (multivariate data analysis, e.g. spatiotemporal data, visualization and modeling)]",Programming and Analytical Experiences [Reproducible documentation with R (e.g. R Markdown)],"Programming and Analytical Experiences [Matlab, data manipulation, analysis, visualization and modeling]",Programming and Analytical Experiences [Github]
Program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Applied Math,0.0,0.0,0.0,1.0,0.0,2.0
Data Science,2.0,2.0,1.0,3.0,1.0,2.0
Data Science Certification,1.227,0.682,0.5,0.591,0.636,1.182
IDSE (master),1.661,1.143,0.982,1.107,0.929,0.911
Other masters,1.909,1.455,1.091,1.364,0.818,1.0
Ph.D.,1.5,1.5,0.5,0.0,0.5,1.0
PhD Biomedical Informatics,2.0,1.0,1.0,0.0,0.0,1.0
QMSS (master),1.667,1.333,1.333,1.667,1.0,1.0
Statistics (master),1.941,1.294,1.294,0.588,0.882,0.882


## Proportion of students in each program who report being familiar with each tool

In [5]:
# Combine the grouping key with every tool column, selected by label instead
# of by magic position: the tool columns run from 'R' through the last column
# of the frame, so a label slice starting at 'R' picks them all up.
skills = pd.concat([df['Program'], df.loc[:, 'R':]], axis=1)

# Per-program mean of each tool column, rounded to three decimals for display.
skills.groupby('Program').mean().round(3)

Unnamed: 0_level_0,R,Excel,SQL,RStudio,ggplot2,Python,Stata,dropbox,google drive (formerly docs),regular expressions (grep),Github,shell (terminal / command line),LaTeX,Sweave/knitr,XML,Web: html css js,C/C++,Matlab,SPSS,lattice
Program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Applied Math,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Data Science,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Data Science Certification,0.591,0.864,0.591,0.545,0.136,0.727,0.136,0.636,0.818,0.227,0.409,0.5,0.091,0.045,0.227,0.273,0.182,0.182,0.0,0.0
IDSE (master),0.821,0.661,0.554,0.804,0.268,0.893,0.143,0.607,0.732,0.125,0.375,0.393,0.375,0.071,0.125,0.25,0.339,0.375,0.143,0.018
Other masters,0.818,0.818,0.455,0.818,0.364,0.636,0.182,0.545,0.455,0.273,0.455,0.273,0.182,0.182,0.0,0.182,0.273,0.455,0.182,0.091
Ph.D.,1.0,0.5,0.5,0.5,0.5,0.5,0.0,1.0,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.5,0.0,0.5,0.0,0.0
PhD Biomedical Informatics,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
QMSS (master),1.0,1.0,0.667,1.0,0.667,1.0,0.667,0.667,0.667,0.333,0.333,0.333,0.667,0.667,0.333,0.333,1.0,0.333,0.333,0.333
Statistics (master),0.882,0.529,0.176,0.882,0.353,0.353,0.118,0.412,0.412,0.0,0.294,0.0,0.235,0.0,0.059,0.059,0.353,0.412,0.176,0.0
