# EDAV Project - Data Exploration

In [1]:
import pandas as pd

# Load the cleaned survey responses and discard the leftover 'Unnamed: 0'
# column in a single expression, so re-running the cell always rebuilds `df`
# from the file.
df = pd.read_csv('responses_clean.csv').drop('Unnamed: 0', axis=1)

## Check out the column names

In [2]:
# Show every column label of the survey frame as a plain Python list.
df.columns.tolist()

['Are you on the waiting list?',
 'Program',
 'Programming and Analytical Experiences [R, data manipulation and modeling]',
 'What code/text editor do you use most?',
 'Programming and Analytical Experiences [R, graphic basics (base, lattice, grid etc. )]',
 'Programming and Analytical Experiences [R, advanced (multivariate data analysis, e.g. spatiotemporal data, visualization and modeling)]',
 'Programming and Analytical Experiences [Reproducible documentation with R (e.g. R Markdown)]',
 'Programming and Analytical Experiences [Matlab, data manipulation, analysis, visualization and modeling]',
 'Programming and Analytical Experiences [Github]',
 'R',
 'Excel',
 'SQL',
 'RStudio',
 'ggplot2',
 'Python',
 'Stata',
 'dropbox',
 'google drive (formerly docs)',
 'regular expressions (grep)',
 'Github',
 'shell (terminal / command line)',
 'LaTeX',
 'Sweave/knitr',
 'XML',
 'Web: html css js',
 'C/C++',
 'Matlab',
 'SPSS',
 'lattice']

## Number of responses for each program
Programs other than the top three either have too few responses or are not very specific, so they are combined into a single "Others" category for the analysis below.

In [3]:
# Count the responses recorded for each value of the 'Program' column.
df.loc[:, 'Program'].astype('category').value_counts()

IDSE (master)                 56
Data Science Certification    22
Statistics (master)           17
Other masters                 11
QMSS (master)                  3
Ph.D.                          2
PhD Biomedical Informatics     1
Data Science                   1
Applied Math                   1
dtype: int64

## Mean self-reported confidence level for each program


In [4]:
# Programs reported individually; every other program is folded into a
# single "Others" column.
MAJOR_PROGRAMS = ['Data Science Certification', 'IDSE (master)', 'Statistics (master)']

# Column 1 is 'Program'; columns 2 and 4-8 are the six
# "Programming and Analytical Experiences" questions (column 3, the editor
# question, is skipped).
skill_level = df.iloc[:, [1, 2, 4, 5, 6, 7, 8]]
skill_level = skill_level.groupby('Program').mean()
skill_level.columns = ['R, Data manipulation and modeling', 'R, Graphics', 'R, Advanced', 'RMarkdown', 'Matlab, visualization, etc.', 'Github']

# Select programs by label instead of by row position: `.ix` no longer exists
# in current pandas, and positional picks silently select the wrong rows if
# the set of program names ever changes.
skill_level_major = skill_level.loc[MAJOR_PROGRAMS].round(3)
# NOTE(review): "Others" is the unweighted mean of the per-program means, so
# a program with a single respondent counts as much as one with many.
skill_level_minor = skill_level.drop(MAJOR_PROGRAMS).mean().round(3)
skill_level_minor.name = 'Others'

# DataFrame.append was removed in pandas 2.0; build the "Others" row as a
# one-row frame (keeping the 'Program' axis name) and concatenate instead.
others_row = skill_level_minor.to_frame().T.rename_axis('Program')
skill_level = pd.concat([skill_level_major, others_row]).transpose()
skill_level.to_json('d3/skill_level.json')
skill_level

Program,Data Science Certification,IDSE (master),Statistics (master),Others
"R, Data manipulation and modeling",1.227,1.661,1.941,1.513
"R, Graphics",0.682,1.143,1.294,1.215
"R, Advanced",0.5,0.982,1.294,0.821
RMarkdown,0.591,1.107,0.588,1.172
"Matlab, visualization, etc.",0.636,0.929,0.882,0.553
Github,1.182,0.911,0.882,1.333


## Proportion of students reporting being familiar with each tool for each program
These proportions are exported to CSV for use in the heatmap built in R.

In [5]:
# Programs reported individually; every other program is folded into a
# single "Others" column.
MAJOR_PROGRAMS = ['Data Science Certification', 'IDSE (master)', 'Statistics (master)']

# Keep 'Program' (column 1) plus every tool column from position 9 onwards,
# then average each tool column within each program.
skills = pd.concat([df.iloc[:, 1], df.iloc[:, 9:]], axis=1)
skills = skills.groupby('Program').mean()

# Select programs by label instead of by row position: `.ix` no longer exists
# in current pandas, and positional picks silently select the wrong rows if
# the set of program names ever changes.
skills_major = skills.loc[MAJOR_PROGRAMS].round(3)
# NOTE(review): "Others" is the unweighted mean of the per-program means, so
# a program with a single respondent counts as much as one with many.
skills_minor = skills.drop(MAJOR_PROGRAMS).mean().round(3)
skills_minor.name = 'Others'

# DataFrame.append was removed in pandas 2.0; build the "Others" row as a
# one-row frame (keeping the 'Program' axis name) and concatenate instead.
others_row = skills_minor.to_frame().T.rename_axis('Program')
skills = pd.concat([skills_major, others_row]).transpose()
skills.to_csv('heatmaps/skills.csv')
skills

Program,Data Science Certification,IDSE (master),Statistics (master),Others
R,0.591,0.821,0.882,0.803
Excel,0.864,0.661,0.529,0.553
SQL,0.591,0.554,0.176,0.437
RStudio,0.545,0.804,0.882,0.72
ggplot2,0.136,0.268,0.353,0.422
Python,0.727,0.893,0.353,0.856
Stata,0.136,0.143,0.118,0.308
dropbox,0.636,0.607,0.412,0.702
google drive (formerly docs),0.818,0.732,0.412,0.604
regular expressions (grep),0.227,0.125,0.0,0.434
