In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from matplotlib import gridspec
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
from wordcloud import WordCloud, STOPWORDS

import warnings
# Hide every library warning for the rest of the session so cell output stays compact.
warnings.filterwarnings(action='ignore')

# Global matplotlib text sizes: base font 12, axis tick labels 14.
plt.rcParams['font.size'] = 12
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14

In [53]:
# Raw survey files: multiple-choice answers, free-form answers and the survey schema.
df2018 = pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv')
response = pd.read_csv('../input/kaggle-survey-2018/freeFormResponses.csv')
schema = pd.read_csv('../input/kaggle-survey-2018/SurveySchema.csv')

# One shape per line, in load order.
print(df2018.shape, response.shape, schema.shape, sep='\n')

# Row 0 of the raw frame is printed by later cells as the question text
# (e.g. df2018['Q3'][0]); `df` holds every row after it.
df = df2018.iloc[1:]
df.head()

(23860, 395)
(23860, 35)
(12, 52)


Unnamed: 0,Time from Start to Finish (seconds),Q1,Q1_OTHER_TEXT,Q2,Q3,Q4,Q5,Q6,Q6_OTHER_TEXT,Q7,...,Q49_OTHER_TEXT,Q50_Part_1,Q50_Part_2,Q50_Part_3,Q50_Part_4,Q50_Part_5,Q50_Part_6,Q50_Part_7,Q50_Part_8,Q50_OTHER_TEXT
1,710,Female,-1,45-49,United States of America,Doctoral degree,Other,Consultant,-1,Other,...,-1,,,,,,,,,-1
2,434,Male,-1,30-34,Indonesia,Bachelor’s degree,Engineering (non-computer focused),Other,0,Manufacturing/Fabrication,...,-1,,,,,,,,,-1
3,718,Female,-1,30-34,United States of America,Master’s degree,"Computer science (software engineering, etc.)",Data Scientist,-1,I am a student,...,-1,,Too time-consuming,,,,,,,-1
4,621,Male,-1,35-39,United States of America,Master’s degree,"Social sciences (anthropology, psychology, soc...",Not employed,-1,,...,-1,,,Requires too much technical knowledge,,Not enough incentives to share my work,,,,-1
5,731,Male,-1,22-24,India,Master’s degree,Mathematics or statistics,Data Analyst,-1,I am a student,...,-1,,Too time-consuming,,,Not enough incentives to share my work,,,,-1


In [54]:
def Horizontal_bar_plot(df, column, name='',title='',limit=None,colorscale = 'Picnic',width = 900, height = 500):
    """Plot the most frequent values of ``df[column]`` as a plotly bar chart.

    Despite the name, bars are drawn vertically (``orientation='v'``):
    categories on the x axis, counts on the y axis.

    Parameters
    ----------
    df : DataFrame holding the data.
    column : column whose value counts are plotted.
    name : trace name passed to ``go.Bar``.
    title : figure title.
    limit : keep only the ``limit`` most frequent categories (``None`` = all).
    colorscale : plotly colorscale used to color bars by their count.
    width, height : figure size in pixels.

    The text above each bar is the category's share of *all* non-null answers
    in ``column``, so the labels stay correct when ``limit`` truncates the
    chart (previously they were normalized over the displayed bars only).
    The figure is rendered with ``py.iplot``; nothing is returned.
    """
    counts_all = df[column].value_counts()
    # Denominator is taken before truncation so percentages are not inflated.
    total = counts_all.sum()
    tmp = counts_all[:limit]
    tmp_per = (tmp * 100 / total).round(2)
    tmp_per = [str(v)+' %' for v in tmp_per]
    trace1 = go.Bar(y = tmp.values, x = tmp.index, name=name, orientation='v',
                   marker=dict(color=tmp.values, colorscale = colorscale, line=dict(color='rgb(127,140,141)',width=2),),
                   text = tmp_per, textposition = 'outside',
                   )
    layout = dict(title = title,
                 width = width, height = height,
                 yaxis = dict(automargin = True,),
                 paper_bgcolor = 'rgb(251, 252, 252)',
                 plot_bgcolor = 'rgb(251, 252, 252)'
                 )
    fig = {'data':[trace1],'layout':layout}
    py.iplot(fig)

In [55]:
def Horizontal_Multi_Barplot(df, column, column_filter,title ='', height = 600, width = 850,
                             name = ['Student','Data Scientist','Data Analyst'],limit = None,):
    """Draw one bar chart of ``column`` per category of ``column_filter``.

    The charts are stacked in a single column of subplots that share the
    x axis; each subplot's y axis is titled with its category. Bars are
    vertical (``orientation='v'``) despite the function name.

    Parameters
    ----------
    df : DataFrame holding the data.
    column : column whose value counts are plotted in every subplot.
    column_filter : column used to select the rows of each subplot.
    title : figure title.
    height, width : figure size in pixels.
    name : values of ``column_filter`` to plot, one subplot each (not mutated).
    limit : keep only the ``limit`` most frequent categories (``None`` = all).

    Bar labels are the share of all non-null ``column`` answers within the
    selected rows, computed before ``limit`` is applied. The figure is
    rendered with ``py.iplot``; nothing is returned.
    """
    colors = ['rgb(240,128,128)','rgb(0,255,255)','rgb(186,85,211)','rgb(210,105,30)','rgb(0,0,205)',
              'rgb(124,252,0)','rgb(255,99,71)',]
    # Layout
    fig = tools.make_subplots(rows= len(name), cols=1,
                              vertical_spacing = 0.05, horizontal_spacing = 0.05,
                              print_grid= False,shared_xaxes = True)

    fig['layout'].update(dict(
        showlegend =False,
        height = height,
        width = width,
        title = title,
        paper_bgcolor='rgb(251, 252, 252)',
        plot_bgcolor='rgb(250, 250, 255)'))

    # Multi Plot
    for i, c in enumerate(name):
        counts_all = df.loc[df[column_filter] == c, column].value_counts()
        tmp = counts_all[:limit]
        # Normalize by the untruncated total so `limit` does not inflate shares.
        tmp_per = (tmp * 100 / counts_all.sum()).round(2)
        tmp_per = [str(v)+' %' for v in tmp_per]

        # Plot; the palette is cycled so more than len(colors) categories still work.
        trace1 = go.Bar(y = tmp.values, x = tmp.index, name= c,orientation='v',
            marker=dict(color = colors[i % len(colors)],line=dict(color='rgb( 127, 140, 141)',width=2),),
            text = tmp_per, textposition='auto', textfont = dict(size =13,family = 'Droid')
        )

        fig.append_trace(trace1, i+1,1)
        fig.layout[f'yaxis{i+1}'].update(title = c)

    # Final plot
    py.iplot(fig)

In [56]:
# 4 Pie plot
def Pie_plot_agg(filter_column = '',column = '',name = [], title = '', width = 1000, height= 600, data = None):
    """Draw four donut charts in a 2x2 grid, one per category in ``name``.

    Parameters
    ----------
    filter_column : column used to select the rows of each donut.
    column : column whose value counts make up the slices.
    name : at least four values of ``filter_column``; the first four are
        plotted (top-left, top-right, bottom-left, bottom-right).
    title : figure title.
    width, height : figure size in pixels.
    data : DataFrame to read from. When omitted, the notebook-level ``df``
        is used, which matches the original behavior.

    Raises
    ------
    ValueError
        If fewer than four categories are supplied.

    Each donut gets a size-20 annotation carrying its category name. The
    figure is rendered with ``py.iplot``; nothing is returned.
    """
    if len(name) < 4:
        raise ValueError('Pie_plot_agg needs four categories in `name`, got %d' % len(name))

    frame = df if data is None else data

    # (x-domain, y-domain, annotation x, annotation y) for each quadrant.
    quadrants = [
        ([0, 0.46], [0.54, 1], 0.18, 0.78),
        ([0.54, 1], [0.54, 1], 0.82, 0.78),
        ([0, 0.46], [0, 0.46], 0.17, 0.2),
        ([0.54, 1], [0, 0.46], 0.82, 0.2),
    ]

    traces = []
    annotations = []
    for category, (x_domain, y_domain, text_x, text_y) in zip(name, quadrants):
        counts = frame.loc[frame[filter_column] == category, column].value_counts(ascending = True)
        traces.append(go.Pie(labels = counts.index, values = counts.values,
                             hoverinfo = 'label+percent+name',
                             name = category, hole = .5,
                             domain = dict(x = x_domain, y = y_domain)))
        # Font and text belong to the same annotation; as separate list
        # entries the font size never reached the label.
        annotations.append(dict(font = dict(size=20), showarrow = False,
                                text = category, x = text_x, y = text_y))

    # Layout
    layout = go.Layout(title = title, width = width, height = height,
                       annotations = annotations)
    fig = go.Figure(data = traces, layout = layout)
    py.iplot(fig)

In [57]:
def Venn2_diagram(df,columns):
    """Two-set Venn diagram built from the most frequent value of each column.

    For each of the first two entries of ``columns`` the "set" is the rows
    whose value equals that column's mode (first row of ``DataFrame.mode``).
    Region sizes are passed to ``venn2`` in the order (A only, B only, A and B)
    and the modes themselves are used as set labels.
    Returns whatever ``venn2`` returns.
    """
    label = df[columns].mode().values[0]

    first, second = columns[0], columns[1]
    in_a, not_a = df[first] == label[0], df[first] != label[0]
    in_b, not_b = df[second] == label[1], df[second] != label[1]

    only_a = int((in_a & not_b).sum())
    only_b = int((not_a & in_b).sum())
    both = int((in_a & in_b).sum())

    return venn2(subsets = (only_a, only_b, both), set_labels = label)

def Venn3_diagram(df,columns):
    """Three-set Venn diagram built from the most frequent value of each column.

    For each of the first three entries of ``columns`` the "set" is the rows
    whose value equals that column's mode (first row of ``DataFrame.mode``).
    Region sizes are passed to ``venn3`` in the order
    (A, B, A.B, C, A.C, B.C, A.B.C) and the modes are used as set labels.
    Returns whatever ``venn3`` returns.
    """
    label = df[columns].mode().values[0]

    inside = [df[columns[k]] == label[k] for k in range(3)]
    outside = [df[columns[k]] != label[k] for k in range(3)]

    def region_size(code):
        # Bit k of `code` set -> row must belong to set k, otherwise it must not.
        mask = None
        for k in range(3):
            part = inside[k] if code & (1 << k) else outside[k]
            mask = part if mask is None else (mask & part)
        return int(mask.sum())

    # Codes 1..7 enumerate A, B, A.B, C, A.C, B.C, A.B.C in that order.
    subsets = tuple(region_size(code) for code in range(1, 8))
    return venn3(subsets = subsets, set_labels = label)

In [58]:
# Completion time converted from seconds to minutes; only values below 100 minutes are kept.
tmp = (
    df['Time from Start to Finish (seconds)']
    .astype('int')
    .div(60)
    .loc[lambda minutes: minutes < 100]
)
print('Mean time to anwser the quetions is:', round(tmp.mean(), 2), 'minutes')

# Histogram of the filtered durations
trace1 = go.Histogram(
    x = tmp,
    marker = {
        'color': 'rgb(255, 65, 54)',
        'line': {'color': 'rgb( 127, 140, 141)', 'width': 0.5},
    },
)
layout = {
    'title': 'Duration in minute',
    'width': 800,
    'height': 400,
    'xaxis': {'autorange': True},
    'yaxis': {'automargin': True},
    'paper_bgcolor': 'rgb(251, 252, 252)',
    'plot_bgcolor': 'rgb(251, 252, 252)',
}
fig = dict(data = [trace1], layout = layout)
py.iplot(fig)

Mean time to anwser the quetions is: 18.72 minutes


In [59]:
def Map(tmp, title = '', colorscale = 'Viridis',):
    """Render a world choropleth of per-country values with plotly.

    Parameters
    ----------
    tmp : object exposing ``.index`` and ``.values`` (the notebook passes a
        ``value_counts()`` Series). The index entries are used as locations
        with ``locationmode='country names'`` and as hover text; the values
        are the plotted quantity.
    title : figure title.
    colorscale : plotly colorscale for the map. It is drawn reversed
        (``reversescale=True``). Previously this argument was ignored because
        the trace hard-coded ``colorscale = False``.

    The figure is rendered with ``py.iplot``; nothing is returned.
    """
    data = dict( type = 'choropleth',
               locations = tmp.index,
               z = tmp.values,
               text = tmp.index,
               locationmode = 'country names',
               colorscale = colorscale,
               reversescale =True,
               marker = dict(line = dict(
                             color = 'rgb(100,180,100)',width = 0.3
               ) ),
               # NOTE(review): `autotick` is kept from the original; confirm the
               # installed plotly version still honors it on a colorbar.
               colorbar = dict(autotick = False,
                              title = 'Response'),
               )
    layout = dict(
        title = title,
        geo = dict(showland =True,
                   landcolor = "rgb(250, 250, 250)",
                   showframe = False,
                   showcoastlines =True,
                   # Projection type names are lowercase in plotly.
                   projection = dict(type = 'mercator')
        ))
    fig = dict( data = [data], layout = layout )
    py.iplot(fig,validate=False, filename='word-map')

In [60]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018.loc[0, 'Q3'])

# Number of responses per country, drawn on a world map.
tmp = df['Q3'].value_counts()
title = '2018 Kaggle Survey - Response'
Map(
    tmp,
    title = title,
    colorscale = 'Viridis',
)

In which country do you currently reside?


In [61]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018.loc[0, 'Q3'])

# The 20 most frequent countries of residence.
Horizontal_bar_plot(
    df,
    column = 'Q3',
    title = 'Top 20 Countries Response',
    limit = 20,
)

# Cell output: number of distinct countries among the answers.
df['Q3'].nunique()

In which country do you currently reside?


58

In [62]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018.loc[0, 'Q6'])

title = 'Current Role'
Horizontal_bar_plot(
    df,
    column = 'Q6',
    title = title,
    colorscale = 'Rainbow',
)

Select the title most similar to your current role (or most recent title if retired): - Selected Choice


In [63]:
# Echo both question texts (country, then role) from row 0 of the raw frame.
print(df2018.loc[0, 'Q3'], '\n', df2018.loc[0, 'Q6'])

# Role distribution (Q6) for four selected countries (Q3).
title = 'Country Vs Current Role'
Pie_plot_agg(
    filter_column = 'Q3',
    column = 'Q6',
    title = title,
    name = ['United States of America', 'India', 'China', 'Russia'],
    width = 1000,
    height = 800,
)

In which country do you currently reside? 
 Select the title most similar to your current role (or most recent title if retired): - Selected Choice


In [64]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018['Q7'][0])
# Q7 asks for the employer's industry, so the chart is titled accordingly
# (the per-role chart of the same column also uses 'Current Industry').
title = 'Current Industry'
Horizontal_bar_plot(df, column = 'Q7', title = title)

In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice


In [65]:
# Industry (Q7) broken down by the default roles of Horizontal_Multi_Barplot (filtered on Q6).
title = 'Current Industry'
Horizontal_Multi_Barplot(
    df,
    column = 'Q7',
    column_filter = 'Q6',
    title = title,
    limit = None,
    width = 1000,
)

In [66]:
# Overall distribution of the Q8 answers.
title = 'Work Experience'
Horizontal_bar_plot(
    df,
    column = 'Q8',
    name = 'Year',
    title = title,
    limit = None,
)

In [67]:
# Q8 answers broken down by the default roles (filtered on Q6).
title = 'Work Experience of Data Professionals'
Horizontal_Multi_Barplot(
    df,
    column = 'Q8',
    column_filter = 'Q6',
    title = title,
    limit = None,
    width = 1000,
)

In [68]:
# Q1 answers broken down by the default roles (filtered on Q6).
title = 'Gender'
Horizontal_Multi_Barplot(
    df,
    column = 'Q1',
    column_filter = 'Q6',
    title = title,
    height = 500,
)

In [69]:
# Q2 answers broken down by the default roles (filtered on Q6).
title = 'Age'
Horizontal_Multi_Barplot(
    df,
    column = 'Q2',
    column_filter = 'Q6',
    title = title,
)

In [70]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018['Q5'][0])
# Q5 asks about the *undergraduate* major; title and trace name now say so
# (the trace name 'Year' was carried over from the work-experience cell).
title = 'Undergraduate Major'
Horizontal_bar_plot(df, column= 'Q5', name='Major',title= title, height= 500)

Which best describes your undergraduate major? - Selected Choice


In [71]:
# Q5 is the undergraduate major (see its question text), broken down by role (Q6).
title = 'Undergraduate Major'
Horizontal_Multi_Barplot(df, column= 'Q5', column_filter= 'Q6',title = title,)

In [72]:
# Q5 is the undergraduate major; one donut per selected country (Q3).
# NOTE(review): the title calls these the "top 4 countries" - confirm against the Q3 counts.
Pie_plot_agg(filter_column= 'Q3',
            column = 'Q5',
             title = 'Undergraduate major from top 4 countries',
            name = ['United States of America','India', 'China', 'Russia'],
            width = 1100, height =800)

In [76]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018['Q4'][0])
title = 'Highest Level of Education'
# Trace name describes the plotted quantity ('Year' was a copy-paste leftover).
Horizontal_bar_plot(df, column = 'Q4', name = 'Education level', title = title)

What is the highest level of formal education that you have attained or plan to attain within the next 2 years?


In [77]:
# Q4 answers broken down by the default roles (filtered on Q6).
title = 'Highest level of Education'
Horizontal_Multi_Barplot(
    df,
    column = 'Q4',
    column_filter = 'Q6',
    title = title,
)

In [78]:
# Echo the question text stored in row 0 of the raw frame.
print(df2018.loc[0, 'Q9'])

title = 'Yearly Compensation'
Horizontal_bar_plot(
    df,
    column = 'Q9',
    title = title,
)

What is your current yearly compensation (approximate $USD)?


In [79]:
# Q9 answers broken down by the default roles (filtered on Q6).
title = 'Current Yearly Compensation in Current Role'
Horizontal_Multi_Barplot(
    df,
    column = 'Q9',
    column_filter = 'Q6',
    title = title,
)