## More questions:
> 1- Work challenges

In [3]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
%matplotlib inline

In [4]:
df = pd.read_csv('survey_results_public_2019.csv', encoding =  "ISO-8859-1", low_memory=False)

In [7]:
df = df[df['WorkRemote'].notna()]

In [8]:
df['WorkRemote'].value_counts()

Less than once per month / Never                           30220
A few days each month                                      17242
All or almost all the time (I'm full-time remote)           8465
Less than half the time, but at least one day each week     6320
It's complicated                                            3675
More than half, but not all, the time                       2376
About half the time                                         1986
Name: WorkRemote, dtype: int64

#### 1- Work Challenges

In [23]:
df[df['WorkRemote'] == "All or almost all the time (I'm full-time remote)"].groupby('WorkChallenge').count()['Respondent'].reset_index().sort_values('Respondent', ascending = False)

Unnamed: 0,WorkChallenge,Respondent
4,Being tasked with non-development work;Distrac...,967
6,Being tasked with non-development work;Distrac...,732
24,Being tasked with non-development work;Meeting...,700
53,Distracting work environment;Meetings;Not enou...,594
37,Distracting work environment,591
...,...,...
127,Time spent commuting;Toxic work environment,42
108,"Meetings;Non-work commitments (parenting, scho...",42
90,Lack of support from management;Meetings;Non-w...,37
81,Inadequate access to necessary tools;Non-work ...,36


In [40]:
from collections import defaultdict
def total_count(df, col1, col2, look_for):
    '''
    INPUT:
    df - the pandas dataframe you want to search
    col1 - the column name you want to look through
    col2 - the column you want to count values from
    look_for - a list of strings you want to search for in each row of df[col]
    
    OUTPUT:
    new_df - a dataframe of each look_for with the count of how often it shows up 
    '''
    new_df = defaultdict(int)
    for val in look_for:
        for idx in range(df.shape[0]):
            if val in df[col1][idx]:
                new_df[val] += int(df[col2][idx])   
    new_df = pd.DataFrame(pd.Series(new_df)).reset_index()
    new_df.columns = [col1, col2]
    new_df.sort_values('count', ascending=False, inplace=True)
    return new_df


remote_options = ['Less than once per month / Never', 'A few days each month', "All or almost all the time (I'm full-time remote)"
                  ,'Less than half the time, but at least one day each week',"It's complicated",'More than half, but not all, the time','About half the time']   

for r in remote_options:
    print()
    print()
    print(r)
    
    challenge  = df[df['WorkRemote']==r]['WorkChallenge'].value_counts().reset_index()
    challenge.rename(columns={'index':'method', 'WorkChallenge':'count'}, inplace=True)
    possible_values = set()
    challenge['method'].apply(lambda x : [possible_values.add(item) for item in x.split(';')])

    print(total_count(challenge, 'method', 'count', possible_values))



Less than once per month / Never
                                              method  count
2                       Distracting work environment  12352
4             Being tasked with non-development work  11021
5                 Not enough people for the workload   9830
8                                           Meetings   9771
0                    Lack of support from management   8298
7               Inadequate access to necessary tools   6898
6                             Toxic work environment   6304
3                               Time spent commuting   5723
1  Non-work commitments (parenting, school work, ...   4518


A few days each month
                                              method  count
2                       Distracting work environment   7291
8                                           Meetings   6925
4             Being tasked with non-development work   6143
5                 Not enough people for the workload   5780
0                    Lack of support from

#### 2- Operating system

In [59]:
remote_options = ['Less than once per month / Never', 'A few days each month', "All or almost all the time (I'm full-time remote)"
                  ,'Less than half the time, but at least one day each week',"It's complicated",'More than half, but not all, the time','About half the time']   


challenge  = df[df['WorkRemote']=="All or almost all the time (I'm full-time remote)"]['OpSys'].value_counts().reset_index()
challenge.rename(columns={'index':'method', 'OpSys':'count'}, inplace=True)
possible_values = set()
challenge['method'].apply(lambda x : [possible_values.add(item) for item in x.split(';')])
print(total_count(challenge, 'method', 'count', possible_values))

challenge  = df[df['WorkRemote']!="All or almost all the time (I'm full-time remote)"]['OpSys'].value_counts().reset_index()
challenge.rename(columns={'index':'method', 'OpSys':'count'}, inplace=True)
possible_values = set()
challenge['method'].apply(lambda x : [possible_values.add(item) for item in x.split(';')])
print(total_count(challenge, 'method', 'count', possible_values))

challenge  = df['OpSys'].value_counts().reset_index()
challenge.rename(columns={'index':'method', 'OpSys':'count'}, inplace=True)
possible_values = set()
challenge['method'].apply(lambda x : [possible_values.add(item) for item in x.split(';')])
print(total_count(challenge, 'method', 'count', possible_values))

        method  count
2        MacOS   3158
3      Windows   2944
1  Linux-based   2279
0          BSD     16
        method  count
3      Windows  28650
2        MacOS  17469
1  Linux-based  15248
0          BSD     64
        method  count
3      Windows  31594
2        MacOS  20627
1  Linux-based  17527
0          BSD     80


#### 3- Country

In [68]:
df.groupby(['Country', 'WorkRemote']).agg({'Respondent': 'count'}).groupby(level=0).apply(lambda x:100 * x / float(x.sum())).sort_values('Respondent')

Unnamed: 0_level_0,Unnamed: 1_level_0,Respondent
Country,WorkRemote,Unnamed: 2_level_1
South Africa,"More than half, but not all, the time",0.772201
Slovenia,About half the time,0.806452
Malaysia,About half the time,0.947867
South Korea,"More than half, but not all, the time",0.952381
Uruguay,"More than half, but not all, the time",1.010101
...,...,...
Bahamas,All or almost all the time (I'm full-time remote),100.000000
Djibouti,Less than once per month / Never,100.000000
Lesotho,Less than once per month / Never,100.000000
Angola,It's complicated,100.000000


In [19]:
df[df['WorkRemote'] == "All or almost all the time (I'm full-time remote)"].groupby('Country').count()['Respondent'].reset_index().sort_values('Respondent', ascending = False)

Unnamed: 0,Country,Respondent
127,United States,2778
51,India,442
125,United Kingdom,437
21,Canada,364
42,Germany,326
...,...,...
13,Benin,1
9,Bahamas,1
8,Azerbaijan,1
3,Antigua and Barbuda,1


#### Company size

In [69]:
df['OrgSize'].value_counts()

20 to 99 employees                                    14670
100 to 499 employees                                  12338
10,000 or more employees                               9657
1,000 to 4,999 employees                               7210
2-9 employees                                          7040
10 to 19 employees                                     6505
500 to 999 employees                                   4357
Just me - I am a freelancer, sole proprietor, etc.     3742
5,000 to 9,999 employees                               2859
Name: OrgSize, dtype: int64

In [71]:
df[df['WorkRemote'] == "All or almost all the time (I'm full-time remote)"]['OrgSize'].value_counts()

Just me - I am a freelancer, sole proprietor, etc.    1968
2-9 employees                                         1670
20 to 99 employees                                    1383
100 to 499 employees                                   971
10 to 19 employees                                     878
10,000 or more employees                               499
1,000 to 4,999 employees                               417
500 to 999 employees                                   315
5,000 to 9,999 employees                               145
Name: OrgSize, dtype: int64

In [72]:
df[df['WorkRemote'] == "Less than once per month / Never"]['OrgSize'].value_counts()

20 to 99 employees                                    7011
100 to 499 employees                                  5961
10,000 or more employees                              4010
1,000 to 4,999 employees                              3416
10 to 19 employees                                    2904
2-9 employees                                         2346
500 to 999 employees                                  2041
5,000 to 9,999 employees                              1304
Just me - I am a freelancer, sole proprietor, etc.     376
Name: OrgSize, dtype: int64

#### Social media

In [20]:
df[df['WorkRemote'] == "All or almost all the time (I'm full-time remote)"].groupby('SocialMedia').count()['Respondent'].reset_index().sort_values('Respondent', ascending = False)

Unnamed: 0,SocialMedia,Respondent
7,Twitter,1653
0,Facebook,1279
5,Reddit,1249
12,YouTube,1132
11,WhatsApp,987
2,I don't use social media,689
3,Instagram,537
4,LinkedIn,426
8,VK ÐÐÐ¾Ð½ÑÐ°ÌÐºÑÐµ,76
6,Snapchat,34


In [24]:
df[df['WorkRemote'] == "Less than once per month / Never"].groupby('SocialMedia').count()['Respondent'].reset_index().sort_values('Respondent', ascending = False)

Unnamed: 0,SocialMedia,Respondent
5,Reddit,5216
11,WhatsApp,4938
0,Facebook,4672
12,YouTube,4475
7,Twitter,3586
3,Instagram,2133
2,I don't use social media,1803
4,LinkedIn,1504
9,WeChat å¾®ä¿¡,300
8,VK ÐÐÐ¾Ð½ÑÐ°ÌÐºÑÐµ,192


### Workhour per week

In [73]:
df['WorkWeekHrs'].value_counts()

40.00    30061
45.00     6322
50.00     4639
35.00     3152
37.50     1660
         ...  
46.50        1
41.20        1
8.15         1
7.22         1
35.75        1
Name: WorkWeekHrs, Length: 206, dtype: int64

In [76]:
df[df['WorkRemote'] == "Less than once per month / Never"]['WorkWeekHrs'].value_counts()

40.0     13714
45.0      2952
50.0      1638
35.0      1158
37.5       930
         ...  
36.2         1
46.5         1
73.0         1
21.5         1
130.0        1
Name: WorkWeekHrs, Length: 166, dtype: int64

In [77]:
df[df['WorkRemote'] == "All or almost all the time (I'm full-time remote)"]['WorkWeekHrs'].value_counts()

40.00     3036
50.00      695
45.00      521
30.00      494
35.00      445
          ... 
19.00        1
110.00       1
88.33        1
63.00        1
62.50        1
Name: WorkWeekHrs, Length: 103, dtype: int64