In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter
%matplotlib inline

Get freelancer profile by key

Endpoint
GET /api/profiles/v1/providers/{profile_key}.{format}

In [2]:
def json_read(in_file):
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    The individual objects are stitched into a single JSON array
    ("[obj1,obj2,...]") and handed to ``pd.read_json`` so that pandas
    performs its usual dtype inference.

    Parameters
    ----------
    in_file : str
        Path to a UTF-8 text file containing one JSON object per line.
        Blank lines are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object found in the file.
    """
    # Local import keeps this cell self-contained.
    import io

    # Read as text (not bytes) so the join below works on both Python 2 and 3.
    with io.open(in_file, 'r', encoding='utf-8') as f:
        stripped = [line.strip() for line in f]

    # Skip blank lines: they would otherwise leave empty slots (",,") in the
    # array and make the whole document invalid JSON.
    objects = [line for line in stripped if line]

    # Wrap the comma-separated objects in square brackets to form one array.
    data_json_str = u"[" + u",".join(objects) + u"]"

    # Pass a file-like buffer rather than the raw string; pandas accepts
    # buffers everywhere, while literal JSON strings are deprecated in newer
    # releases.
    out_df = pd.read_json(io.StringIO(data_json_str))
    return out_df

## Data Science

In [4]:
# Load the data-scientist profile file into a DataFrame.
# NOTE(review): this path climbs two directories ('../../data') while the
# other loads in this notebook use '../data' — confirm which one is correct.
ds_profiles = json_read('../../data/skills_id/data_scientist.txt')

In [5]:
# Summarise the loaded frame: row count, columns, non-null counts and dtypes.
ds_profiles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3549 entries, 0 to 3548
Data columns (total 16 columns):
categories2              3494 non-null object
country                  3549 non-null object
description              3536 non-null object
feedback                 3549 non-null float64
groups                   25 non-null object
id                       3549 non-null object
last_activity            3549 non-null object
member_since             3549 non-null object
name                     3549 non-null object
portfolio_items_count    3549 non-null int64
portrait_50              3294 non-null object
profile_type             3549 non-null object
rate                     3549 non-null float64
skills                   2828 non-null object
test_passed_count        3549 non-null int64
title                    3521 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 471.4+ KB


In [6]:
# Show how often each distinct profile title occurs, most frequent first.
ds_profiles['title'].value_counts()

Data Scientist                                                       635
Data scientist                                                       101
Computer Scientist                                                    47
Research Scientist                                                    30
Scientist                                                             27
Senior Data Scientist                                                 22
data scientist                                                        18
Data Analyst                                                          16
Software Engineer                                                     15
Environmental Scientist                                               15
Junior Data Scientist                                                 11
computer scientist                                                    11
Freelance Data Scientist                                               9
Computer scientist                                 

In [7]:
def data_scientist_in(str):
    """Return True when a title mentions both 'data' and 'scientist'.

    Matching is a case-insensitive substring test. Missing or non-string
    titles (NaN floats, None, numbers) yield False instead of raising.

    The parameter is named ``str`` (shadowing the builtin) only to keep the
    existing call signature unchanged.
    """
    # Duck-type the check: the builtin ``str`` is shadowed by the parameter,
    # and anything without ``.lower`` (NaN, None, ...) is not a usable title.
    lower = getattr(str, 'lower', None)
    if lower is None:
        return False
    title = lower()
    return 'data' in title and 'scientist' in title

In [8]:
# Apply the scalar title predicate element-wise to every profile title and
# store the resulting boolean flags in a new 'ds_flag' column.
title_array = np.array(ds_profiles[u'title'])
vect_dsi = np.vectorize(data_scientist_in)
ds_profiles['ds_flag'] = vect_dsi(title_array)

In [13]:
# Title frequencies restricted to the rows flagged as data scientists.
ds_profiles.loc[ds_profiles['ds_flag'] == True, u'title'].value_counts()

Data Scientist                                                            635
Data scientist                                                            101
Senior Data Scientist                                                      22
data scientist                                                             18
Junior Data Scientist                                                      11
Freelance Data Scientist                                                    9
Statistician/Data Scientist                                                 8
Expert Data Scientist                                                       6
Aspiring Data Scientist                                                     6
Data Scientist/Analyst                                                      5
Lead Data Scientist                                                         4
Experienced Data Scientist                                                  4
Web Developer and Data Scientist                                

In [14]:
# Peek at the skills of the first flagged profile. Use positional .iloc[0]:
# a plain [0] is a *label* lookup on the filtered Series and raises KeyError
# whenever the row labelled 0 is not among the flagged rows.
ds_profiles[ds_profiles['ds_flag'] == True][u'skills'].iloc[0]

[u'elasticsearch',
 u'full-text-search-engines',
 u'data-science',
 u'machine-learning',
 u'natural-language-processing',
 u'lucene-search']

In [15]:
# Skills column for the flagged data-scientist rows only.
skills_of_ds = ds_profiles.loc[ds_profiles['ds_flag'] == True, u'skills']

In [16]:
# Tally how often each skill appears across the flagged profiles.
ds_skill_words = Counter()
for index, skill_set in enumerate(skills_of_ds):
    if isinstance(skill_set, float):
        # A missing skills entry is NaN (a float) rather than a list.
        # `index` is the enumerate *position* within skills_of_ds, not the
        # DataFrame row label — inspect such rows with .iloc, not .loc.
        print(index)
        continue
    # Update in place instead of building a fresh Counter on every row.
    ds_skill_words.update(skill_set)

3
28
29
31
42
45
46
49
52
58
66
68
69
70
71
76
77
78
113
120
128
130
145
147
148
149
150
151
155
156
157
179
190
191
192
194
207
208
217
219
223
224
229
248
249
253
258
259
264
267
271
307
335
359
360
363
371
372
373
374
379
380
386
429
430
436
447
449
459
465
466
467
469
470
474
475
477
479
482
503
510
514
516
521
526
528
540
541
561
564
569
592
599
600
601
604
605
606
609
610
613
614
621
630
634
636
639
641
642
643
644
645
646
649
655
656
659
660
668
670
671
672
674
689
699
706
727
729
731
732
744
751
752
755
756
761
767
768
769
772
780
787
788
789
798
805
814
816
818
819
820
822
824
849
856
875
879
880
884
886
888
889
890
895
899
901
909
910
912
915
922
926
928
938
939
940
941
950
957
958
959
962
970
971
972
973
990
994
995
998
1004
1009
1016
1020
1021
1023
1038
1050
1051
1056
1058
1059
1061
1062
1068
1070
1077
1078
1079
1080
1081
1082
1084
1088
1090
1092
1097
1101
1107
1108
1118
1120
1122
1123
1130
1132
1134
1145
1151
1152
1153
1172
1178
1184
1185
1190
1194
1199
1210
1219
1229
1231

In [17]:
# Inspect the first entry reported as missing by the tally loop. That loop
# prints enumerate positions, so the lookup must be positional (.iloc);
# a plain [3] would instead fetch the row whose *label* is 3.
ds_profiles[ds_profiles['ds_flag'] == True][u'skills'].iloc[3]

nan

In [18]:
# Show the full record at position 3 of the flagged rows. The 3 comes from
# the tally loop's enumerate counter, so it is a position (.iloc), not a
# row label (.loc).
ds_profiles[ds_profiles['ds_flag'] == True].iloc[3]

categories2                                                            NaN
country                                                      United States
description              I have seven years of experience using data to...
feedback                                                                 0
groups                                                                 NaN
id                                                     ~012e0c5b631c1fea24
last_activity                                               April 10, 2016
member_since                                             February 21, 2011
name                                                        Christopher F.
portfolio_items_count                                                    1
portrait_50              https://odesk-prod-portraits.s3.amazonaws.com/...
profile_type                                                   Independent
rate                                                                   159
skills                   

In [19]:
# Display the complete skill -> occurrence-count tally for data scientists.
ds_skill_words

Counter({u'.net-framework': 9,
         u'3d-design': 1,
         u'ab-testing': 11,
         u'academic-writing': 12,
         u'accounting': 3,
         u'ad-posting': 1,
         u'adaptive-algorithms': 1,
         u'administrative-support': 4,
         u'adobe-acrobat': 1,
         u'adobe-after-effects': 1,
         u'adobe-analytics': 4,
         u'adobe-business-catalyst': 2,
         u'adobe-creative-suite': 2,
         u'adobe-digital-marketing-suite': 1,
         u'adobe-dreamweaver': 1,
         u'adobe-illustrator': 4,
         u'adobe-indesign': 2,
         u'adobe-photoshop': 10,
         u'adobe-photoshop-lightroom': 1,
         u'adobe-premiere-pro': 1,
         u'advertising': 1,
         u'agile-software-development': 9,
         u'aix': 1,
         u'ajax': 2,
         u'akka': 1,
         u'algorithm-development': 12,
         u'algorithms': 44,
         u'amazon-ec2': 11,
         u'amazon-rds': 3,
         u'amazon-s3': 5,
         u'amazon-web-services': 25,
    

In [22]:
# Skill names ordered from most to least frequent (ties keep Counter order).
ds_popular_skills = sorted(ds_skill_words, key=ds_skill_words.get, reverse=True)
# Call form of print: same output on Python 2, and valid on Python 3.
print(ds_popular_skills[:25])

[u'r', u'python', u'machine-learning', u'data-science', u'data-analysis', u'data-mining', u'statistics', u'sql', u'microsoft-excel', u'java', u'matlab', u'python-numpy', u'data-visualization', u'python-scipy', u'sas', u'data-modeling', u'hadoop', u'tableau', u'apache-spark', u'javascript', u'predictive-analytics', u'ibm-spss', u'excel-vba', u'big-data', u'c++']


In [24]:
# List the 30 most frequent data-scientist skills as (skill, count) pairs.
for skill_tuple in ds_skill_words.most_common(30):
    # Call form of print: same output on Python 2, and valid on Python 3.
    print(skill_tuple)

(u'r', 758)
(u'python', 714)
(u'machine-learning', 531)
(u'data-science', 488)
(u'data-analysis', 420)
(u'data-mining', 378)
(u'statistics', 370)
(u'sql', 292)
(u'microsoft-excel', 248)
(u'java', 201)
(u'matlab', 194)
(u'python-numpy', 166)
(u'data-visualization', 161)
(u'python-scipy', 152)
(u'sas', 131)
(u'data-modeling', 114)
(u'hadoop', 108)
(u'tableau', 106)
(u'apache-spark', 106)
(u'javascript', 106)
(u'predictive-analytics', 96)
(u'ibm-spss', 94)
(u'excel-vba', 91)
(u'big-data', 89)
(u'c++', 80)
(u'mysql', 76)
(u'data-scraping', 75)
(u'statistical-computing', 73)
(u'analytics', 70)
(u'c', 68)


## Data Engineer

In [71]:
# Load the data-engineer profile file and show how often each title occurs.
de_profiles = json_read('../data/skills_id/data_engineer.txt')
de_profiles['title'].value_counts()


Software Engineer                                                         2007
Engineer                                                                   737
Senior Software Engineer                                                   364
Computer Engineer                                                          341
Mechanical Engineer                                                        328
Electrical Engineer                                                        301
Network Engineer                                                           248
Civil Engineer                                                             215
software engineer                                                          199
QA Engineer                                                                195
Software engineer                                                          166
System Engineer                                                            154
Software Developer                                  

In [81]:
def data_engineer_in(str):
    """Return True when a title looks like a data-engineering title.

    The title must contain both 'data' and 'engineer' and must not contain
    'entry' (which excludes data-entry roles). Matching is a case-insensitive
    substring test. Missing or non-string titles (NaN floats, None, numbers)
    yield False instead of raising.

    The parameter is named ``str`` (shadowing the builtin) only to keep the
    existing call signature unchanged.
    """
    # Duck-type the check: the builtin ``str`` is shadowed by the parameter,
    # and anything without ``.lower`` (NaN, None, ...) is not a usable title.
    lower = getattr(str, 'lower', None)
    if lower is None:
        return False
    title = lower()
    return 'data' in title and 'engineer' in title and 'entry' not in title

In [82]:
# Flag every profile whose title passes the data-engineer predicate.
vect_dei = np.vectorize(data_engineer_in)
de_profiles['de_flag'] = vect_dei(np.array(de_profiles[u'title']))

# Work on the flagged subset once instead of re-filtering for each print.
de_flagged = de_profiles[de_profiles['de_flag'] == True]
print(de_flagged[u'title'].value_counts())
# Positional .iloc[0] gives the first flagged profile; a plain [0] is a
# label lookup and raises KeyError when the row labelled 0 is not flagged.
print(de_flagged[u'skills'].iloc[0])

Data Engineer                                                             62
Big Data Engineer                                                         20
Database Engineer                                                         11
Data engineer                                                              7
Big data engineer                                                          6
Senior Data Engineer                                                       5
Data Visualization Engineer                                                4
Data Science Engineer                                                      4
Python data engineer                                                       4
Data Network Engineer                                                      3
Big Data Software Engineer                                                 3
Data Scientist/Engineer                                                    3
Engineer in statistics and data analysis                                   3

In [83]:
# Tally how often each skill appears across the flagged data-engineer rows.
skills_of_de = de_profiles[de_profiles['de_flag'] == True][u'skills']
de_skill_words = Counter()
for skill_set in skills_of_de:
    # A missing skills entry is NaN (a float) rather than a list; skip it.
    if isinstance(skill_set, float):
        continue
    # Update in place instead of building a fresh Counter on every row.
    de_skill_words.update(skill_set)

1
5
10
23
29
61
65
72
73
80
81
85
88
93
96
97
99
102
105
108
121
122
125
149
169
172
175
178
185
187
197
203
206
207
208
209
211
212
213
215
217
226
228
232
237
240
241
243
245
249
252
258
280
282
283
287
294
313
314
316
322
325
333
337
342
345
361
364
369
374
377
380
386
396
398
408
414
417
420
430
435
437
444
445
446
451
454
456
460
461
462
469
470
477
480
488
490
492
493
497
512
513
524
526
531
537
539
542
551
568
571
584
587
594
596
598
606
609
611
612
613
614
621
626
631
633
636
639
644
647
649
659
661
662
667
670
679
680
681
685
687
690
694
709
720
721
722
725
726
728
731
737
740
748
753
754
755
757
760
763
780
789
790
791
794
795
797
799
800
802
804
805
812
813
815
816
822
829
832
835
837
839
843
846
847
848
853
855
860
862
863
864
867
872
874
877
883
885
886
887
896
908
913
923
932
933
934
964
969
972
976
991
995
998
1002
1003


In [85]:
# List the 20 most frequent data-engineer skills as (skill, count) pairs.
for skill_tuple in de_skill_words.most_common(20):
    # Call form of print: same output on Python 2, and valid on Python 3.
    print(skill_tuple)

(u'python', 205)
(u'java', 163)
(u'sql', 152)
(u'microsoft-excel', 140)
(u'data-analysis', 121)
(u'hadoop', 111)
(u'data-entry', 103)
(u'machine-learning', 100)
(u'r', 100)
(u'data-mining', 88)
(u'mysql', 74)
(u'data-science', 70)
(u'microsoft-word', 69)
(u'javascript', 66)
(u'apache-spark', 64)
(u'php', 60)
(u'c#', 58)
(u'apache-hive', 52)
(u'matlab', 49)
(u'microsoft-powerpoint', 48)
(u'mongodb', 48)
(u'c', 45)
(u'etl', 43)
(u'big-data', 39)
(u'excel-vba', 38)
(u'internet-research', 38)
(u'database-design', 38)
(u'data-engineering', 38)
(u'data-warehousing', 37)
(u'c++', 37)


## Big Data

In [86]:
# Load the big-data profile file and show how often each title occurs.
bd_profiles = json_read('../data/skills_id/big_data.txt')
bd_profiles['title'].value_counts()


Software Engineer                                                        143
Data Scientist                                                           132
Data Entry Professional                                                  107
Software Developer                                                        91
Virtual Assistant                                                         84
Web Developer                                                             83
Data Analyst                                                              59
Senior Software Engineer                                                  55
Data Entry                                                                38
Data Entry Specialist                                                     37
Graphic Designer                                                          34
Java Developer                                                            33
Web developer                                                             30

In [87]:
def big_data_in(str):
    """Return True when a title mentions both 'big' and 'data'.

    Matching is a case-insensitive substring test, so 'Bigdata Developer'
    also matches. Missing or non-string titles (NaN floats, None, numbers)
    yield False instead of raising.

    The parameter is named ``str`` (shadowing the builtin) only to keep the
    existing call signature unchanged.
    """
    # Duck-type the check: the builtin ``str`` is shadowed by the parameter,
    # and anything without ``.lower`` (NaN, None, ...) is not a usable title.
    lower = getattr(str, 'lower', None)
    if lower is None:
        return False
    title = lower()
    return 'big' in title and 'data' in title

In [88]:
# Flag every profile whose title passes the big-data predicate.
vect_bdi = np.vectorize(big_data_in)
bd_profiles['bd_flag'] = vect_bdi(np.array(bd_profiles[u'title']))

# Work on the flagged subset once instead of re-filtering for each print.
bd_flagged = bd_profiles[bd_profiles['bd_flag'] == True]
print(bd_flagged[u'title'].value_counts())
# Positional .iloc[0] gives the first flagged profile; a plain [0] is a
# label lookup and raises KeyError when the row labelled 0 is not flagged.
print(bd_flagged[u'skills'].iloc[0])

Big Data Engineer                                                         20
Big Data Developer                                                        15
Big Data Consultant                                                        9
Big Data Expert                                                            7
Big Data Architect                                                         6
Big data engineer                                                          6
Big Data Analyst                                                           5
Java/Big Data Developer                                                    5
Big Data Analytics                                                         4
Bigdata Developer                                                          3
Big Data Professional                                                      3
Big Data developer                                                         3
Big Data Software Engineer                                                 3

In [89]:
# Tally how often each skill appears across the flagged big-data rows.
skills_of_bd = bd_profiles[bd_profiles['bd_flag'] == True][u'skills']
bd_skill_words = Counter()
for skill_set in skills_of_bd:
    # A missing skills entry is NaN (a float) rather than a list; skip it.
    if isinstance(skill_set, float):
        continue
    # Update in place instead of building a fresh Counter on every row.
    bd_skill_words.update(skill_set)

In [90]:
# List the 20 most frequent big-data skills as (skill, count) pairs.
for skill_tuple in bd_skill_words.most_common(20):
    # Call form of print: same output on Python 2, and valid on Python 3.
    print(skill_tuple)

(u'hadoop', 331)
(u'java', 252)
(u'apache-hive', 163)
(u'big-data', 159)
(u'python', 154)
(u'apache-spark', 154)
(u'mongodb', 93)
(u'sql', 92)
(u'hbase', 87)
(u'mysql', 73)
(u'r', 67)
(u'javascript', 65)
(u'scala', 65)
(u'data-analysis', 64)
(u'pig', 63)
(u'machine-learning', 60)
(u'php', 56)
(u'cassandra', 55)
(u'mapreduce', 54)
(u'spring-framework', 54)


## Full Stack

In [91]:
# Load the full-stack profile file and show how often each title occurs.
fs_profiles = json_read('../data/skills_id/full_stack.txt')
fs_profiles['title'].value_counts()

Full Stack Web Developer                                              835
Full Stack Developer                                                  790
Web Developer                                                         525
Software Engineer                                                     285
Full stack web developer                                              198
Software Developer                                                    196
Full-Stack Web Developer                                              151
Full stack developer                                                  148
Web developer                                                         130
Full-Stack Developer                                                  117
Ruby on Rails Developer                                                89
Senior Software Engineer                                               87
Full-stack web developer                                               79
Senior Web Developer                  

In [92]:
def full_stack_in(str):
    """Return True when a title mentions both 'full' and 'stack'.

    Matching is a case-insensitive substring test, so 'Full-Stack' and
    'Fullstack' spellings match as well. Missing or non-string titles
    (NaN floats, None, numbers) yield False instead of raising.

    The parameter is named ``str`` (shadowing the builtin) only to keep the
    existing call signature unchanged.
    """
    # Duck-type the check: the builtin ``str`` is shadowed by the parameter,
    # and anything without ``.lower`` (NaN, None, ...) is not a usable title.
    lower = getattr(str, 'lower', None)
    if lower is None:
        return False
    title = lower()
    return 'full' in title and 'stack' in title

In [93]:
# Flag every profile whose title passes the full-stack predicate.
vect_fsi = np.vectorize(full_stack_in)
fs_profiles['fs_flag'] = vect_fsi(np.array(fs_profiles[u'title']))

# Work on the flagged subset once instead of re-filtering for each print.
fs_flagged = fs_profiles[fs_profiles['fs_flag'] == True]
print(fs_flagged[u'title'].value_counts())
# Positional .iloc[0] gives the first flagged profile; a plain [0] is a
# label lookup and raises KeyError when the row labelled 0 is not flagged.
print(fs_flagged[u'skills'].iloc[0])

Full Stack Web Developer                                                 835
Full Stack Developer                                                     790
Full stack web developer                                                 198
Full-Stack Web Developer                                                 151
Full stack developer                                                     148
Full-Stack Developer                                                     117
Full-stack web developer                                                  79
Full-stack Web Developer                                                  75
Full-stack developer                                                      72
Full Stack Software Engineer                                              68
Full Stack Software Developer                                             54
Full Stack Ruby on Rails Developer                                        46
Full Stack Engineer                                                       45

In [94]:
# Tally how often each skill appears across the flagged full-stack rows.
skills_of_fs = fs_profiles[fs_profiles['fs_flag'] == True][u'skills']
fs_skill_words = Counter()
for skill_set in skills_of_fs:
    # A missing skills entry is NaN (a float) rather than a list; skip it.
    if isinstance(skill_set, float):
        continue
    # Update in place instead of building a fresh Counter on every row.
    fs_skill_words.update(skill_set)

In [95]:
# List the 20 most frequent full-stack skills as (skill, count) pairs.
for skill_tuple in fs_skill_words.most_common(20):
    # Call form of print: same output on Python 2, and valid on Python 3.
    print(skill_tuple)

(u'javascript', 5124)
(u'php', 3824)
(u'html5', 3173)
(u'jquery', 2942)
(u'angularjs', 2760)
(u'css3', 2383)
(u'node.js', 2306)
(u'mysql', 2275)
(u'css', 1723)
(u'wordpress', 1670)
(u'html', 1465)
(u'ruby-on-rails', 1426)
(u'java', 1423)
(u'python', 1408)
(u'mongodb', 1222)
(u'twitter-bootstrap', 889)
(u'laravel-framework', 854)
(u'ruby', 788)
(u'sql', 776)
(u'c#', 753)
