In [339]:
import json
import pandas as pd
import numpy as np
from collections import Counter
import cPickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
import time
import datetime
import re 

Get freelancer profile by key

Endpoint
GET /api/profiles/v1/providers/{profile_key}.{format}

In [2]:
def json_prep(in_file):
    # read the entire file `ainto a python array
    with open(in_file, 'rb') as f:
        data = f.readlines()

    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    # each element of 'data' is an individual JSON object.
    # i want to convert it into an *array* of JSON objects
    # which, in and of itself, is one large JSON object
    # basically... add square brackets to the beginning
    # and end, and have all the individual business JSON objects
    # separated by a comma
    data_json_str = "[" + ','.join(data) + "]"

    data_list_of_dicts = json.loads(data_json_str)
    
    out_df = pd.read_json(data_json_str)
    return out_df

In [3]:
def triage(dict_, level=0):
    if not(isinstance(dict_, float) or isinstance(dict_, basestring)):
        print type(dict_)
        for key, val in dict_.iteritems():
            print ' '*level*2, key
            if isinstance(val, dict):
                triage(val, level=level+1)
            elif isinstance(val, list):
                for index, item in enumerate(val): 
                    print ' '*level*2, 'item', index
                    triage(item, level=level+1)
            elif isinstance(val, basestring):
                print ' '*(level+1)*2, '=', val

## Make pandas DataFrame out of detailed profiles data for exploration

In [4]:
as_profiles_df = json_prep('../../data/detailed_profiles_da_0.txt')

In [5]:
as_profiles_df.T.head(48)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39986,39987,39988,39989,39990,39991,39992,39993,39994,39995
ag_cny_recno,,2.45439e+06,,,,,,315411,,35711,...,,,,,,460900,,,,
ag_country,,United States,,,,,,India,,Russia,...,,,,,,India,,,,
ag_country_tz,,United States (UTC-05:00),,,,,,India (UTC+05:30),,Russia (UTC+03:00),...,,,,,,India (UTC+05:30),,,,
ag_description,,,,,,,,"esoftwaresolutions provides you Web Desiging, ...",,We are developing with Unity3D for years. \n\n...,...,,,,,,We have a team of excellent people who are exp...,,,,
ag_logo,,,,,,,,,,https://odesk-prod-portraits.s3.amazonaws.com/...,...,,,,,,https://odesk-prod-portraits.s3.amazonaws.com/...,,,,
ag_name,,Future Field Solutions,,,,,,esoftwaresolutions,,Vita-Mobile,...,,,,,,Technozee,,,,
ag_recent_hours,,0,,,,,,0,,4,...,,,,,,0,,,,
ag_total_hours,,0,,,,,,0,,8091.67,...,,,,,,0,,,,
agency_ciphertext,,~0121296613c357f4c1,,,,,,~01585171932b8b7e79,,~01e507047c43de815e,...,,,,,,~01c49d2c41510d4e29,,,,
assignments,"{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': {u'as_total_hours': u'24', u'...","{u'hr': {u'job': {u'as_total_hours': u'11', u'...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': [{u'as_to': u'02/2016', u'fee...",...,"{u'hr': u'', u'fp': u''}","{u'hr': {u'job': {u'as_total_hours': u'3.50', ...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': {u'job': [{u'as_to': u'01/...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': [{u'as_to': u'03/2014', u'as_...","{u'hr': u'', u'fp': u''}"


In [6]:
# print as_profiles_df.dev_job_categories_v2.value_counts()

In [7]:
# print as_profiles_df.dev_ac_agencies.value_counts()

In [8]:
# print as_profiles_df.dev_ui_profile_access.value_counts()

## Make analysis data set

Drop agency information. Not going to attempt to assign effect to agency due to low participation frequency in agencies in general and most agencies have few people, so will not have sufficient power.

In [9]:
as_profiles_df.drop(['ag_cny_recno', 'ag_country', 'ag_country_tz', \
                     'ag_description', 'ag_logo', 'ag_name', 'ag_recent_hours', \
                     'ag_total_hours', 'agency_ciphertext' \
                    ], axis=1, inplace=True)

Drop redundant variables.

In [10]:
as_profiles_df.drop(['dev_portrait_100', 'dev_portrait_32', 'dev_portrait_50', 'dev_recno' \
                    ], axis=1, inplace=True)

Drop obviously or inherently irrelevant variables.

In [11]:
as_profiles_df.drop(['dev_ui_profile_access', 'permalink'], \
                    axis=1, inplace=True)

Drop variables with data leak.

In [12]:
as_profiles_df.drop([ \
                     'dev_adj_score', 'dev_adj_score_recent', \
                     'dev_billed_assignments', 'dev_last_activity', 'dev_last_worked', \
                     'dev_last_worked_ts', 'dev_portfolio_items_count', \
                     'dev_tot_feedback', 'dev_total_hours' \
                    ], axis=1, inplace=True)

Drop variables that may be good to use, but for which I do not currently have the time to work up.

In [13]:
as_profiles_df.drop(['dev_city', 'dev_first_name', \
                     'dev_last_name',  'dev_short_name' \
                    ], axis=1, inplace=True)

Drop variables that seem useful, but do not appear to someone looking at the profile, so are therefore non-factors in evaluating the profile.

In [14]:
as_profiles_df.drop(['dev_job_categories_v2', 'job_categories', \
                    ], axis=1, inplace=True)

In [15]:
as_profiles_df.T.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39986,39987,39988,39989,39990,39991,39992,39993,39994,39995
assignments,"{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': {u'as_total_hours': u'24', u'...","{u'hr': {u'job': {u'as_total_hours': u'11', u'...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': [{u'as_to': u'02/2016', u'fee...",...,"{u'hr': u'', u'fp': u''}","{u'hr': {u'job': {u'as_total_hours': u'3.50', ...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': {u'job': [{u'as_to': u'01/...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': [{u'as_to': u'03/2014', u'as_...","{u'hr': u'', u'fp': u''}"
ciphertext,~01908982a5cfb82eaa,~01920da34c906bad41,~0122a64055c92a2c52,~01f8cfc8ea8a5914c6,~01e87b687c5284c667,~011aa28ecb946e0544,~015486f00acbf5d470,~014d1f20dc899e576f,~0156f699b56cfb03f2,~01042e67587727cfc6,...,~0127a61e2dec79dd9b,~018d0212bb33fde23d,~018e0519190864f991,~01e152bfb7d2af51f9,~01be5587fac9d9055c,~011ac73c6e3fefa6dc,~01bf08412ce5c1b091,~01f9b38e4a1ded6344,~01a9ca216aec8cc9bc,~01e5218c44b30efff4
dev_ac_agencies,,{u'dev_ac_agency': {u'ag_name': u'Future Field...,,,,,,{u'dev_ac_agency': {u'ag_name': u'esoftwaresol...,,{u'dev_ac_agency': {u'ag_name': u'Vita-Mobile'...,...,,,,,,"{u'dev_ac_agency': {u'ag_name': u'Technozee', ...",,,,
dev_bill_rate,10,75,10,16.67,14.44,157.5,88.89,3,15,19,...,8,35,30,25,8.33,10,12,10,15,111.11
dev_blurb,I'm a web developer.\n\nCurrently able to prog...,"I am a full stack developer, proficient in des...",Hi!\n\nMy name is Linh (Linh Nguyen Viet) and ...,•\tFast learning capability of any new cutting...,Over last 7 years I have been developing a wid...,I am an experienced technologist of Linux and ...,International Technical Lead of several Teams ...,I want to secure a job as a Data Entry Operato...,I am full stack web developer with 10 years of...,Hello dear. \n\nMy main focus is backend devel...,...,"Hi, I am Sharjeel and I am new to Upwork. My e...",I have a degree in Mathematics from New York U...,I am an enthusiastic and passionate jQuery / P...,Self-directed and motivated technical project ...,"working knowledge of asp.net C#, MS SQL sever...",We have a team of excellent people who are exp...,"Hi, My name is Alex Badmashkaev.\n\nMy approac...",Extensive experience in design and development...,Over the last 12 years I have developed a wide...,I help companies reach their full potential! ...
dev_country,Indonesia,United States,Vietnam,India,Moldova,Australia,Germany,India,Serbia,Russia,...,Pakistan,United States,United States,France,India,India,Russia,India,Hungary,United States
dev_eng_skill,4,5,,5,3,5,,5,4,4,...,5,5,5,5,5,5,4,4,4,5
dev_groups,,,,,,,,,,,...,,,,,,,,,,
dev_is_affiliated,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
dev_portrait,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...


`ciphertext` and `dev_recno_ciphertext`

In [16]:
equality_of_ciphertext = as_profiles_df['ciphertext'] == as_profiles_df['dev_recno_ciphertext']

In [17]:
if all(equality_of_ciphertext) == True:
    print 'All ciphertext == dev_recno_ciphertext'
    as_profiles_df.drop(['ciphertext'], axis=1, inplace=True)

All ciphertext == dev_recno_ciphertext


`dev_ac_agencies`: presence or absence

In [18]:
as_profiles_df['agency_affl'] = as_profiles_df['dev_ac_agencies'] <> ''
print as_profiles_df['agency_affl'].value_counts(dropna=False)

False    34765
True      5231
Name: agency_affl, dtype: int64


`dev_bill_rate`

In [19]:
# as_profiles_df.dev_bill_rate.describe()

`dev_blurb`

In [20]:
as_profiles_df['dev_blurb'][:5]

0    I'm a web developer.\n\nCurrently able to prog...
1    I am a full stack developer, proficient in des...
2    Hi!\n\nMy name is Linh (Linh Nguyen Viet) and ...
3    •\tFast learning capability of any new cutting...
4    Over last 7 years I have been developing a wid...
Name: dev_blurb, dtype: object

`dev_country`: leaving it alone for now, will study it in the analysis

In [21]:
def pct_freq(series):
    return 1. * series.value_counts() / len(series)
print pct_freq(as_profiles_df['dev_country'])

India                                   0.270627
United States                           0.130813
Pakistan                                0.060106
Russia                                  0.052155
Philippines                             0.049530
Ukraine                                 0.049180
Bangladesh                              0.042104
United Kingdom                          0.021402
Canada                                  0.019252
Brazil                                  0.013251
Egypt                                   0.012651
Romania                                 0.010851
Indonesia                               0.010026
Australia                               0.009951
Germany                                 0.009851
China                                   0.009126
Poland                                  0.008926
Sri Lanka                               0.008801
Belarus                                 0.008676
Serbia                                  0.008226
Kenya               

`dev_eng_skill`: leaving it alone for now, will study it in the analysis

In [22]:
as_profiles_df.dev_eng_skill.value_counts(dropna=False)

5    23319
4     8274
      5139
2     1833
3     1282
1      149
Name: dev_eng_skill, dtype: int64

`dev_groups`

In [23]:
as_profiles_df['group_affl'] = as_profiles_df['dev_groups'] <> ''
print as_profiles_df['group_affl'].value_counts(dropna=True)

False    39422
True       574
Name: group_affl, dtype: int64


In [24]:
pd.crosstab(as_profiles_df['agency_affl'], as_profiles_df['group_affl'], dropna=False)

Unnamed: 0_level_0,False,True
agency_affl,Unnamed: 1_level_1,Unnamed: 2_level_1
False,34453,312
True,4969,262


`dev_portrait`: presence or absence

In [25]:
as_profiles_df['has_portrait'] = as_profiles_df['dev_portrait'] <> ''
as_profiles_df['has_portrait'].value_counts(dropna=True)

True     34694
False     5302
Name: has_portrait, dtype: int64

`dev_profile_title`

In [26]:
as_profiles_df['dev_profile_title'][:5]

0                          PHP Web Developer
1              Full Stack Software Developer
2               Backend & Frontend developer
3                         Software Developer
4    C#, ASP.NET, PHP, JavaScript, MVC C/C++
Name: dev_profile_title, dtype: object

`dev_timezone`

In [27]:
as_profiles_df['dev_timezone'].value_counts(dropna=True)

UTC+05:30 Mumbai, Kolkata, Chennai, New Delhi                   8578
UTC-05:00 Eastern Time (US & Canada)                            2936
UTC+01:00 Berlin, Stockholm, Rome, Bern, Brussels               2847
UTC (Coordinated Universal Time)                                2489
UTC+02:00 Israel                                                2329
UTC+05:00 Islamabad, Karachi                                    2145
UTC-08:00 Pacific Time (US & Canada); Tijuana                   2097
UTC+06:00 Almaty, Dhaka                                         1763
UTC+08:00 Krasnoyarsk                                           1398
UTC-06:00 Central Time (US & Canada)                            1246
UTC+03:00 Baghdad, Kuwait, Nairobi, Riyadh                      1233
UTC (no DST) Tangiers, Casablanca                                816
UTC+04:00 Abu Dhabi, Muscat, Tbilisi, Kazan                      772
UTC+02:00 Eastern Europe                                         731
UTC+08:00 Hong Kong SAR, Perth, Si

## Multi variables

### `assignments`:

In [28]:
def grab_data(datalist, creature, id_, list_of_keys):        
    if isinstance(creature, basestring):
        pass
    elif isinstance(creature, dict):
        tuple_to_add = tuple([id_] + [creature.get(key, '') for key in list_of_keys])
        datalist.append(tuple_to_add)
    elif isinstance(creature, list):
        for creature_item in creature:
            tuple_to_add = tuple([id_] + [creature_item.get(key, '') for key in list_of_keys])
            datalist.append(tuple_to_add)

In [77]:
jobs_tuples = list()
key_list = ['as_opening_title', 'as_from_full', 'as_to_full', \
            'as_rate', 'as_total_hours_precise', 'as_total_charge', 'as_job_type']

for index, assignments_tuple in enumerate(as_profiles_df[['dev_recno_ciphertext', 'assignments']].itertuples(index=False)):
    profile_id = assignments_tuple[0]
    assignments = assignments_tuple[1]
    if 'hr' in assignments:
        hr_assignments = assignments['hr']
        if 'job' in hr_assignments:
            grab_data(jobs_tuples, hr_assignments['job'], profile_id, key_list)
        elif 'assignment' in hr_assignments:
            grab_data(jobs_tuples, hr_assignments['assignment'], profile_id, key_list)
    
    if 'fp' in assignments:
        fp_assignments = assignments['fp']
        if 'job' in fp_assignments:
            grab_data(jobs_tuples, fp_assignments['job'], profile_id, key_list)
        elif 'assignment' in fp_assignments:
            grab_data(jobs_tuples, fp_assignments['assignment'], profile_id, key_list)

In [170]:
jobs_df = pd.DataFrame(jobs_tuples)
jobs_df.columns = ['dev_recno_ciphertext'] + key_list

In [312]:
jobs_df.head(40)

Unnamed: 0,dev_recno_ciphertext,as_opening_title,as_from_full,as_to_full,as_rate,as_total_hours_precise,as_total_charge,as_job_type
0,~01e87b687c5284c667,Multiwonen upgrade website,,01/13/2014,,24.1666666666666,,Hourly
1,~011aa28ecb946e0544,System & security,,Present,,10.5,,Hourly
2,~01042e67587727cfc6,Interview for Scala AWS devops job,,02/29/2016,,4.0,,Hourly
3,~01042e67587727cfc6,System consultation,,10/20/2013,,10.1666666666667,,Hourly
4,~01042e67587727cfc6,Game Technical Specifications Developer,02/25/2008,03/02/2008,$16.67,21.0,350.07,Hourly
5,~01042e67587727cfc6,Software Development/Desktop Applications job,11/20/2006,01/16/2007,$12.10,24.1666666666667,292.42,Hourly
6,~01042e67587727cfc6,Software Development/Desktop Applications job,03/13/2006,11/19/2006,$13.00,670.666666666666,19783.17,Hourly
7,~01042e67587727cfc6,Game Framework Document Designer needed.,01/30/2012,12/16/2015,$0.00,0.0,1000.0,Fixed
8,~01042e67587727cfc6,IPad App Edits - Modifications for new iOS and...,,01/30/2015,,0.0,,Fixed
9,~01042e67587727cfc6,Darby Design,,03/06/2012,,0.0,,Fixed


In [311]:
jobs_df['dev_recno_ciphertext'].nunique()

10490

In [338]:
def blank_or_zero(str):
    if str == '':
        return True
    else:
        match = re.search(r'\d+.*\d*', str)
        number = float(match.group())
        if number == 0:
            return True
        else:
            return False

def get_number(str):
    return float(re.search(r'\d+.*\d*', str).group())

In [340]:
jobs_df.columns

Index([u'dev_recno_ciphertext', u'as_opening_title', u'as_from_full',
       u'as_to_full', u'as_rate', u'as_total_hours_precise',
       u'as_total_charge', u'as_job_type'],
      dtype='object')

In [346]:
hr_good_jobs = jobs_df[(jobs_df['as_job_type'] == 'Hourly') & ~(jobs_df['as_rate'].apply(blank_or_zero))]
hr_good_jobs.drop()
hr_good_jobs.drop([ \
                     'as_opening_title', 'as_job_type' \
                  ], axis=1, inplace=True)

Unnamed: 0,dev_recno_ciphertext,as_opening_title,as_from_full,as_to_full,as_rate,as_total_hours_precise,as_total_charge,as_job_type
4,~01042e67587727cfc6,Game Technical Specifications Developer,02/25/2008,03/02/2008,$16.67,21,350.07,Hourly
5,~01042e67587727cfc6,Software Development/Desktop Applications job,11/20/2006,01/16/2007,$12.10,24.1666666666667,292.42,Hourly
6,~01042e67587727cfc6,Software Development/Desktop Applications job,03/13/2006,11/19/2006,$13.00,670.666666666666,19783.17,Hourly
11,~01dae1a4d7abad2d0e,VPS Set-up for a website,,04/13/2016,$20.00,3.83333333333333,76.67,Hourly
13,~01dae1a4d7abad2d0e,Need a WordPress optimization wizard and Linux...,,03/30/2016,$20.00,5,100,Hourly
15,~01dae1a4d7abad2d0e,Server Not Sending Emails,,03/23/2016,$20.00,2,40,Hourly
20,~01dae1a4d7abad2d0e,502 Errors with plesk API,,08/27/2015,$18.00,3,54,Hourly
26,~01c90154cc25372281,AI PHP Programmer,,Present,$20.00,439.333333333333,88383.95,Hourly
33,~01c90154cc25372281,PHP developer,,08/11/2014,$20.00,28.1666666666665,563.33,Hourly
36,~01c90154cc25372281,PHP/MySQL Programmer,,03/20/2013,$16.67,3650.5,60853.71,Hourly


### `education`:

In [None]:
# EDA

In [120]:
# degrees_tuples = list()
# for index, education_tuple in enumerate(as_profiles_df[['dev_recno_ciphertext', 'education']].itertuples(index=False)):
#     profile_id = education_tuple[0]
#     schools = education_tuple[1]
#     if isinstance(schools, basestring):
#         degrees_tuples.append((profile_id, 'Other'))
#     else:
#         for key in schools.keys():
#             if key == u'institution':
#                 school_institution = schools['institution']
#                 grab_data(degrees_tuples, school_institution, profile_id, ['ed_degree'])
    

In [121]:
# degrees_df = pd.DataFrame(degrees_tuples)
# degrees_df.columns = ['ciphertext', 'ed_degree']
# # degrees_df.head()
# print degrees_df['ed_degree'].value_counts()

Other                                                 7037
                                                      6948
Bachelor of Engineering (B.Eng.)                      5232
Bachelors                                             3748
Master of Computer Applications (M.C.A.)              3127
Bachelor of Science (B.S.)                            2936
Bachelor's degree                                     2433
Masters                                               2109
Bachelor of Technology (B.Tech.)                      1703
Master of Science (M.S.)                              1484
Master's degree                                       1428
Bachelor of Applied Science (B.A.Sc.)                 1326
Engineer's degree                                     1201
High school degree                                     986
Diploma                                                973
High School                                            950
Master of Business Administration (M.B.A.)             9

In [122]:
def degree_categ(str):
    if 'master' in str.lower():
        return 'Master'
    elif 'bachelor' in str.lower():
        return 'Bachelor'
    elif 'doctor' in str.lower():
        return 'Doctor'
    elif 'b.' in str.lower():
        return 'Bachelor'
    elif 'm.' in str.lower():
        return 'Master'
    elif 'mba' in str.lower():
        return 'Master'
    elif 'msc' in str.lower():
        return 'Master'
    elif 'bsc' in str.lower():
        return 'Bachelor'
    elif 'bs' in str.lower():
        return 'Bachelor'
    elif 'ba ' in str.lower():
        return 'Bachelor'
    elif 'ms' in str.lower():
        return 'Master'
    elif 'ma' in str.lower():
        return 'Master'
    elif 'phd' in str.lower():
        return 'Doctor'
    elif 'ph.d' in str.lower():
        return 'Doctor'
    elif 'engineer' in str.lower():
        return 'Engineer'
    else:
        return 'Other'
vect_deg_cat = np.vectorize(degree_categ)
degrees_df['ed_degree_cat'] = vect_deg_cat(np.array(degrees_df['ed_degree']))

In [123]:
# degrees_df['ed_degree_cat'].value_counts()

Bachelor    20633
Other       16905
Master      11666
Engineer     1242
Doctor        717
Name: ed_degree_cat, dtype: int64

In [136]:
def degree_score(ed_degree):
    if degree_categ(ed_degree) == 'Bachelor':
        return 1
    elif degree_categ(ed_degree) == 'Engineer':
        return 2
    elif degree_categ(ed_degree) == 'Master':
        return 3
    elif degree_categ(ed_degree) == 'Doctor':
        return 4
    elif degree_categ(ed_degree) == 'Other':
        return 0

In [159]:
def grab_max_data(datalist, creature, id_, max_func, key_to_be_maxed, list_of_keys):        
    if isinstance(creature, basestring):
        tuple_to_add = tuple((id_, 'Other'))
    elif isinstance(creature, dict):
        tuple_to_add = tuple([id_] + [creature.get(key, '') for key in list_of_keys])
        datalist.append(tuple_to_add)
    elif isinstance(creature, list):
        curr_score = -1
        for creature_item in creature:
            score = max_func(creature_item.get(key_to_be_maxed, ''))
            if score > curr_score:
                tuple_to_beat = tuple([id_] + [creature_item.get(key, '') for key in list_of_keys])
                curr_score = score
        tuple_to_add = tuple_to_beat
        datalist.append(tuple_to_add)

In [160]:
degrees_tuples = list()
for index, education_tuple in \
  enumerate(as_profiles_df[['dev_recno_ciphertext', 'education']].itertuples(index=False)):
    profile_id = education_tuple[0]
    schools = education_tuple[1]
    if isinstance(schools, basestring):
        degrees_tuples.append((profile_id, 'Other'))
    else:
        for key in schools.keys():
            if key == u'institution':
                school_institution = schools['institution']
                grab_max_data(degrees_tuples, school_institution, profile_id,  \
                              degree_score, 'ed_degree', ['ed_degree'])


In [216]:
for index, tup in enumerate(degrees_tuples):
    print tup
    if index > 4: break


(u'~01908982a5cfb82eaa', u"Engineer's degree")
(u'~01920da34c906bad41', u'Bachelor of Science (B.S.)')
(u'~0122a64055c92a2c52', u'Master of Computer Applications (M.C.A.)')
(u'~01f8cfc8ea8a5914c6', u'Bachelor of Technology (B.Tech.)')
(u'~01e87b687c5284c667', u'Masters')
(u'~011aa28ecb946e0544', u'Bachelor of Science (B.S.)')


In [218]:
degrees_df = pd.DataFrame(degrees_tuples)
degrees_df.columns = ['dev_recno_ciphertext'] + ['highest_degree']

In [222]:
# print as_profiles_df[['dev_recno_ciphertext', 'education']].head(20)
# print degrees_df.head(20)

   dev_recno_ciphertext                                          education
0   ~01908982a5cfb82eaa  {u'institution': [{u'ed_to': u'12/2015', u'ed_...
1   ~01920da34c906bad41  {u'institution': {u'ed_to': u'12/2012', u'ed_a...
2   ~0122a64055c92a2c52  {u'institution': {u'ed_to': u'01/2009', u'ed_a...
3   ~01f8cfc8ea8a5914c6  {u'institution': {u'ed_to': u'12/2014', u'ed_a...
4   ~01e87b687c5284c667  {u'institution': {u'ed_to': u'01/2005', u'ed_a...
5   ~011aa28ecb946e0544  {u'institution': {u'ed_to': u'12/1997', u'ed_a...
6   ~015486f00acbf5d470  {u'institution': {u'ed_to': u'12/2004', u'ed_a...
7   ~014d1f20dc899e576f                                                   
8   ~0156f699b56cfb03f2  {u'institution': {u'ed_to': u'01/2004', u'ed_a...
9   ~01042e67587727cfc6  {u'institution': {u'ed_to': u'09/2004', u'ed_a...
10  ~019d009298bfe5fcf9  {u'institution': {u'ed_to': u'12/2006', u'ed_a...
11  ~015dff07c056daa14d  {u'institution': [{u'ed_to': u'12/2011', u'ed_...
12  ~010c785da058e8bb32  

In [223]:
# print as_profiles_df[['dev_recno_ciphertext', 'education']].tail(20)
# print degrees_df.tail(20)

      dev_recno_ciphertext                                          education
39976  ~01f521b5da31ef1e19  {u'institution': {u'ed_to': u'Present', u'ed_a...
39977  ~012474c494f265aa66  {u'institution': {u'ed_to': u'12/2012', u'ed_a...
39978  ~0120515bbf87381857                                                   
39979  ~0148ccaa5913cb9f91  {u'institution': [{u'ed_to': u'12/2015', u'ed_...
39980  ~0108c6f18693abe39d  {u'institution': {u'ed_to': u'12/2015', u'ed_a...
39981  ~017a6e71c528c7f2f2  {u'institution': {u'ed_to': u'12/2017', u'ed_a...
39982  ~0152d7d24626735644                                                   
39983  ~01293468dab085e9d3                                                   
39984  ~01c22e71edca30a1a6  {u'institution': {u'ed_to': u'01/2018', u'ed_a...
39985  ~015930f20f5f3ad71a  {u'institution': {u'ed_to': u'04/2000', u'ed_a...
39986  ~0127a61e2dec79dd9b  {u'institution': {u'ed_to': u'12/2015', u'ed_a...
39987  ~018d0212bb33fde23d  {u'institution': {u'ed_to': u'12/201

In [220]:
# degrees_df['highest_degree'].value_counts()

Other                                                 6380
                                                      4866
Bachelor of Engineering (B.Eng.)                      4407
Master of Computer Applications (M.C.A.)              3005
Bachelors                                             2491
Bachelor of Science (B.S.)                            2133
Masters                                               1919
Bachelor's degree                                     1857
Bachelor of Technology (B.Tech.)                      1519
Master's degree                                       1294
Master of Science (M.S.)                              1246
Engineer's degree                                     1034
Bachelor of Applied Science (B.A.Sc.)                  978
Master of Business Administration (M.B.A.)             846
Diploma                                                703
Bachelor's                                             661
Doctor of Philosophy (Ph.D.)                           5

Add `highest_degree` to analysis data set

In [None]:
as_profiles_df['highest_degree'] = degrees_df['highest_degree']

### `experiences` will be skipped because of uncertain usefulness and lack of time

### `portfolio_items`: presence or absence

In [165]:
as_profiles_df['has_portfolio'] = as_profiles_df['portfolio_items'] <> ''
print as_profiles_df['has_portfolio'].value_counts(dropna=True)

False    32325
True      7671
Name: has_portfolio, dtype: int64


### `skills`

In [264]:
skills_tuples = list()
key_list = ['skl_name']

for index, skills_tuple in enumerate(as_profiles_df[['dev_recno_ciphertext', 'skills']].itertuples(index=False)):
    profile_id = skills_tuple[0]
    skills = skills_tuple[1]
    skills_list = list()
    if 'skill' in skills:
        grab_data(skills_tuples, skills['skill'], profile_id, key_list)
        

In [265]:
skills_df = pd.DataFrame(skills_tuples)
skills_df.columns = ['dev_recno_ciphertext'] + key_list

In [297]:
def multiple_replace(dict, text):
    # Make one-letter skills longer
    if text == 'c':
        longtext = 'cprog'
    elif text == 'r':
        longtext = 'rprog'
    elif text == 's':
        longtext = 'sprog'
    else:
        longtext = text

    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], longtext) 

substitution_dict = {
    '+': 'plus',
    '#': 'sharp',
    '/': 'slash',
    '-': 'dash',
    '.': 'dot'
}

def contiguous_word(skill):
    return multiple_replace(substitution_dict, skill)

In [298]:
skills_content_list = list()
key_list = ['skills_string']
start_tuple_number = 0
end_tuple_number = len(skills_tuples) - 1

for index, skills_tuple in enumerate(skills_tuples):
    if index == start_tuple_number:
        curr_ciphertext = skills_tuple[0]
        skills_string = contiguous_word(skills_tuple[1])
    elif curr_ciphertext == skills_tuple[0]:
        skills_string += (' ' + contiguous_word(skills_tuple[1]) )  
    elif curr_ciphertext <> skills_tuple[0]:
        skills_content_list.append((curr_ciphertext, skills_string))
        curr_ciphertext = skills_tuple[0]
        skills_string = contiguous_word(skills_tuple[1])
        
    if index == end_tuple_number:
        skills_content_list.append((curr_ciphertext, skills_string))        


In [299]:
skills_content_df = pd.DataFrame(skills_content_list)
skills_content_df.columns = ['dev_recno_ciphertext'] + key_list

Add `skills_string` to analysis data set

In [300]:
as_profiles_df['skills_string'] = skills_content_df['skills_string']

In [301]:
as_profiles_df.T.head(25)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39986,39987,39988,39989,39990,39991,39992,39993,39994,39995
assignments,"{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': {u'as_total_hours': u'24', u'...","{u'hr': {u'job': {u'as_total_hours': u'11', u'...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': [{u'as_to': u'02/2016', u'fee...",...,"{u'hr': u'', u'fp': u''}","{u'hr': {u'job': {u'as_total_hours': u'3.50', ...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': {u'job': [{u'as_to': u'01/...","{u'hr': u'', u'fp': u''}","{u'hr': u'', u'fp': u''}","{u'hr': {u'job': [{u'as_to': u'03/2014', u'as_...","{u'hr': u'', u'fp': u''}"
dev_ac_agencies,,{u'dev_ac_agency': {u'ag_name': u'Future Field...,,,,,,{u'dev_ac_agency': {u'ag_name': u'esoftwaresol...,,{u'dev_ac_agency': {u'ag_name': u'Vita-Mobile'...,...,,,,,,"{u'dev_ac_agency': {u'ag_name': u'Technozee', ...",,,,
dev_bill_rate,10,75,10,16.67,14.44,157.5,88.89,3,15,19,...,8,35,30,25,8.33,10,12,10,15,111.11
dev_blurb,I'm a web developer.\n\nCurrently able to prog...,"I am a full stack developer, proficient in des...",Hi!\n\nMy name is Linh (Linh Nguyen Viet) and ...,•\tFast learning capability of any new cutting...,Over last 7 years I have been developing a wid...,I am an experienced technologist of Linux and ...,International Technical Lead of several Teams ...,I want to secure a job as a Data Entry Operato...,I am full stack web developer with 10 years of...,Hello dear. \n\nMy main focus is backend devel...,...,"Hi, I am Sharjeel and I am new to Upwork. My e...",I have a degree in Mathematics from New York U...,I am an enthusiastic and passionate jQuery / P...,Self-directed and motivated technical project ...,"working knowledge of asp.net C#, MS SQL sever...",We have a team of excellent people who are exp...,"Hi, My name is Alex Badmashkaev.\n\nMy approac...",Extensive experience in design and development...,Over the last 12 years I have developed a wide...,I help companies reach their full potential! ...
dev_country,Indonesia,United States,Vietnam,India,Moldova,Australia,Germany,India,Serbia,Russia,...,Pakistan,United States,United States,France,India,India,Russia,India,Hungary,United States
dev_eng_skill,4,5,,5,3,5,,5,4,4,...,5,5,5,5,5,5,4,4,4,5
dev_groups,,,,,,,,,,,...,,,,,,,,,,
dev_is_affiliated,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
dev_portrait,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...,https://odesk-prod-portraits.s3.amazonaws.com/...
dev_profile_title,PHP Web Developer,Full Stack Software Developer,Backend & Frontend developer,Software Developer,"C#, ASP.NET, PHP, JavaScript, MVC C/C++",Devops Engineer,Chief Technology Officer & Software Architect,WEB DEVELOPER AND DATA ENTRY SPECIALITY,Full Stack Web Developer,Backend / Unity3d software developer,...,Software Engineer,"Programmer, financial analyst, mathematician",I am a multi talented enthusiastic IT professi...,Full stack web developer,Software Developer,"Experienced in Web, Windows & Database Develop...",Developer. Web Technology Specialist.,web developer,Well grounded (Java and Python) developer and ...,Analyst


In [302]:
# as_profiles_df[['dev_recno_ciphertext', 'skills_string']]

In [303]:
skill_vectorizer = CountVectorizer() 
sv_sparse_matrix = skill_vectorizer.fit_transform(as_profiles_df['skills_string']) 

In [304]:
frequencies = sum(sv_sparse_matrix).toarray()[0] 
check_work = pd.DataFrame(frequencies, index=skill_vectorizer.get_feature_names(), columns=['frequency'])

In [305]:
check_work.head(510)

Unnamed: 0,frequency
1shoppingcart,7
2ddashanimation,17
2ddashdesign,37
3ddashanimation,42
3ddashdesign,64
3ddashmodeling,86
3ddashprinting,15
3ddashrendering,22
3ddashrigging,12
3ddashscanning,1


### `tsexams`: presence or absence

In [None]:
as_profiles_df['took_tests'] = as_profiles_df[''] <> ''
as_profiles_df['took_tests'].value_counts(dropna=True)

### Drop variables from which other variables have been derived.

In [None]:
as_profiles_df.drop([ \
                     'dev_ac_agencies', 'dev_groups', 'dev_is_affiliated' \
                    ], axis=1, inplace=True)

### Drop aggregate variables

as_profiles_df.drop([ \
                      'assignments', 'experiences', 'portfolio_items', 'skills', 'tsexams' \
                     ], axis=1, inplace=True)

# Make pickles of new DataFrames

In [None]:
outfile_jobs = '../../data/jobs_df.pkl'
outfile_skills = '../../data/skills_df.pkl'

f = open(outfile_jobs, 'wb')
pickle.dump(jobs_df, f) 
f.close()           

f = open(outfile_skills, 'wb')
pickle.dump(skills_df, f) 
f.close()           

In [None]:
# def is_fraud_in(str):
#     return True if 'fraud' in str else False
# vect_in_fraud = np.vectorize(is_fraud_in)
# df['is_fraud'] = vect_in_fraud(np.array(df['acct_type']))

In [None]:
# pd.crosstab(df['is_fraud'], df['acct_type'])

In [None]:
# def make_date(epoch_date):
#     return time.localtime(epoch_date)
# vect_ce = np.vectorize(make_date)

In [None]:
# sns.violinplot(df['is_fraud'], df['body_length'])

In [None]:
# df['channels_0'] = df['channels'] == 0
# df['channels_5'] = df['channels'] == 5
# df['channels_8'] = df['channels'] == 8
# df['channels_11'] = df['channels'] == 11
# df['channels_other'] = ~(df['channels_0'] | df['channels_5'] | df['channels_8'] | df['channels_11'])