In [1]:
import csv
from collections import defaultdict
import numpy as np
import statsmodels.api as sm
from scipy.stats import linregress
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

  from pandas.core import datetools


In [2]:
def import_csv(file_name,delimiter=","):
    with open(file_name,"r") as f:
        return list(csv.reader(f,delimiter=delimiter))
original_work_context=import_csv("OriginalONetData/WorkContext.TXT",delimiter="\t")
current_work_context=import_csv("CurrentONetData/Work Context.csv")
original_jobs_list = set(a[0] for a in original_work_context)
new_jobs_list = set(a[0] for a in current_work_context)
shared_jobs_list = original_jobs_list.intersection(new_jobs_list)

v1 = [a for a in original_work_context if a[0] in shared_jobs_list][1:]
v2 = [a for a in current_work_context if a[0] in shared_jobs_list and a[4]=="CX"]

original_contexts=set([v[1] for v in v1])
new_contexts=set([v[2] for v in v2])
shared=original_contexts.intersection(new_contexts) # Only unchanged job identifierstt

joint_v1 = [v for v in v1 if v[1] in shared] # Only pick the job contexts that were present
joint_v2 = [v for v in v2 if v[2] in shared] # in both the latest file and the earliest.

sliced_v1=[[v[0],v[1],v[2],v[4]] for v in joint_v1] # Extracts only the relevant information
sliced_v2=[[v[0],v[2],v[3],v[7],v[14]] for v in joint_v2] # Adds "year last updated" to the above value.

doa_figure1 = [v for v in sliced_v1 if v[1]=="4.C.3.b.2"] # 4.C.3.b.2 is Level of Automation
doa_figure2 = [v for v in sliced_v2 if v[1]=="4.C.3.b.2"]

recently_updated_jobs_codes=[n[0] for n in doa_figure2 if n[4][5]=="1"] # Selects for 2010 or later.

In [3]:
# In Which I Import Education measurements.
original_education=import_csv("2003ONetData/db_50/EducTrainExp.txt", delimiter="\t")
rough_jobs_education = defaultdict(list)
for job in sorted(list(shared_jobs_list))[:-1]:
    for point in original_education:
        if point[0]==job:
            rough_jobs_education[job].append(point[3:])
clean_job_education=defaultdict(list)
for key,val in rough_jobs_education.items():
    # RL: Required Level of Education
    # RW: Related Work Experience
    # PT: On-Site or In-Plant Training
    # OJ: On-the-Job Training
    t=dict()
    def summer(v):
        return float(v[1])*float(v[2])/100
    t['RL'] = sum(summer(v) for v in val[:11]) 
    t['RW'] = sum(summer(v) for v in val[12:23]) 
    t['PT'] = sum(summer(v) for v in val[23:32]) 
    t['OJ'] = sum(summer(v) for v in val[32:41]) 
    clean_job_education[key]=t

In [4]:
# Three files were actually formatted the same way.
def get_vals(inputlist,vector):
    x=list()
    for i in inputlist:
        x.append(vector[i])
    return x
def get_conversion(name):
    o=import_csv(name, delimiter="\t")
    new_one = [o[n]+o[n+1] for n in range(1,len(o),2)]
    return [get_vals([0,2,4,11],v) for v in new_one]
 # O*NET number, skill title, importance, level.
def dictify(input_vector,target_jobs=shared_jobs_list):
    base_dict=defaultdict(list)
    for job in sorted(list(target_jobs))[:-1]:
        for point in input_vector:
            if point[0]==job:
                base_dict[job].append(point[1:])
    return base_dict
def convert(x):
    newDict={}
    for job in x.keys():
        tempDict={}
        for entry in x[job]:
            tempDict[entry[0]]=[float(entry[1]),float(entry[2])]
        newDict[job]=tempDict
    return newDict

In [5]:
skills_preliminary=get_conversion("2003ONetData/db_50/Skills.txt")
skills_dict=dictify(skills_preliminary)
Skills_final=convert(skills_dict)

work_activity_preliminary=get_conversion("2003ONetData/db_50/WorkActivity.txt")
work_activity_dict=dictify(work_activity_preliminary)
work_activity_final=convert(work_activity_dict)

knowledge_preliminary = get_conversion("2003ONetData/db_50/Knowledge.txt")
knowledge_dict=dictify(knowledge_preliminary)
knowledge_final=convert(knowledge_dict)

val_base=import_csv("2003ONetData/db_50/WorkValue.txt", delimiter="\t")
val_dict=dictify(val_base)
value_final={a[0]:[get_vals([0,1,3],li) for li in a[1]] for a in val_dict.items()}

In [6]:
market_2015=import_csv("./industry_data/all_2015.csv")
market_2015[2]
def market_to_employment(market_list):
    final=[m[6:] for m in market_list if m[0]=="99" and m[3]=="000000"]
    return final
employment=market_to_employment(market_2015)
cut_shared_jobs_list = [c[:-3] for c in shared_jobs_list if c[-2:]=="00"]
filtered_employment=[m for m in employment if m[0] in cut_shared_jobs_list]
filtered_employment[21]
descriptor=list(enumerate(market_2015[0][6:]))
h_pay_dict={a[0]:a[8] for a in filtered_employment if len(a[8])>2} # Len check ensures numbers not stars.
h_pay_dict

{'11-2011': '54.62',
 '11-2021': '67.63',
 '11-2022': '62.69',
 '11-3011': '45.60',
 '11-3021': '67.79',
 '11-3051': '49.87',
 '11-3061': '54.87',
 '11-9021': '46.88',
 '11-9031': '25.37',
 '11-9033': '49.33',
 '11-9041': '68.10',
 '11-9051': '25.79',
 '11-9061': '40.61',
 '11-9071': '37.39',
 '11-9081': '27.79',
 '11-9111': '50.99',
 '11-9121': '65.66',
 '11-9131': '33.92',
 '11-9141': '32.81',
 '11-9151': '33.38',
 '13-1011': '46.06',
 '13-1021': '29.94',
 '13-1022': '28.50',
 '13-1023': '31.79',
 '13-1032': '31.39',
 '13-1051': '31.16',
 '13-1111': '44.12',
 '13-1121': '24.62',
 '13-2031': '36.13',
 '13-2041': '38.33',
 '13-2051': '45.83',
 '13-2052': '56.76',
 '13-2053': '34.93',
 '13-2061': '42.46',
 '13-2071': '23.70',
 '13-2072': '36.14',
 '13-2081': '27.54',
 '13-2082': '21.50',
 '15-2011': '53.15',
 '15-2021': '54.11',
 '15-2031': '40.47',
 '15-2041': '40.60',
 '15-2091': '25.79',
 '17-1011': '39.83',
 '17-1012': '32.98',
 '17-1021': '31.45',
 '17-1022': '29.75',
 '17-2011': '

In [7]:
def extract_by_key(input_dict_of_dicts,key):
    return {k:input_dict_of_dicts[k][key] for k in input_dict_of_dicts.keys()}
ce=extract_by_key(knowledge_final,"Computers and Electronics")
condensed_ce={k:ce[k][0]*ce[k][1]/25 for k in ce.keys()}
wa=extract_by_key(work_activity_final,"Interacting With Computers")
condensed_iwc={k:wa[k][0]*wa[k][1]/25 for k in wa.keys()}
# Follow up note: Those jobs that end in 9 on the O*net database are always "those not listed separately."
    # Due to methodology, may have shifted more than average?
        # (Matching only the matching, but 9s are defined by the absences, so maybe just never count nines? But need data badly.)
# Consider seeing if there's a systematic difference between the 9s and non-9s?
# Also investigate: 
    # 4.C.2.d.1.i	Spend Time Making Repetitive Motions	How much does this job require making repetitive motions?

In [8]:
final_automation=[[v[0],float(v[3])] for v in doa_figure2]
fa_dict={v[0]:v[1] for v in final_automation}
automation_diff=[[doa_figure1[n][0],float(doa_figure2[n][3])-float(doa_figure1[n][3]),doa_figure2[n][4]] for n in range(len(doa_figure1))]
diff_dict={v[0]:v[1] for v in automation_diff}
def time_difference(initial_date: str,end_date="01/2002"): # Probably returns an int, but might be a float. Those have no common parent, for reasons I know not.
    return int(initial_date[-2:])-int(end_date[-2:])# I don't actually trust any precision closer than that.
diff_time_dict={v[0]:time_difference(v[2]) for v in automation_diff}

In [13]:
assert fa_dict.keys()==condensed_ce.keys()
extraction=sorted(h_pay_dict)
def make_vector(input_dict,keys=extraction):
    return [input_dict[k] for k in keys]
def lin_of_dicts(x_dicts,y_dict):
    return linregress(make_vector(x_dicts),make_vector(y_dict))
#baseline_iwc=make_vector(condensed_iwc)
#baseline_fa=make_vector(fa_dict)
#baseline_ce=make_vector(condensed_ce)
#baseline_diff=make_vector(diff_dict)
#baseline_iwc = [condensed_iwc[e] for e in extraction]
#baseline_fa = [fa_dict[e] for e in extraction]
#linregress(baseline_diff,baseline_fa)
fa_dict_short = {k[:-3]:fa_dict[k] for k in fa_dict.keys()}
def extractor(dict1,dict2):
    inter=[k for k in dict2.keys() if k in set(dict1.keys())]
    return linregress([float(dict1[k]) for k in inter],[float(dict2[k]) for k in inter])
extractor(condensed_iwc,fa_dict)
#sum(baseline_diff)/len(baseline_diff)
#lin_of_dicts(diff_time_dict,diff_dict)
#plt.plot(make_vector(diff_dict),make_vector(fa_dict),'ro')
#plt.plot(make_vector(fa_dict),"ro")
#plt.plot()
#sorted(make_vector(fa_dict))[::-1]
# Change in automation equals change in time multiplied by the sum of b and x, plus the standard epsilon.
extractor(h_pay_dict,fa_dict_short)

#dict1=h_pay_dict
#dict2=fa_dict
#inter=[k[:-3] for k in dict2.keys() if k[:-3] in set(dict1.keys())]

LinregressResult(slope=-0.00054000710224220484, intercept=2.1715399606343726, rvalue=-0.016579458853322043, pvalue=0.71401806226392106, stderr=0.0014727023968364296)

In [None]:
def reg_m(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results
reg_m(baseline_diff,[baseline_iwc,baseline_ce]).summary()

In [None]:
# Note: Investigate "Operation Monitoring" as a potential thing. 