# Let's bring in all the scraped data

In [14]:
%pprint

Pretty printing has been turned OFF


In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Read every scraped job-posting CSV in collected_data/ and stack them into a
# single corpus DataFrame (corpusDF), adding a combined free-text column.

data_dir = "collected_data"

# os.listdir() returns names in arbitrary order; sort case-insensitively so the
# row order of the corpus is the same on every run and every platform.
csv_names = sorted(
    (name for name in os.listdir(data_dir) if name.endswith(".csv")),
    key=str.lower,
)

# Collect the per-file frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previously loaded rows on every iteration.
frames = []

for f in csv_names:

    print("Importing... ", f)

    filepath = os.path.join(data_dir, f)

    df = pd.read_csv(filepath, encoding = "ISO-8859-1")

    # One pipe-delimited text blob per posting; the source file name is embedded
    # so each row records which CSV it came from.
    # NOTE(review): string concatenation propagates NaN, so a posting missing any
    # one of these fields ends up with AllDescription = NaN — confirm this is
    # intended before feeding the column to a vectorizer.
    df["AllDescription"] = df["PositionTitle"] + " | " +  \
                            df["URI"] + " | " + df["DepartmentName"]  + " | " + \
                            df["OrganizationName"] + " | " + f  + " | " + \
                            df["JobSummary"] + " | " + \
                            df["QualificationSummary"]

    frames.append(df)

    print("Success")
    print("================================================================")

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files (same result the original loop produced).
corpusDF = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

Importing...  aerospace_N_113_20171224133539.csv
Success
Importing...  analysis_N_500_20171223141842.csv
Success
Importing...  analyst_N_239_20171223115544.csv
Success
Importing...  attorney_N_180_20171223141257.csv
Success
Importing...  aviation_N_355_20171224133743.csv
Success
Importing...  businessintelligence_N_20_20171223115447.csv
Success
Importing...  combat_N_400_20171224133814.csv
Success
Importing...  computational_N_17_20171223115812.csv
Success
Importing...  covert_N_4_20171223141634.csv
Success
Importing...  cyber_N_76_20171223141650.csv
Success
Importing...  dataanalyst_N_96_20171223115356.csv
Success
Importing...  dataanalytics_N_19_20171223115427.csv
Success
Importing...  database_N_500_20171223120145.csv
Success
Importing...  dataengineer_N_101_20171223115828.csv
Success
Importing...  datamining_N_9_20171223120312.csv
Success
Importing...  datascience_N_346_20171223115408.csv
Success
Importing...  datascientist_N_28_20171223115414.csv
Success
Importing...  datavisualiz

Success
Importing...  soldier_N_162_20171224133432.csv
Success
Importing...  spatial_N_14_20171223115245.csv
Success
Importing...  SPSS_N_3_20171223120435.csv
Success
Importing...  sql_N_21_20171223120239.csv
Success
Importing...  statistical_N_277_20171223120834.csv
Success
Importing...  statistician_N_8_20171223115854.csv
Success
Importing...  statistics_N_329_20171223115916.csv
Success
Importing...  strategic_N_339_20171224133904.csv
Success
Importing...  surgeon_N_58_20171224133621.csv
Success
Importing...  systems_N_500_20171223120219.csv
Success
Importing...  tableau_N_2_20171223120948.csv
Success
Importing...  tactical_N_142_20171224133851.csv
Success
Importing...  technical_N_500_20171223115709.csv
Success
Importing...  technician_N_500_20171223141315.csv
Success
Importing...  technology_N_500_20171223120810.csv
Success
Importing...  transportation_N_500_20171224133801.csv
Success
Importing...  weaponry_N_17_20171224133630.csv
Success


In [18]:
# Display the first three rows of the combined corpus as a quick check of the import.
corpusDF.head(3)

Unnamed: 0,ApplicationCloseDate,DepartmentName,JobGrade,JobSummary,MaxPay,MinPay,OfferingType,OrganizationName,PayType,PositionEndDate,PositionID,PositionLocation,PositionStartDate,PositionTitle,QualificationSummary,URI,AllDescription
0,2018-01-08,National Aeronautics and Space Administration,GS,The Applied Engineering and Technology Directo...,86460.0,66510.0,Permanent,Goddard Space Flight Center,Per Year,2018-01-08,GS18D0058,"Greenbelt, Maryland",2017-12-20,"Aerospace Engineer, AST, Aerospace Vehicle Des...",In addition to the Basic Education Requirement...,https://www.usajobs.gov:443/GetJob/ViewDetails...,"Aerospace Engineer, AST, Aerospace Vehicle Des..."
1,2017-12-28,Department of the Navy,GS,The selectee for this position will serve as a...,72901.0,45970.0,Permanent,"U.S. Atlantic Fleet, Commander in Chief",Per Year,2017-12-28,DE-10078062-18-HB,"Point Mugu, California",2017-12-18,AEROSPACE ENGINEER,"In order to qualify for this position, your re...",https://www.usajobs.gov:443/GetJob/ViewDetails...,AEROSPACE ENGINEER | https://www.usajobs.gov:4...
2,2017-12-30,Department of the Air Force,GS,The mission of the United States Air Force is ...,86460.0,45970.0,Permanent,"Air Force Elements, U.S. Strategic Command",Per Year,2017-12-30,9L-10088723-064321,"Dahlgren, Virginia",2017-12-01,Aerospace Engineer,GS-0861-07 Aerospace Engineer:\r\nA Bachelor&#...,https://www.usajobs.gov:443/GetJob/ViewDetails...,Aerospace Engineer | https://www.usajobs.gov:4...


## Top 10 departments with the highest-paying jobs on average

In [19]:
# Show the distinct values that occur in the PayType column.
set(corpusDF["PayType"])

{'Without Compensation', 'Fee Basis', 'Per Year', 'Per Hour', 'Student Stipend Paid', 'Per Day', 'Bi-weekly'}

In [20]:
# Print the number of rows in the combined corpus.
# NOTE(review): the input files look like per-keyword result sets, so the same
# posting may appear in more than one file — confirm whether rows should be
# de-duplicated (e.g. on PositionID) before counting or averaging.
print("Total Jobs: ", len(corpusDF))

Total Jobs:  18426


For ease of analysis, I'm only going to look at "Per Year" jobs, on the assumption that these salaried positions typically pay more than the hourly positions.

In [21]:
# Keep only the postings whose PayType is "Per Year".
# .copy() makes `salaried` an independent frame rather than a slice of corpusDF,
# so adding columns to it later does not trigger SettingWithCopyWarning.
salaried = corpusDF[corpusDF["PayType"] == "Per Year"].copy()
print("Total Salaried Jobs: ", len(salaried))

Total Salaried Jobs:  14447


Some positions also list multiple locations. I'm going to filter those out; they can be identified by a pipe (`|`) delimiter in the "PositionLocation" column.

In [22]:
# Flag postings whose PositionLocation contains a pipe, i.e. more than one
# location, then keep only the single-location postings.
# regex=False matches a literal "|" (in a regex it would mean alternation);
# na=False treats a missing location as "not multiple" instead of raising, which
# the previous `"|" in row` lambda did on NaN values.
has_multiple = salaried["PositionLocation"].str.contains("|", regex = False, na = False)

# .assign returns a new frame, so no column is written onto a slice of corpusDF.
salaried = salaried.assign(hasMultipleLocations = has_multiple)

# .copy() so that columns can be added to salaried_location safely afterwards.
salaried_location = salaried[~salaried["hasMultipleLocations"]].copy()

In [23]:
# MedPay is the midpoint of each posting's pay range: the row-wise mean of MinPay
# and MaxPay (not a median across postings). Because mean() skips NaN by default,
# a row with only one of the two values gets that value as its MedPay.
# .assign builds a new frame instead of writing a column onto a filtered slice.
salaried_location = salaried_location.assign(
    MedPay = salaried_location[["MinPay", "MaxPay"]].mean(axis = 1)
)
salaried_location.head(3)

Unnamed: 0,ApplicationCloseDate,DepartmentName,JobGrade,JobSummary,MaxPay,MinPay,OfferingType,OrganizationName,PayType,PositionEndDate,PositionID,PositionLocation,PositionStartDate,PositionTitle,QualificationSummary,URI,AllDescription,PositionLocation2,MedPay
0,2018-01-08,National Aeronautics and Space Administration,GS,The Applied Engineering and Technology Directo...,86460.0,66510.0,Permanent,Goddard Space Flight Center,Per Year,2018-01-08,GS18D0058,"Greenbelt, Maryland",2017-12-20,"Aerospace Engineer, AST, Aerospace Vehicle Des...",In addition to the Basic Education Requirement...,https://www.usajobs.gov:443/GetJob/ViewDetails...,"Aerospace Engineer, AST, Aerospace Vehicle Des...",False,76485.0
1,2017-12-28,Department of the Navy,GS,The selectee for this position will serve as a...,72901.0,45970.0,Permanent,"U.S. Atlantic Fleet, Commander in Chief",Per Year,2017-12-28,DE-10078062-18-HB,"Point Mugu, California",2017-12-18,AEROSPACE ENGINEER,"In order to qualify for this position, your re...",https://www.usajobs.gov:443/GetJob/ViewDetails...,AEROSPACE ENGINEER | https://www.usajobs.gov:4...,False,59435.5
2,2017-12-30,Department of the Air Force,GS,The mission of the United States Air Force is ...,86460.0,45970.0,Permanent,"Air Force Elements, U.S. Strategic Command",Per Year,2017-12-30,9L-10088723-064321,"Dahlgren, Virginia",2017-12-01,Aerospace Engineer,GS-0861-07 Aerospace Engineer:\r\nA Bachelor&#...,https://www.usajobs.gov:443/GetJob/ViewDetails...,Aerospace Engineer | https://www.usajobs.gov:4...,False,66215.0


In [25]:
# Mean MinPay per department, ordered from highest to lowest, then rounded to
# the nearest whole number and displayed as a one-column table (top 10 shown).
min_pay_by_dept = salaried_location.groupby(["DepartmentName"])["MinPay"].mean()
AvgMinPay = (
    min_pay_by_dept
    .sort_values(ascending = False)
    .round()
    .to_frame()
)
AvgMinPay.head(10)

Unnamed: 0_level_0,MinPay
DepartmentName,Unnamed: 1_level_1
Department of Labor,114092.0
Department of Housing And Urban Development,105131.0
Department of Energy,102864.0
Department of the Treasury,100495.0
National Foundation on the Arts and the Humanities,100137.0
National Aeronautics and Space Administration,99635.0
Non-Federal Civilian Customers,94796.0
Department of Transportation,87835.0
Other Agencies and Independent Organizations,82252.0
Judicial Branch,81919.0


In [29]:
# Mean MedPay per department, ordered from highest to lowest, then rounded to
# the nearest whole number and displayed as a one-column table (top 10 shown).
med_pay_by_dept = salaried_location.groupby(["DepartmentName"])["MedPay"].mean()
AvgMedPay = (
    med_pay_by_dept
    .sort_values(ascending = False)
    .round()
    .to_frame()
)
AvgMedPay.head(10)

Unnamed: 0_level_0,MedPay
DepartmentName,Unnamed: 1_level_1
Department of Labor,143023.0
Department of the Treasury,126800.0
Department of Energy,124816.0
Department of Housing And Urban Development,120901.0
Executive Office of the President,120810.0
Non-Federal Civilian Customers,120212.0
National Foundation on the Arts and the Humanities,118098.0
National Aeronautics and Space Administration,117133.0
Department of Transportation,113153.0
Department of Health And Human Services,108637.0


In [31]:
# Mean MaxPay per department, ordered from highest to lowest, then rounded to
# the nearest whole number and displayed as a one-column table (top 10 shown).
max_pay_by_dept = salaried_location.groupby(["DepartmentName"])["MaxPay"].mean()
AvgMaxPay = (
    max_pay_by_dept
    .sort_values(ascending = False)
    .round()
    .to_frame()
)
AvgMaxPay.head(10)

Unnamed: 0_level_0,MaxPay
DepartmentName,Unnamed: 1_level_1
Department of Labor,171953.0
Executive Office of the President,161900.0
Department of the Treasury,153104.0
Department of Energy,146768.0
Non-Federal Civilian Customers,145629.0
Department of Health And Human Services,140974.0
Department of Transportation,138470.0
Department of Housing And Urban Development,136671.0
National Foundation on the Arts and the Humanities,136059.0
National Aeronautics and Space Administration,134632.0


It looks like the Dept of Labor, the Dept of the Treasury, and the Dept of Energy are consistently in the top 5 for average minimum, midpoint, and maximum pay.