In [20]:
import csv
from collections import defaultdict as ddict

wd = "data/job_offers/" # The directory that the data files are in

# Load every job posting from jobs.tsv into two structures:
#   data     -- list with one dict per posting, holding all eleven columns
#               exactly as read (strings)
#   job_info -- maps Jobid (str) to [WindowId (int), State, City, count]
print("Recording job locations...")
job_info = {}
data = list()
with open(wd + "jobs.tsv", "r") as infile:
    # QUOTE_NONE takes every field verbatim, so quote characters inside the
    # text are ordinary data. quotechar must be None rather than "": the csv
    # module rejects an empty quotechar on Python 3.11 and later.
    reader = csv.reader(infile, delimiter="\t",
                        quoting=csv.QUOTE_NONE, quotechar=None)
    next(reader) # burn the header
    for line in reader:
        # The unpack doubles as a check that the row has exactly 11 columns.
        (Jobid, WindowId, Title, Description, Requirements, City, State,
         Country, Zip5, StartDate, EndDate) = line
        data.append({
            "Jobid": Jobid,
            "WindowId": WindowId,
            "Title": Title,
            "Description": Description,
            "Requirements": Requirements,
            "City": City,
            "State": State,
            "Country": Country,
            "Zip5": Zip5,
            "StartDate": StartDate,
            "EndDate": EndDate,
        })
        # The terminal zero is for an application count
        job_info[str(Jobid)] = [int(WindowId), State, City, 0]

# Tally applications: each data row of apps.tsv adds one to the counter kept
# in the last slot of the matching job_info entry.
print("Counting applications...")
with open(wd + "apps.tsv") as apps_file:
    app_rows = csv.reader(apps_file, delimiter="\t")
    next(app_rows)  # skip the header row
    # Unpacking in the loop header still requires exactly five columns per row.
    for (UserId, WindowID, Split, ApplicationDate, JobId) in app_rows:
        job_info[JobId][3] += 1

# Group the jobs two ways and order every group from most to least applied-to:
#   top_city_jobs[window][state][city] -> list of (job_id, count)
#   top_state_jobs[window][state]      -> list of (job_id, count)
print ("Sorting jobs on based on popularity...")
top_city_jobs = ddict(lambda: ddict(lambda: ddict(list)))
top_state_jobs = ddict(lambda: ddict(list))
for (job_id, (window, State, City, count)) in job_info.items():
    top_city_jobs[window][State][City].append((job_id, count))
    top_state_jobs[window][State].append((job_id, count))

# Walk the windows that actually occur in the data instead of a fixed list of
# window numbers: no window is left unsorted, and no empty entries are created
# for windows that have no jobs.
for cities_by_state in top_city_jobs.values():
    for jobs_by_city in cities_by_state.values():
        for city_jobs in jobs_by_city.values():
            # Ascending sort followed by reverse() (rather than reverse=True)
            # keeps the existing ordering among jobs with equal counts.
            city_jobs.sort(key=lambda job: job[1])
            city_jobs.reverse()
for jobs_by_state in top_state_jobs.values():
    for state_jobs in jobs_by_state.values():
        state_jobs.sort(key=lambda job: job[1])
        state_jobs.reverse()

# Write one line of recommendations for every non-training user: the most
# applied-to jobs in the user's city, topped up with the most applied-to jobs
# in the user's state when the city alone does not fill the list.
print( "Making predictions...")
MAX_JOBS_PER_USER = 150  # number of job ids written per user
with open(wd + "users.tsv", "r") as infile:
    # quotechar must be None rather than "": the csv module rejects an empty
    # quotechar on Python 3.11 and later.
    reader = csv.reader(infile, delimiter="\t",
                        quoting=csv.QUOTE_NONE, quotechar=None)
    next(reader)  # burn the header
    with open(wd + "popular_jobs.csv", "w") as outfile:
        outfile.write("UserId, JobIds\n")
        for line in reader:
            # Only a few columns are used; the full unpack also checks that
            # the row has exactly 15 columns.
            (UserId, WindowId, Split, City, State, Country, ZipCode,
             DegreeType, Major, GraduationDate, WorkHistoryCount,
             TotalYearsExperience, CurrentlyEmployed, ManagedOthers,
             ManagedHowMany) = line
            if Split == "Train":
                continue
            user_window = int(WindowId)
            # Work on a copy: extending the list stored in top_city_jobs would
            # change the results for every later user from the same city.
            top_jobs = list(top_city_jobs[user_window][State][City])
            if len(top_jobs) < MAX_JOBS_PER_USER:
                # The state list also contains this city's jobs, so skip ids
                # that are already recommended.
                seen_ids = {job[0] for job in top_jobs}
                for job in top_state_jobs[user_window][State]:
                    if job[0] in seen_ids:
                        continue
                    seen_ids.add(job[0])
                    top_jobs.append(job)
                    if len(top_jobs) == MAX_JOBS_PER_USER:
                        break
            top_jobs = top_jobs[:MAX_JOBS_PER_USER]
            outfile.write(str(UserId) + "," + " ".join(job[0] for job in top_jobs) + "\n")

Recording job locations...
Counting applications...
Sorting jobs on based on popularity...
Making predictions...


In [24]:
# Spot-check the load: show the Title field of the first record parsed from jobs.tsv.
data[0]["Title"]

'Security Engineer/Technical Lead'

In [25]:
import pandas as pd

In [26]:
# Build a DataFrame from the list of per-job dicts; the dict keys become the columns.
df = pd.DataFrame(data)

In [29]:
# Preview the first 20 rows of the jobs table.
df.head(20)


Unnamed: 0,Jobid,WindowId,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,1,1,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
1,4,1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
2,7,1,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59
5,10,1,SALON/SPA COORDINATOR,<b> <b>— SALON/SPA COORDINATOR ...,Please refer to the Job Description to view th...,Ormond Beach,FL,US,32174.0,2012-03-05 14:21:50.203,2012-04-04 23:59:59
6,11,1,SUPERINTENDENT,<b> <b>SUPERINTENDENT</b> </b> —— Central...,Please refer to the Job Description to view th...,Orlando,FL,US,32801.0,2012-03-06 09:21:54.58,2012-04-05 23:59:59
7,12,1,ELECTRONIC PRE-PRESS PROFESSIONAL,<b> <b>ELECTRONIC PRE-PRESS PROFESSIONAL</b...,Please refer to the Job Description to view th...,Orlando,FL,US,32808.0,2012-03-06 11:21:53.63,2012-04-05 23:59:59
8,13,1,UTILITY LINE TRUCK OPERATOR/ DIGGER DERRICK,<b> </b> \r\n <b> <b>UTILITY LINE TRUCK...,Please refer to the Job Description to view th...,Orlando,FL,US,32801.0,2012-03-06 16:06:53.677,2012-04-05 23:59:59
9,14,1,CONSTRUCTION PROJECT MGR & PM TRAINEE,<b> <b>CONSTRUCTION PROJECT MGR </b> </b> \...,Please refer to the Job Description to view th...,Winter Park,FL,US,32789.0,2012-03-07 10:21:37.467,2012-04-06 23:59:59
