In [None]:
import numpy as np
import pandas as pd
import subprocess
import argparse
import matplotlib.pyplot as plt
import matplotlib.ticker as tck
from matplotlib import rc
# Render every figure with a serif face, preferring Computer Modern.
rc('font', family='serif', serif=['Computer Modern'])
# Hand all text to LaTeX for typesetting; this needs a working LaTeX installation.
rc('text', usetex=True)

def bins_labels(bins, startValue=0, **kwargs):
    """Centre the x-tick labels under the bars of an equal-width histogram.

    A tick is placed at the midpoint of every bin and labelled with that
    bin's left edge, then the x-axis is clipped to the requested range.
    Operates on the current pyplot axes.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges. At least two edges are required (a single edge leads to a
        division by zero). The tick positions are derived from min/max and the
        number of edges, so this assumes sorted, equally spaced edges.
    startValue : int, optional
        Index into ``bins`` of the edge used as the left x-limit.
    **kwargs
        Forwarded unchanged to ``plt.xticks`` (e.g. ``rotation``, ``fontsize``).
    """
    bin_w = (max(bins) - min(bins)) / (len(bins) - 1)
    centres = np.arange(min(bins) + bin_w / 2, max(bins), bin_w)
    # N edges delimit N-1 bins, so there is one tick fewer than there are edges.
    # Pass exactly one label per tick: recent Matplotlib raises when the number
    # of labels differs from the number of tick locations.
    plt.xticks(centres, bins[:len(centres)], **kwargs)
    plt.xlim(bins[startValue], bins[-1])

In [None]:
# Read the raw, tab-separated item table; the first line holds the column names.
jobs = pd.read_csv(
    "../../data/recsys17/raw/items.csv",
    sep='\t',
    header=0,
)
jobs.head()

In [None]:
# Read the tab-separated interaction log and keep only rows whose
# interaction_type lies in [1, 4):
#   * values below 1 are impressions and are dropped,
#   * values of 4 and above are delete and headhunter events and are dropped.
interactions = (
    pd.read_csv(
        "../../data/recsys17/interim/interactions.csv",
        sep='\t',
        header=0,
    )
    .loc[lambda log: (log.interaction_type >= 1) & (log.interaction_type < 4)]
    .copy()
)
interactions.head(3)

# Keep only jobs which were interacted with

In [None]:
# Job attributes whose value distributions are examined in the cells below.
content_fields = [
    "career_level",
    "discipline_id",
    "industry_id",
    "country",
    "is_payed",
    "region",
    "employment",
    "tags",
]
# Join jobs to the filtered interactions on item_id (pandas merges are inner
# joins by default, so jobs without any interaction drop out), keep only the
# content attributes, then collapse rows that are identical on those attributes.
common = jobs.merge(interactions, on=['item_id'])[content_fields]
common = common.drop_duplicates()
# Row counts, one per line: de-duplicated content rows, interactions, jobs.
print(len(common), len(interactions), len(jobs), sep='\n')
common.head()

In [None]:
# Cast these columns to strings in one go; `plot_bar` below applies `.str`
# methods to the column it plots, which requires string values.
# `tags` is deliberately left as loaded, matching the original cell.
string_fields = [
    "career_level",
    "discipline_id",
    "industry_id",
    "country",
    "is_payed",
    "region",
    "employment",
]
common = common.astype({field: str for field in string_fields})

common.describe(include='all')

In [None]:
%matplotlib inline
def plot_bar(df, column):
    """Draw the value counts of ``df[column]`` as a bar chart and save it as a PDF.

    Underscores in the column's values are shown as spaces on the x-axis. The
    figure is written to ``../../plots/recsys17_dist_<column>.pdf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified by this function.
    column : str
        Name of the column to summarise. Its values must be strings, because
        the ``.str`` accessor is used on it.
    """
    f, ax = plt.subplots(1, 1, figsize=(15, 8))

    # Clean the labels on a local Series so the caller's frame keeps its
    # original values; '_' is matched literally, not as a regular expression.
    values = df[column].str.replace('_', ' ', regex=False)

    values.value_counts().plot(kind="bar", ax=ax, color="skyblue", ec="black")
    ax.tick_params(axis='x', rotation=0)

    font = 50
    font_tick = 30

    # Size the tick labels on both axes via tick_params. The per-tick
    # `tick.label` attribute used previously was removed in Matplotlib 3.8.
    ax.tick_params(axis='both', labelsize=font_tick)

    # Show y-axis counts with thousands separators, e.g. 12,345.
    ax.get_yaxis().set_major_formatter(
        tck.FuncFormatter(lambda x, p: format(int(x), ',')))

    col_str = column.replace("_", " ").title()
    ax.set_xlabel(col_str, fontsize=font)
    # Raw string: the backslash escapes '#' for LaTeX and is not a Python escape.
    ax.set_ylabel(r'\# Count', fontsize=font)
    f.tight_layout()
    f.savefig("../../plots/recsys17_dist_" + column + ".pdf", dpi=300, bbox_inches='tight')

In [None]:
# Value counts of `discipline_id` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "discipline_id")

In [None]:
# Value counts of `career_level` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "career_level")

In [None]:
# Value counts of `industry_id` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "industry_id")

In [None]:
# Value counts of `country` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "country")

In [None]:
# Value counts of `is_payed` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "is_payed")


In [None]:
# Value counts of `region` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "region")

In [None]:
# Value counts of `employment` over the rows of `common`; also saved as a PDF by plot_bar.
plot_bar(common, "employment")


In [None]:
# Value counts of `tags` over the rows of `common`; also saved as a PDF by plot_bar.
# NOTE(review): unlike the other fields, `tags` is not cast to str in the earlier
# cell, and plot_bar draws one bar per distinct value. If `tags` holds many
# distinct values (or non-string entries) this plot may be unreadable or fail.
# TODO confirm this is intended.
plot_bar(common, "tags")