In [None]:
import numpy as np
import pandas as pd
import subprocess
import argparse
import matplotlib.pyplot as plt
import matplotlib.ticker as tck
import matplotlib.font_manager
from matplotlib import rc
#rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
#rc('text', usetex=True)

def bins_labels(bins, startValue=0, **kwargs):
    """Label every histogram bin at its centre with the bin's left edge.

    Works on the current pyplot axes: one x tick is placed in the middle of
    each bin, then the x range is limited to ``[bins[startValue], bins[-1]]``.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges. The bin width is derived from the smallest and largest
        edge and the number of edges, i.e. the edges are treated as evenly
        spaced. At least two edges are required.
    startValue : int, optional
        Index of the bin edge used as the left x limit.
    **kwargs
        Forwarded unchanged to ``plt.xticks`` (for example ``fontsize``).

    Raises
    ------
    ValueError
        If fewer than two bin edges are given.
    """
    edges = list(bins)
    if len(edges) < 2:
        raise ValueError("bins_labels needs at least two bin edges")

    n_bins = len(edges) - 1
    lowest = min(edges)
    bin_w = (max(edges) - lowest) / n_bins

    # Build the centres by index so float rounding can never add or drop a
    # tick, which an open-ended np.arange with a float step can do.
    centers = [lowest + bin_w * (i + 0.5) for i in range(n_bins)]

    # n edges describe n - 1 bins, so the last edge gets no label. Handing
    # all n labels to n - 1 ticks raises ValueError in current Matplotlib.
    plt.xticks(centers, edges[:-1], **kwargs)
    plt.xlim(edges[startValue], edges[-1])

In [None]:
# Load the tab-separated interaction log; the first row holds the column names.
interactions = pd.read_csv(
    "../../data/recsys17/interim/interactions.csv",
    sep='\t',
    header=0,
)
# Keep only rows with 1 <= interaction_type < 4. Per the original notes this
# drops impressions at the low end and delete / headhunter events at the high
# end -- TODO confirm the type codes against the dataset description.
interactions = interactions.loc[
    (interactions.interaction_type >= 1) & (interactions.interaction_type < 4)
].copy()

In [None]:
# Number of interactions per interaction type (one row per type).
interaction_sizes = interactions.groupby("interaction_type").size()
# Last expression of the cell, so it is rendered as the cell output.
interaction_sizes

In [None]:
%matplotlib inline
# Bar chart of the number of interactions per interaction type, saved as PDF.
font = 10

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
interaction_sizes.plot.bar(ax=ax, logy=False, color="skyblue", ec="black")

# Render y tick values as integers with thousands separators.
ax.yaxis.set_major_formatter(
    tck.FuncFormatter(lambda value, _pos: '{:,}'.format(int(value))))

ax.set_title('RecSys 2017', fontsize=font)
ax.set_xlabel('Interaction Type', fontsize=font)
ax.set_ylabel('Num Interactions', fontsize=font)

f.tight_layout()
f.savefig("../../plots/recsys17_interactions.pdf", dpi=300, bbox_inches='tight')

In [None]:
%matplotlib inline
# Histogram (log-scaled counts) of the number of interactions per session,
# saved as PDF.
f, ax = plt.subplots(1, 1, figsize=(15, 8))

session_sizes = interactions.groupby("session_id").size()
print(session_sizes.max())
session_sizes.plot(kind="hist", logy=True, ax=ax, bins=25, color="skyblue", ec="black")

font = 35

# Tick.label is not available in current Matplotlib. tick_params sets the
# label size on both axes and also applies to ticks created later on.
ax.tick_params(axis='both', which='major', labelsize=25)

ax.set_title('RecSys 2017', fontsize=font)
ax.set_xlabel('Interactions in session', fontsize=font)
# Raw string: the text is unchanged, but '\#' is not a valid escape sequence
# in a normal string literal. The backslash is only meaningful when LaTeX
# text rendering (usetex) is switched on.
ax.set_ylabel(r'\# Sessions', fontsize=font)
ax.xaxis.grid(True, which='major', linestyle='-', linewidth=0.35)
ax.yaxis.grid(True, which='major', linestyle='-', linewidth=0.35)
# Bound to this axes explicitly instead of going through pyplot's current axes.
ax.minorticks_on()
f.tight_layout()
f.savefig("../../plots/recsys17_session_sizes.pdf", dpi=300, bbox_inches='tight')

In [None]:
%matplotlib inline
# Histogram (log-scaled counts) of how many type-3 interactions each session
# contains; the notebook treats type 3 as job-application ("apply") events.
f, ax = plt.subplots(1, 1, figsize=(10, 5))

sessions_with_applies = interactions[interactions.interaction_type == 3]

session_apply_sizes = sessions_with_applies.groupby("session_id").size()

# Unit-width bins with edges 0..10.
bins = range(11)
session_apply_sizes.plot(kind="hist", logy=True, ax=ax, bins=bins, color="skyblue", ec="black")
# Centre one x tick per bin and start the x axis at the second bin edge.
bins_labels(bins, 1, fontsize=14)

font = 20

# Tick.label is not available in current Matplotlib; tick_params sets the
# y tick label size directly (x tick labels are sized by bins_labels above).
ax.tick_params(axis='y', which='major', labelsize=14)

ax.set_title('RecSys 2017: Job application events', fontsize=font)
ax.set_xlabel('Number of apply events within a session', fontsize=font)
# Raw string: the text is unchanged, but '\#' is not a valid escape sequence
# in a normal string literal. The backslash is only meaningful when LaTeX
# text rendering (usetex) is switched on.
ax.set_ylabel(r'\# Sessions', fontsize=font)
ax.xaxis.grid(True, which='major', linestyle='-', linewidth=0.35)
ax.yaxis.grid(True, which='major', linestyle='-', linewidth=0.35)
f.tight_layout()
f.savefig("../../plots/recsys17_session_apply_events.pdf", dpi=300)

In [None]:
%matplotlib inline
# For every training session, count how many distinct items occur more than
# once in it. Sessions without any repeated item are left out.
train = pd.read_csv("../../data/recsys17/processed/train_14d.csv", sep='\t')
train_sessions = train.groupby("session_id")

# Occurrences of each (session, item) pair in one vectorised pass, instead of
# running value_counts() in a Python loop over every session.
pair_counts = train.groupby(["session_id", "item_id"]).size()
repeated_pairs = pair_counts[pair_counts > 1]

# One entry per session that has at least one repeated item, ordered by
# session_id exactly as the grouped loop produced them.
remind_counts = repeated_pairs.groupby(level="session_id").size().tolist()

df = pd.DataFrame(remind_counts, columns=["remind_counts"])

In [None]:
# Histogram of the per-session repeat counts, then the number of training
# sessions, the number of sessions with repeats, and the median repeat count.
df.plot.hist()
print(len(train_sessions))
print(df.shape[0])
df.median()

In [None]:
# Group the interactions by session and split them by session length.
sessions = interactions.groupby("session_id")

# NOTE(review): "big" uses >= 3, so every "medium" session (exactly 3
# interactions) is also contained in big_sessions -- confirm whether > 3
# was intended.
small_sessions = sessions.filter(lambda grp: grp.shape[0] < 3)
medium_sessions = sessions.filter(lambda grp: grp.shape[0] == 3)
big_sessions = sessions.filter(lambda grp: grp.shape[0] >= 3)

# Row counts of each subset (interactions, not sessions).
print(small_sessions.shape[0])
print(medium_sessions.shape[0])
print(big_sessions.shape[0])

%matplotlib inline
# Histogram of session sizes with unit-width bins (edges 0..40) and
# log-scaled counts.
bins = range(41)
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))

session_sizes = sessions.size()
print(session_sizes.max())
session_sizes.plot.hist(ax=ax, bins=bins, logy=True, color="skyblue", ec="black")
# Centre one x tick per bin and start the x axis at the second bin edge.
bins_labels(bins, startValue=1, fontsize=14)

In [None]:
# Interaction counts per interaction type for each session-length subset.
small_interaction_sizes, medium_interaction_sizes, big_interaction_sizes = (
    subset.groupby("interaction_type").size()
    for subset in (small_sessions, medium_sessions, big_sessions)
)

In [None]:
def plot_sizes(df):
    """Draw a bar chart of interaction counts per interaction type.

    Parameters
    ----------
    df : object with a pandas-style ``.plot`` method
        The counts to draw; the notebook passes the result of
        ``groupby("interaction_type").size()``.

    A new 15x8 figure is created on every call and nothing is returned.
    """
    label_size = 10
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))

    df.plot(ax=axes, kind="bar", logy=False, color="skyblue", ec="black")

    def _with_commas(value, _pos):
        # Integer tick text with thousands separators.
        return '{:,}'.format(int(value))

    axes.get_yaxis().set_major_formatter(tck.FuncFormatter(_with_commas))

    axes.set_title('RecSys 2017', fontsize=label_size)
    axes.set_xlabel('Interaction Type', fontsize=label_size)
    axes.set_ylabel('Num Interactions', fontsize=label_size)
    fig.tight_layout()

In [None]:
# Interaction-type counts for sessions with fewer than 3 interactions.
plot_sizes(small_interaction_sizes)

In [None]:
# Interaction-type counts for sessions with exactly 3 interactions.
plot_sizes(medium_interaction_sizes)

In [None]:
# Interaction-type counts for sessions with at least 3 interactions.
plot_sizes(big_interaction_sizes)

In [None]:
# Item overlap between the small sessions and the sessions with >= 3
# interactions.
# NOTE(review): medium_items is built from big_sessions, not medium_sessions
# -- confirm which subset was intended.
small_items = small_sessions.item_id.unique()
medium_items = big_sessions.item_id.unique()

setA, setB = set(small_items), set(medium_items)

overlap = setA.intersection(setB)
universe = setA.union(setB)

# Shared items as a percentage of, in turn: the small-session items, the
# other subset's items, and the union of both.
result1, result2, result3 = (
    float(len(overlap)) / len(reference) * 100
    for reference in (setA, setB, universe)
)

print(result1)
print(result2)
print(result3)

In [None]:
# Print the number of distinct items in every small session.
# small_items is a NumPy array of item ids (it has no groupby) and Python
# sets have no .size(), so iterate over the session groups of small_sessions
# and take the length of the set instead.
for key, value in small_sessions.groupby("session_id"):
    print(len(set(value.item_id.unique())))

In [None]:
# Per-session spread (max - min, via np.ptp) of created_at for each subset.
small_durations, medium_durations, big_durations = (
    subset.groupby('session_id')['created_at'].agg(np.ptp)
    for subset in (small_sessions, medium_sessions, big_sessions)
)

In [None]:
# seaborn is imported in this cell rather than in the notebook's import cell.
import seaborn as sns
sns.set(style="whitegrid")

# Box plot of the small-session durations with outlier points hidden.
sns.boxplot(x=small_durations, showfliers=False)
# Preview of the plotted series, rendered as the cell output.
small_durations.head()

In [None]:
# seaborn is imported in this cell rather than in the notebook's import cell.
import seaborn as sns
sns.set(style="whitegrid")

# Box plot of the medium-session durations with outlier points hidden.
sns.boxplot(x=medium_durations, showfliers=False)
# Preview the series that was just plotted. The cell used to show
# small_durations here, which did not match the plot above it.
medium_durations.head()

In [None]:
# seaborn is imported in this cell rather than in the notebook's import cell.
import seaborn as sns
sns.set(style="whitegrid")

# Box plot of the big-session durations with outlier points hidden.
sns.boxplot(x=big_durations, showfliers=False)
# Preview of the plotted series, rendered as the cell output.
big_durations.head()

In [None]:
# Share and number of big sessions whose created_at spread is below 100.
tiny_durations = big_durations[big_durations < 100].tolist()

print(len(tiny_durations) / len(big_durations))
print(len(tiny_durations))


In [None]:
# Share and number of medium sessions whose created_at spread is below 100.
tiny_durations = medium_durations[medium_durations < 100].tolist()

print(len(tiny_durations) / len(medium_durations))
print(len(tiny_durations))


In [None]:
# Share and number of small sessions whose created_at spread is below 100.
tiny_durations = small_durations[small_durations < 100].tolist()

print(len(tiny_durations) / len(small_durations))
print(len(tiny_durations))


In [None]:
def plot_hist(df, bins=30, xlim=(-100, 7500), ylim=(1, 60000)):
    """Draw a histogram of session durations with log-scaled counts.

    Parameters
    ----------
    df : object with a pandas-style ``.plot`` method
        The durations to draw; the notebook passes the per-session
        ``created_at`` spreads.
    bins : int or sequence, optional
        Passed to the histogram as ``bins``. Defaults to 30.
    xlim : tuple (low, high), optional
        x-axis limits. Defaults to ``(-100, 7500)``.
    ylim : tuple (low, high), optional
        y-axis limits. Defaults to ``(1, 60000)``.

    A new 15x8 figure is created on every call and nothing is returned.
    With the default arguments the figure is the same as before these
    parameters existed.
    """
    f, ax = plt.subplots(1, 1, figsize=(15, 8))

    df.plot(kind="hist", logy=True, logx=False, ax=ax, bins=bins,
            color="skyblue", ec="black")

    font = 10

    # Integer y tick labels with thousands separators.
    ax.get_yaxis().set_major_formatter(
        tck.FuncFormatter(lambda x, p: format(int(x), ',')))

    ax.set_title('RecSys 2017', fontsize=font)
    # NOTE(review): the "(ms)" unit is kept from the original label -- confirm
    # that created_at really is measured in milliseconds.
    ax.set_xlabel('Session Duration (ms)', fontsize=font)
    ax.set_ylabel('# Sessions', fontsize=font)
    ax.set_ylim(*ylim)
    ax.set_xlim(*xlim)

    f.tight_layout()

In [None]:
# Duration histogram for sessions with fewer than 3 interactions.
plot_hist(small_durations)

In [None]:
# Duration histogram for sessions with exactly 3 interactions.
plot_hist(medium_durations)

In [None]:
# Duration histogram for sessions with at least 3 interactions.
plot_hist(big_durations)