In [None]:
import pandas as pd
from datetime import datetime, timedelta
import os
import numpy as np
from natsort import index_natsorted, order_by_index
import math

### 1. Import data

In [None]:
# Read the two CSV inputs (paths are relative to the notebook's working directory):
# the jam listing first, then the jam description table.
df_jams, df_jam_desc = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/jams.csv', '../dataset/jam_desc.csv')
)

### 2. Clean data and preprocess features

In [None]:
# Keep only jams lasting more than 1 hour, then drop the longest 1% of what remains.
date_pattern = "%Y-%m-%d %H:%M:%S"  # e.g. 2016-11-14 12:00:00
jam_end = pd.to_datetime(df_jams['jam_end_date'], format=date_pattern)
jam_start = pd.to_datetime(df_jams['jam_start_date'], format=date_pattern)

# Dividing the timedelta Series by a one-hour timedelta expresses the duration in hours.
df_jams["jam_duration"] = (jam_end - jam_start) / timedelta(hours=1)

# Filter to jams longer than 1 hour and order them shortest-first.
df_jams = df_jams[df_jams['jam_duration'] > 1].sort_values(by=['jam_duration'], ascending=True)

# With the rows sorted ascending, keeping the first 99% removes the top 1% by duration.
df_jams = df_jams.head(int(len(df_jams) * (99 / 100)))

In [None]:
# Combine the jam rows with their descriptions, matching on jam_url.
# The inner join keeps only jam_url values present in both frames.
df_jam_all = df_jams.merge(df_jam_desc, how='inner', on='jam_url')

In [None]:
# Number of hosts per jam: split jam_host on the "||" delimiter and count the pieces.
# NOTE: .split is called on every entry, so a non-string value (e.g. a missing host) raises here.
host_lists = df_jam_all["jam_host"].map(lambda hosts: hosts.split("||"))
df_jam_all["num_hosts"] = host_lists.map(len)

In [None]:
# Split the jams in two: a jam counts as competitive when its jam_criteria value is present,
# and as non-competitive when that value is missing.
has_criteria = df_jam_all['jam_criteria'].notnull()
competitive_jams = df_jam_all[has_criteria]
non_competitive_jams = df_jam_all[~has_criteria]

In [None]:
# Calculate number of criteria for competitive jam: split jam_criteria on "||" and count the pieces.
# competitive_jams is a boolean-filtered selection of df_jam_all, so writing a column into it
# with `frame[col] = ...` can trigger pandas' SettingWithCopyWarning. `.assign` instead returns
# a new frame carrying the extra column, and re-running the cell just recomputes that column.
competitive_jams = competitive_jams.assign(
    num_criteria=competitive_jams["jam_criteria"].map(
        lambda criteria: len(criteria.split("||"))
    )
)