In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from tqdm import tqdm

from docplex.mp.model import Model
from docplex.mp.environment import Environment

In [2]:
def cost_function(prediction):
    """Score a schedule given as one assigned day per family.

    ``prediction`` is iterated in order; element ``f`` is the day assigned to
    the family with row label ``f`` in the global ``family`` frame. The
    function also reads the globals ``N_DAYS``, ``MAX_OCCUPANCY`` and
    ``MIN_OCCUPANCY``.

    Returns a 4-tuple ``(total, occupancy, accounting, per_family)``:
    the overall score (preference + bound-violation + accounting cost), the
    dict mapping each day to its head count, the accounting cost alone, and
    the list of per-family preference penalties.
    """
    # Days in countdown order: N_DAYS, N_DAYS - 1, ..., 1
    countdown = list(range(N_DAYS, 0, -1))
    occupancy = dict.fromkeys(countdown, 0)

    choice_columns = ['choice_%d' % rank for rank in range(10)]

    # Penalty as a function of family size, indexed by the rank of the granted choice
    ranked_penalty = (
        lambda size: 0,
        lambda size: 50,
        lambda size: 50 + 9 * size,
        lambda size: 100 + 9 * size,
        lambda size: 200 + 9 * size,
        lambda size: 200 + 18 * size,
        lambda size: 300 + 18 * size,
        lambda size: 300 + 36 * size,
        lambda size: 400 + 36 * size,
        lambda size: 500 + 36 * size + 199 * size,
    )

    per_family = []
    for f, d in enumerate(prediction):
        size = family.n_people[f]
        wishes = [family[column][f] for column in choice_columns]

        # Count this family towards the head count of its assigned day
        occupancy[d] += size

        # The first matching choice decides the penalty; no match is the worst case
        for rank, wished_day in enumerate(wishes):
            if d == wished_day:
                fee = ranked_penalty[rank](size)
                break
        else:
            fee = 500 + 36 * size + 398 * size
        per_family.append(fee)

    total = sum(per_family)

    # Soft constraint: a fixed large penalty for every day outside the allowed range
    violations = sum(
        1 for head_count in occupancy.values()
        if not (MIN_OCCUPANCY <= head_count <= MAX_OCCUPANCY)
    )
    total += 100000000 * violations

    # The first day of the countdown has no predecessor, so its exponent is just 0.5.
    # Clamped at 0 because the soft constraints allow head counts below 125.
    first_count = occupancy[countdown[0]]
    accounting = max(0, (first_count - 125.0) / 400.0 * first_count**(0.5))

    # Every other day is compared with the day processed just before it
    for previous_day, current_day in zip(countdown, countdown[1:]):
        current_count = occupancy[current_day]
        gap = abs(current_count - occupancy[previous_day])
        accounting += max(0, (current_count - 125.0) / 400.0 * current_count**(0.5 + gap / 50.0))

    total += accounting

    return total, occupancy, accounting, per_family

In [3]:
# Load the per-family data: one row per family with its ten ranked day
# choices (choice_0 .. choice_9) and its head count (n_people).
family = pd.read_csv('../input/family_data.csv')
family.head()

Unnamed: 0,family_id,choice_0,choice_1,choice_2,choice_3,choice_4,choice_5,choice_6,choice_7,choice_8,choice_9,n_people
0,0,52,38,12,82,33,75,64,76,10,28,4
1,1,26,4,82,5,11,47,38,6,66,61,4
2,2,100,54,25,12,27,82,10,89,80,33,3
3,3,2,95,1,96,32,6,40,31,9,59,2
4,4,53,1,47,93,26,3,46,16,42,39,4


In [4]:
# Sample submission file. NOTE(review): `sample` is not referenced by any other cell visible in this notebook.
sample = pd.read_csv('../input/sample_submission.csv')

In [5]:
# Problem dimensions. N_DAYS is the single source of truth for the horizon so
# that the model's day range and cost_function() cannot drift apart.
N_DAYS = 100
num_days = N_DAYS
num_family = family.shape[0]
families = range(num_family)
days = range(1, N_DAYS + 1)

# Allowed daily attendance (inclusive bounds) and every value in between.
MIN_OCCUPANCY = 125
MAX_OCCUPANCY = 300
attendances = range(MIN_OCCUPANCY, MAX_OCCUPANCY + 1)

# Upper bound on the accounting cost of a single day: attendance transitions
# whose cost exceeds it are forbidden when the model is built.
MAX_DAY_COST = 68888

# Pruning knobs that are currently disabled: MAX_ATTENDANCE_DELTA = 29, MAX_CHOICE = 10

In [6]:
# accounting_cost[i, j]: accounting cost of a day with attendance i when the
# day it is paired with in the model (day d + 1) has attendance j.
# Sized from MAX_OCCUPANCY so the table follows the attendance bounds;
# entries outside `attendances` stay 0 and are never read by the model.
accounting_cost = np.zeros((MAX_OCCUPANCY + 1, MAX_OCCUPANCY + 1))
for i in attendances:
    for j in attendances:
        accounting_cost[i, j] = ((i - 125) / 400) * np.power(i, 0.5 + np.abs(i - j) / 50)

In [7]:
# last_day_cost[i]: accounting cost of the final day (no following day to
# compare with, so the exponent is just 0.5) when its attendance is i.
# Sized from MAX_OCCUPANCY so the table follows the attendance bounds.
last_day_cost = np.zeros((MAX_OCCUPANCY + 1,))
for i in attendances:
    last_day_cost[i] = ((i - 125) / 400) * np.power(i, 0.5)

In [8]:
# Print the docplex / CPLEX runtime environment so the run can be reproduced.
env = Environment()
env.print_information()

* system is: Linux 64bit
* Python version 3.7.3, located at: /home/jfpuget/anaconda3/bin/python
* docplex is present, version is (2, 11, 176)
* CPLEX library is present, version is 12.10.0.0, located at: /home/jfpuget/anaconda3/lib/python3.7/site-packages
* pandas is present, version is 0.24.2


In [9]:
# The docplex model that every following cell adds variables and constraints to.
mdl = Model("santa")

In [10]:
# One binary variable per (family, day) pair, named '<family>_to_<day>';
# a value of 1 is read below as "family f is scheduled on day d".
assign_family_to_day_vars = mdl.binary_var_matrix(families, days, lambda c: '%d_to_%d' % c)

In [11]:
# Upper occupancy bound: the people assigned to each day may not exceed MAX_OCCUPANCY.
mdl.add_constraints(
    mdl.sum(assign_family_to_day_vars[f, d] * family.n_people[f] for f in families) <= MAX_OCCUPANCY
    for d in days)
mdl.print_information()

Model: santa
 - number of variables: 500000
   - binary=500000, integer=0, continuous=0
 - number of constraints: 100
   - linear=100
 - parameters: defaults
 - problem type is: MILP


In [12]:
# Lower occupancy bound: the people assigned to each day must reach MIN_OCCUPANCY.
mdl.add_constraints(
    mdl.sum(assign_family_to_day_vars[f, d] * family.n_people[f] for f in families) >= MIN_OCCUPANCY
    for d in days)
mdl.print_information()

Model: santa
 - number of variables: 500000
   - binary=500000, integer=0, continuous=0
 - number of constraints: 200
   - linear=200
 - parameters: defaults
 - problem type is: MILP


In [13]:
# Every family is scheduled on exactly one day.
mdl.add_constraints(
    mdl.sum(assign_family_to_day_vars[f, d] for d in days) == 1
    for f in families)
mdl.print_information()

Model: santa
 - number of variables: 500000
   - binary=500000, integer=0, continuous=0
 - number of constraints: 5200
   - linear=5200
 - parameters: defaults
 - problem type is: MILP


In [14]:
# Precomputed preference-cost table, indexed below as preference_cost[f, d] with d in 1..100.
# NOTE(review): presumably shaped (num_family, 101) with column 0 unused — confirm against the code that wrote the file.
preference_cost = np.load('../input/preference_cost.npy')

In [15]:
# day_attendance_bit[d, a]: binary indicator for "day d has attendance a".
# day_attendance_prev[d, a_d, a_d1]: binary indicator over a pair of attendance values for
# day d and day d + 1; it exists for every day except the last one (days[:-1]).
day_attendance_bit = mdl.binary_var_matrix(days, attendances, lambda c: 'day_%d_had_%d' % c)
day_attendance_prev = mdl.binary_var_cube(days[:-1], attendances, attendances,
                                              lambda c: 'day_%d_had_%d_prev_%d' % c)

In [16]:
# Exactly one attendance value is selected for each day.
mdl.add_constraints(
    mdl.sum(day_attendance_bit[d, a] for a in attendances) == 1
    for d in days)
mdl.print_information()

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 5300
   - linear=5300
 - parameters: defaults
 - problem type is: MILP


In [17]:
# Link the two views of a day: the head count implied by the family assignments
# must equal the attendance value selected by day_attendance_bit.
mdl.add_constraints(
    mdl.sum(assign_family_to_day_vars[f, d] * family.n_people[f] for f in families) == 
    mdl.sum(a * day_attendance_bit[d, a] for a in attendances)
    for d in days)
mdl.print_information()

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 5400
   - linear=5400
 - parameters: defaults
 - problem type is: MILP


In [18]:
# Tie the pair indicator to day d: summed over all day-(d + 1) values, the pair
# variables with first value a_d must equal day_attendance_bit[d, a_d].
mdl.add_constraints(
    day_attendance_bit[d, a_d] == 
    mdl.sum(day_attendance_prev[d, a_d, a_d1] for a_d1 in attendances)
    for a_d in attendances 
    for d in days[:-1])
mdl.print_information()

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 22824
   - linear=22824
 - parameters: defaults
 - problem type is: MILP


In [19]:
# Tie the pair indicator to day d + 1: summed over all day-d values, the pair
# variables with second value a_d1 must equal day_attendance_bit[d + 1, a_d1].
mdl.add_constraints(
    day_attendance_bit[d+1, a_d1] == 
    mdl.sum(day_attendance_prev[d, a_d, a_d1] for a_d in attendances)
    for a_d1 in attendances 
    for d in days[:-1])
mdl.print_information()

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 40248
   - linear=40248
 - parameters: defaults
 - problem type is: MILP


In [20]:
# Forbid every (day d, day d + 1) attendance pair whose accounting cost alone
# exceeds MAX_DAY_COST by fixing its pair variable to 0. The objective built
# later only sums the pairs that pass the complementary test (<= MAX_DAY_COST).
accounting_max = mdl.add_constraints(day_attendance_prev[d, a_d, a_d1] == 0 
                             for d in tqdm(days[:-1])
                             for a_d in attendances
                             for a_d1 in attendances
                             if accounting_cost[a_d, a_d1] > MAX_DAY_COST
                    )                    

mdl.print_information()

100%|██████████| 99/99 [00:07<00:00, 12.68it/s]


Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 643554
   - linear=643554
 - parameters: defaults
 - problem type is: MILP


In [21]:
# Forbid attendance values whose last-day cost exceeds MAX_DAY_COST.
# NOTE(review): despite the name, the loop covers every day in `days`, not only the last one.
# With the constants used here no value qualifies: the printed constraint count is unchanged.
last_day_max = mdl.add_constraints(day_attendance_bit[d, a_d] == 0 
                                   for a_d in attendances
                                   for d in days
                                   if last_day_cost[a_d] > MAX_DAY_COST
                    )                    

mdl.print_information()

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 643554
   - linear=643554
 - parameters: defaults
 - problem type is: MILP


In [22]:
# Build `accounting_penalty`, the accounting-cost part of the objective.
# Two formulations are kept; the flag selects between them:
#   False -> sum the cost of the selected attendance pairs directly (the one in use)
#   True  -> one continuous cost variable per day, bounded below by that day's cost
USE_DAY_COST_VARS = False

# The last day has no following day and is priced with last_day_cost instead.
last_day = days[-1]

if not USE_DAY_COST_VARS:
    accounting_penalty = mdl.sum(accounting_cost[a_d, a_d1] * day_attendance_prev[d, a_d, a_d1]
                                 for d in tqdm(days[:-1])
                                 for a_d in attendances
                                 for a_d1 in attendances
                                 if accounting_cost[a_d, a_d1] <= MAX_DAY_COST
                                 )

    last_day_penalty = mdl.sum(last_day_cost[a_d] * day_attendance_bit[last_day, a_d]
                               for a_d in attendances
                               if last_day_cost[a_d] <= MAX_DAY_COST
                               )
    accounting_penalty = accounting_penalty + last_day_penalty

else:
    day_cost = mdl.continuous_var_dict(days, name='day_cost', lb=0)

    mdl.add_constraints(day_cost[d] >= mdl.sum(accounting_cost[a_d, a_d1] * day_attendance_prev[d, a_d, a_d1]
                                               for a_d in attendances
                                               for a_d1 in attendances
                                               if accounting_cost[a_d, a_d1] <= MAX_DAY_COST
                                               )
                        for d in tqdm(days[:-1])
                        )

    mdl.add(day_cost[last_day] >= mdl.sum(last_day_cost[a_d] * day_attendance_bit[last_day, a_d]
                                          for a_d in attendances
                                          if last_day_cost[a_d] <= MAX_DAY_COST)
            )

    accounting_penalty = mdl.sum(day_cost[d] for d in days)

100%|██████████| 99/99 [00:10<00:00,  9.86it/s]


In [23]:
mdl.parameters.timelimit = 1000  # initial CPLEX time limit; reassigned before each solve further down


In [24]:
# A previously found schedule; its family_id and assigned_day columns are used below.
# NOTE(review): the number in the file name is presumably that schedule's score — confirm.
sub = pd.read_csv('../submissions/submission_68888.04343194816.csv')

In [25]:
# Check `sub` against the model: pin every family to the day it has in `sub`.
# The constraints are kept in sub_cst so they can be removed again afterwards.

sub_cst = mdl.add_constraints(assign_family_to_day_vars[f, a] == 1 for f,a in zip(sub.family_id, sub.assigned_day))

In [26]:
# Preference part of the objective: preference_cost[f, d] is paid for the (family, day) pair that is selected.
preference_penalty = mdl.sum(assign_family_to_day_vars[f, d] * preference_cost[f, d] 
                             for f in families for d in days)

In [27]:
# Objective: preference cost plus accounting cost.
mdl.minimize(preference_penalty + accounting_penalty)

mdl.print_information()

mdl.parameters.mip.tolerances.mipgap = 0.00
mdl.parameters.timelimit = 50000
mdl.parameters.threads = 20

pinned_result = mdl.solve(log_output=True)
if pinned_result is None:
    # solve() returns None when no solution was found; fail here with a clear message
    raise RuntimeError("model 'santa' has no solution; cannot extract a schedule")
mdl.report()

# Read the schedule back in family order (keys sort by family, then day).
# Solver values for binaries are floats that may sit slightly off 0/1 within the
# integrality tolerance, so threshold at 0.5 rather than testing `== 1`.
preds = [d for (f, d) in sorted(assign_family_to_day_vars)
         if assign_family_to_day_vars[(f, d)].solution_value > 0.5]
assert len(preds) == len(families), "expected exactly one assigned day per family"

# Re-score with the reference cost function as an independent check of the model objective.
cost, _, _, _ = cost_function(preds)
cost

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 648554
   - linear=648554
 - parameters:
     parameters.timelimit = 1000.00000000000000
 - problem type is: MILP
Version identifier: 12.10.0.0 | 2019-11-26 | 843d4de
CPXPARAM_Read_DataCheck                          1
CPXPARAM_Threads                                 20
CPXPARAM_RandomSeed                              201903125
CPXPARAM_TimeLimit                               50000
CPXPARAM_MIP_Tolerances_MIPGap                   0
Tried aggregator 1 time.
MIP Presolve eliminated 648554 rows and 3584224 columns.
MIP Presolve modified 1699 coefficients.
All rows and columns eliminated.
Presolve time = 4.59 sec. (3657.06 ticks)

Root node processing (before b&c):
  Real time             =    5.43 sec. (4108.19 ticks)
Parallel b&c, 20 threads:
  Real time             =    0.00 sec. (0.00 ticks)
  Sync time (average)   =    0.00 sec.
  Wait time (average)   =    0.00 sec.
    

68888.04343194816

In [28]:
# Drop the constraints that pinned the assignment to `sub`, so the solver is free to search again.
mdl.remove(sub_cst)

In [29]:
from docplex.mp.progress import SolutionRecorder

class MyProgressListener(SolutionRecorder):
    """Solution listener that scores and saves every improving intermediate solution.

    Each solution reported by the solver is appended to ``self.solutions``.
    When its objective is strictly better than the best seen so far, the
    schedule is re-scored with ``cost_function`` and written to
    ``../submissions/submission_<score>.csv``.

    Relies on the notebook globals ``assign_family_to_day_vars``, ``families``
    and ``cost_function``.
    """

    def __init__(self, model):
        """``model`` is accepted for call-site symmetry but is not used."""
        SolutionRecorder.__init__(self)
        self.solutions = []
        # Best objective seen so far; solutions at or above it are recorded but not written.
        self.current_objective = 999999

    def notify_solution(self, s):
        """Record solution ``s``; score and save it if it improves the best objective."""
        SolutionRecorder.notify_solution(self, s)
        self.solutions.append(s)
        objective = self.current_progress_data.current_objective
        if objective >= self.current_objective:
            return
        self.current_objective = objective
        print ('Intermediate Solution')

        # Read the schedule in family order (keys sort by family, then day).
        # Binary values come back as floats that may sit slightly off 0/1, so
        # threshold at 0.5 instead of testing `== 1`, which could drop a family.
        preds = [d for (f, d) in sorted(assign_family_to_day_vars)
                 if s.get_value(assign_family_to_day_vars[(f, d)]) > 0.5]
        solution = pd.DataFrame(data=[f for f in families], columns = ['family_id'])
        solution['assigned_day'] = preds

        score, _, _, _ = cost_function(preds)
        print('Score: ' + str(score))
        # NOTE(review): pandas >= 1.5 spells this keyword `lineterminator`; the
        # environment printed earlier in this notebook reports pandas 0.24.2.
        solution.to_csv('../submissions/submission_' + str(score) + '.csv', index=False, line_terminator='\n', encoding='utf-8')

    def get_solutions(self):
        """Return the list of every solution passed to ``notify_solution``."""
        return self.solutions

In [30]:
# Attach the listener so that solutions found during the next solve are scored and written to disk.
listener = MyProgressListener(mdl)
mdl.add_progress_listener(listener)

In [31]:
# Objective cutoff: only accept solutions at or below 68888.05, i.e. just above the
# 68888.0434 objective obtained from the pinned `sub` schedule.
mdl.add(preference_penalty + accounting_penalty <= 68888.05)
mdl.print_information()

Model: santa
 - number of variables: 3584224
   - binary=3584224, integer=0, continuous=0
 - number of constraints: 643555
   - linear=643555
 - parameters:
     parameters.threads = 20
     parameters.timelimit = 50000.00000000000000
     parameters.mip.tolerances.mipgap = 0.00000000000000
 - problem type is: MILP


In [32]:
# Full search: zero gap, long time limit, MIP emphasis 2 (emphasize proving optimality).
mdl.parameters.mip.tolerances.mipgap = 0
mdl.parameters.timelimit = 1000000
mdl.parameters.threads = 20
mdl.parameters.emphasis.mip = 2

search_result = mdl.solve(log_output=True)
if search_result is None:
    # solve() returns None when no solution was found; fail here with a clear message
    raise RuntimeError("model 'santa' has no solution; cannot extract a schedule")
mdl.report()

# Read the schedule back in family order (keys sort by family, then day).
# Solver values for binaries are floats that may sit slightly off 0/1 within the
# integrality tolerance, so threshold at 0.5 rather than testing `== 1`.
preds = [d for (f, d) in sorted(assign_family_to_day_vars)
         if assign_family_to_day_vars[(f, d)].solution_value > 0.5]
assert len(preds) == len(families), "expected exactly one assigned day per family"

# Re-score with the reference cost function as an independent check of the model objective.
cost, _, _, _ = cost_function(preds)
cost

Version identifier: 12.10.0.0 | 2019-11-26 | 843d4de
CPXPARAM_Read_DataCheck                          1
CPXPARAM_Threads                                 20
CPXPARAM_RandomSeed                              201903125
CPXPARAM_Emphasis_MIP                            2
CPXPARAM_TimeLimit                               1000000
CPXPARAM_MIP_Tolerances_MIPGap                   0
1 of 1 MIP starts provided solutions.
MIP start 'm1' defined initial solution with objective 68888.0434.
Tried aggregator 1 time.
Presolve has eliminated 603306 rows and 603306 columns...
MIP Presolve eliminated 603306 rows and 603306 columns.
Reduced MIP has 40249 rows, 2980918 columns, and 9937753 nonzeros.
Reduced MIP has 2980918 binaries, 0 generals, 0 SOSs, and 0 indicators.
Presolve time = 17.32 sec. (12067.95 ticks)
Tried aggregator 1 time.
Presolve has eliminated 0 rows and 0 columns...
Detecting symmetries...
Reduced MIP has 40249 rows, 2980918 columns, and 9937753 nonzeros.
Reduced MIP has 2980918 binaries, 0

Elapsed time = 2258.61 sec. (1488977.13 ticks, tree = 0.46 MB, solutions = 1)
     42    28    68766.9484  1470    68888.0434    68753.2350   137864    0.20%
     43    18    68765.4849  1408    68888.0434    68753.2350   108518    0.20%
     44    27    68769.8384  1406    68888.0434    68753.2350   135546    0.20%
     45    29    68767.1548  1574    68888.0434    68753.2350   142803    0.20%
     46    33    68769.7129  1444    68888.0434    68753.2350   155608    0.20%
     47    30    68768.9111  1396    68888.0434    68753.2350   147289    0.20%
     49    21    68765.7014  1390    68888.0434    68753.2350   117776    0.20%
     51    34    68769.2352  1483    68888.0434    68753.2350   158031    0.20%
     53    36    68770.1420  1439    68888.0434    68753.2350   163598    0.20%
     56    38    68770.8482  1582    68888.0434    68755.4123   169605    0.19%
Elapsed time = 2399.44 sec. (1534709.79 ticks, tree = 1.36 MB, solutions = 1)
     58    55    68770.4348  1560    68888.0

   1353   970    68854.7798   789    68888.0434    68755.4123   919538    0.19%
   1365  1024    68863.5140   750    68888.0434    68755.4123   970341    0.19%
   1380  1015    68873.3417   608    68888.0434    68755.4123   964321    0.19%
   1408   985    68874.7429   577    68888.0434    68755.4123   925274    0.19%
   1436   998    68877.7445   475    68888.0434    68755.4123   926389    0.19%
   1455  1148    68880.0558   485    68888.0434    68756.4400  1016177    0.19%
   1472  1104    68780.1076  1472    68888.0434    68756.4400   999677    0.19%
Elapsed time = 3133.86 sec. (1728536.17 ticks, tree = 355.15 MB, solutions = 1)
   1496  1066    68786.7599  1334    68888.0434    68758.8462   988573    0.19%
   1515  1194    68854.4241   729    68888.0434    68758.8462  1062295    0.19%
   1532  1107    68785.9711  1421    68888.0434    68758.8462  1002168    0.19%
   1545  1155    68860.4994   735    68888.0434    68758.8462  1028560    0.19%
   1573  1163        cutoff          688

   4057  3188    68786.1753  1707    68888.0434    68769.8989  2474993    0.17%
   4075  3277    68778.2529  1662    68888.0434    68769.8989  2518779    0.17%
   4098  3149    68783.6134  1704    68888.0434    68769.8989  2439303    0.17%
Elapsed time = 4466.34 sec. (2084945.48 ticks, tree = 1284.27 MB, solutions = 1)
   4120  3359    68792.9361  1529    68888.0434    68769.8989  2582833    0.17%
   4135  3346    68786.4142  1742    68888.0434    68769.8989  2577025    0.17%
   4144  3325    68792.9561  1519    68888.0434    68769.8989  2562301    0.17%
   4154  3283    68781.7680  1657    68888.0434    68769.8989  2522693    0.17%
   4163  3239    68784.1079  1753    68888.0434    68769.8989  2500810    0.17%
   4171  3332    68788.4623  1508    68888.0434    68769.8989  2567129    0.17%
   4181  3329    68796.7232  1524    68888.0434    68769.8989  2565884    0.17%
   4184  3195    68798.3538  1319    68888.0434    68769.8989  2483365    0.17%
   4185  3351    68789.8170  1703    68

Elapsed time = 7560.07 sec. (4351645.39 ticks, tree = 0.26 MB, solutions = 1)
   4715     5    68822.5240  1644    68888.0434    68822.5256  3448220    0.10%
   4716     5    68828.7912  1543    68888.0434    68822.5256  3449048    0.10%
   4717     4    68838.3006  1351    68888.0434    68822.5256  3448638    0.10%
   4719     6    68830.1729  1340    68888.0434    68822.5256  3449366    0.10%
   4720     7    68823.3475  1621    68888.0434    68822.5273  3450273    0.10%
   4721     9    68854.7468   971    68888.0434    68822.5273  3456212    0.10%
   4722     7    68826.0888  1557    68888.0434    68822.5273  3450833    0.10%
   4726     8    68852.1345  1249    68888.0434    68822.5273  3452924    0.10%
   4728    12        cutoff          68888.0434    68823.3494  3472549    0.09%
   4729     9    68831.2074  1370    68888.0434    68823.3494  3457458    0.09%
Elapsed time = 7771.31 sec. (4484526.31 ticks, tree = 0.26 MB, solutions = 1)
   4730     9    68842.2874  1370    68888.0

  10256  2647    68887.0661   865    68888.0434    68876.2515  4680225    0.02%
  10364  3082        cutoff          68888.0434    68876.5271  4826551    0.02%
  10466  2928    68884.5801   711    68888.0434    68876.8872  4772573    0.02%
  10580  3190    68881.2735   792    68888.0434    68877.5924  4863002    0.02%
  10681  3323        cutoff          68888.0434    68877.6970  4896299    0.02%
  10785  3103        cutoff          68888.0434    68877.7074  4838611    0.02%
  10917  3396    68887.8427   613    68888.0434    68877.7074  4918356    0.02%
Elapsed time = 8750.97 sec. (4874093.09 ticks, tree = 876.50 MB, solutions = 1)
  11018  3260    68886.0798   699    68888.0434    68877.7074  4884165    0.02%
  11130  3670    68887.8463   628    68888.0434    68877.7074  4989675    0.02%
  11245  3671        cutoff          68888.0434    68877.7074  4992182    0.02%
  11370  3763    68887.1384   742    68888.0434    68878.2259  5030846    0.01%
  11540  3582    68886.3986   702    688

  22569  7729        cutoff          68888.0434    68883.1645  7844013    0.01%
  22701  7748    68888.0240   784    68888.0434    68883.2443  7837450    0.01%
  22837  7690    68886.6255   992    68888.0434    68883.2443  7916657    0.01%
  22981  7691        cutoff          68888.0434    68883.3917  7894147    0.01%
  23106  7677        cutoff          68888.0434    68883.3917  7945177    0.01%
Elapsed time = 9570.00 sec. (5220094.33 ticks, tree = 2169.63 MB, solutions = 1)
Nodefile size = 137.53 MB (99.53 MB after compression)
  23237  7629        cutoff          68888.0434    68883.4301  8094833    0.01%
  23361  7702        cutoff          68888.0434    68883.4301  7913938    0.01%
  23494  7655        cutoff          68888.0434    68883.5330  8075888    0.01%
  23646  7579    68887.7264   816    68888.0434    68883.5692  8191351    0.01%
  23798  7668    68887.2322   816    68888.0434    68883.6293  8089673    0.01%
  23944  7666    68887.7863   848    68888.0434    68883.6934  8

68888.04343194816

In [33]:
# Seconds-to-hours conversion. NOTE(review): 10103.39 is presumably the elapsed solve time in seconds of the run above — confirm.
10103.39 / 3600

2.806497222222222