In [1]:
import numpy as np
import pandas as pd

In [2]:
cities = pd.read_csv('cities.csv')

In [3]:
cities.head()

Unnamed: 0,CityId,X,Y
0,0,316.836739,2202.340707
1,1,4377.405972,336.602082
2,2,3454.158198,2820.053011
3,3,4688.099298,2935.898056
4,4,1010.696952,3236.750989


In [4]:
cities.tail()

Unnamed: 0,CityId,X,Y
197764,197764,149.828018,3134.756986
197765,197765,2615.299239,2267.979286
197766,197766,4775.889874,3103.846228
197767,197767,2994.230955,1931.764344
197768,197768,1354.764778,3218.100625


In [5]:
coors = cities[['X', 'Y']]

## Euclidean Distance between two coordinates:

In [6]:
coors.head()

Unnamed: 0,X,Y
0,316.836739,2202.340707
1,4377.405972,336.602082
2,3454.158198,2820.053011
3,4688.099298,2935.898056
4,1010.696952,3236.750989


We can calculate the distance between two consecutive rows as follows:

In [7]:
dist = np.sqrt((coors.X - coors.X.shift())**2 + (coors.Y - coors.Y.shift())**2)
dist.head()

0            NaN
1    4468.691432
2    2649.512214
3    1239.367061
4    3689.688402
dtype: float64

**TODO: Maybe there is a faster way?**

And the total distance:

In [8]:
np.sum(dist)

443429408.0060866

Random route:

In [9]:
idx = np.random.permutation(range(1, len(coors)))

This way the route does not include 0, which has to be the starting and the end point.

Then we can complete the route by:

In [10]:
idx = np.concatenate(([0], idx, [0]))

In [11]:
idx[:5]

array([     0, 154908, 139246,  86373, 136696])

In [12]:
idx[-5:]

array([ 41269,    645,  60276, 106613,      0])

The sequence of coordinates ordered by the route:

In [13]:
coors[['X', 'Y']].loc[idx].head()

Unnamed: 0,X,Y
0,316.836739,2202.340707
154908,2942.079072,1070.465493
139246,4542.484729,1783.395956
86373,2691.539391,2860.080421
136696,2587.084913,768.479842


In [14]:
def total_length(coors, route):
    df = coors.copy()
    
    route = np.concatenate(([0], route, [0]))
    
    df = df.loc[route]
    return np.sum(np.sqrt((df.X - df.X.shift())**2 + (df.Y - df.Y.shift())**2))

In [15]:
idx = np.random.permutation(range(1, len(coors)))

In [16]:
total_length(coors, idx)

442752673.9749419

In [17]:
total_length(coors, range(1, len(coors)))

443430860.2673667

## Modified total distance: add 10% for prime numbers

Let's just begin by testing with a smaller dataset

In [18]:
coors_sub = coors.loc[0:25] 

In [19]:
coors_sub.head()

Unnamed: 0,X,Y
0,316.836739,2202.340707
1,4377.405972,336.602082
2,3454.158198,2820.053011
3,4688.099298,2935.898056
4,1010.696952,3236.750989


In [20]:
np.random.seed(3)
idx = np.random.permutation(range(1,len(coors_sub)))
idx = np.concatenate(([0], idx, [0])) # add start and end

In [21]:
coors_sub = coors_sub.loc[idx]

In [22]:
coors_sub = coors_sub.reset_index()

In [23]:
coors_sub = coors_sub.rename(columns = {'index' : 'CityId'})

In [24]:
coors_sub['dist'] = np.sqrt((coors_sub.X - coors_sub.X.shift())**2 + (coors_sub.Y - coors_sub.Y.shift())**2)

In [25]:
coors_sub.head()

Unnamed: 0,CityId,X,Y,dist
0,0,316.836739,2202.340707,
1,19,3033.179607,515.217613,3197.64021
2,18,2352.743647,2489.939529,2088.664583
3,13,965.611152,1067.734281,1986.65657
4,24,3694.082279,734.949757,2748.69064


Drop the first row:

In [26]:
coors_sub = coors_sub.drop(0)

In [27]:
coors_sub.head()

Unnamed: 0,CityId,X,Y,dist
1,19,3033.179607,515.217613,3197.64021
2,18,2352.743647,2489.939529,2088.664583
3,13,965.611152,1067.734281,1986.65657
4,24,3694.082279,734.949757,2748.69064
5,16,4944.059453,2326.338189,2023.600771


dist[1] is the distance between row 0 and row 1, and so on, so that dist[10] is the distance of the 10th step, ecc

Every 10th step:

In [28]:
coors_sub[coors_sub.index % 10 == 0]

Unnamed: 0,CityId,X,Y,dist
10,23,3633.815728,2889.995167,192.791943
20,25,4646.266998,2884.589219,2008.1251


Check if not prime: (https://www.rookieslab.com/posts/fastest-way-to-check-if-a-number-is-prime-or-not)

**Todo: look for fastest version**

In [29]:
def not_prime(n):
    """
    Assumes that n is a positive natural number
    """
    # We know 1 is not a prime number
    if n == 1:
        return True

    i = 2
    # This will loop from 2 to int(sqrt(x))
    while i*i <= n:
        # Check if i divides x without leaving a remainder
        if n % i == 0:
            # This means that n has a factor in between 2 and sqrt(n)
            # So it is not a prime number
            return True
        i += 1
    # If we did not find any factor in the above loop,
    # then n is a prime number
    return False

Find every tenth step which is not prime:

In [30]:
coors_sub[coors_sub.CityId.apply(not_prime).values & (coors_sub.index % 10 == 0)]

Unnamed: 0,CityId,X,Y,dist
20,25,4646.266998,2884.589219,2008.1251


Find every tenth step which is not prime and increase dist by 10%.

In [31]:
idx2 = coors_sub.CityId.apply(not_prime).values & (coors_sub.index % 10 == 0)

In [32]:
idx2

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False])

In [33]:
coors_sub[idx2]

Unnamed: 0,CityId,X,Y,dist
20,25,4646.266998,2884.589219,2008.1251


In [34]:
coors_sub.loc[idx2, 'dist'] = coors_sub.loc[idx2, 'dist'] + coors_sub.loc[idx2, 'dist']/10

In [35]:
np.sum(coors_sub.dist)

60566.12398696876

In [36]:
def total_length_mod(coors, route):
    route = np.concatenate(([0], route, [0]))   
    df = coors.copy()
    df = df.loc[route].reset_index()
    df = df.rename(columns = {'index' : 'CityId'})
    df['dist'] = np.sqrt((df.X - df.X.shift())**2 + (df.Y - df.Y.shift())**2)
    df = df.drop(0)
    idx = df.CityId.apply(not_prime).values & (df.index % 10 == 0)
    df.loc[idx, 'dist'] = df.loc[idx, 'dist'] + df.loc[idx, 'dist'] / 10
    return np.sum(df['dist'])

Alternative:

In [37]:
def total_length_mod2(coors, route):
    route = np.concatenate(([0], route, [0]))   
    df = coors.copy()
    df = df.loc[route].reset_index()
    df = df.rename(columns = {'index' : 'CityId'})
    df['dist'] = np.sqrt((df.X - df.X.shift())**2 + (df.Y - df.Y.shift())**2)
    df = df.drop(0)
    idx = (df.index % 10 == 0)
    idx = df.loc[idx].CityId.apply(not_prime)
    idx = idx.index[idx.values]
    df.loc[idx, 'dist'] = df.loc[idx, 'dist'] + df.loc[idx, 'dist'] / 10
    return np.sum(df['dist'])

Check using same subset as before:

In [38]:
coors_sub = coors.loc[0:25] 

In [39]:
np.random.seed(3)
idx = np.random.permutation(range(1,26))

In [40]:
total_length_mod(coors_sub, idx)

60566.12398696876

In [41]:
total_length_mod2(coors_sub, idx)

60566.12398696876

Apply the function to the full df.

Take a random route:

In [42]:
idx = np.random.permutation(range(1, len(coors)))

In [43]:
total_length_mod(coors, idx)

447735621.4236019

In [44]:
total_length_mod2(coors, idx)

447735621.4236019

In [45]:
import timeit

In [46]:
timeit.timeit('total_length(coors, idx)', number=100, globals=globals()) # total length without penalizing non prime cities

4.4991587010081

In [47]:
timeit.timeit('total_length_mod(coors, idx)', number=100, globals=globals())

89.06319223919002

In [48]:
timeit.timeit('total_length_mod2(coors, idx)', number=100, globals=globals())

19.223640756943

Second version of modified tour length is faster