# Heuristic Performance
How well does the heuristic stack up when compared to other models?

In [1]:
import os, time, csv, random
import pandas as pd
import numpy as np
import smallest_insertion as si
import recursive_clustering as rc

## Clean Waterloo Data

### Make CSVs

In [2]:
# for file in os.listdir('waterloo_data'):
#     if not file.startswith('.'):
#         df = pd.read_csv('waterloo_data/' + file, sep=' ', index_col=[0], header=None)
#         df.to_csv('waterloo_data/{}.csv'.format(file[:-4]), index=False, header=False)

### Remove Duplicate Points

In [3]:
# for file in os.listdir('waterloo_data'):
#     if (not file.startswith('.')) and file.endswith('.csv'):
#         df = pd.read_csv('waterloo_data/' + file, header=None)
#         all_nodes = [(x, y) for x, y in zip(df[0].tolist(), df[1].tolist())]
#         m = np.unique(all_nodes, axis=0)
#         df2 = pd.DataFrame()
#         df2[0] = [x for x, y in m]
#         df2[1] = [y for x, y in m]
#         df2.head()
#         print(df2.shape)
#         df2.to_csv('waterloo_data/final/' + file, header=False, index=False)

### Organize and grab lower limit from website

In [4]:
# (file prefix, country name, lower limit) for every data set under test.
# The prefix is what the test loop matches against file names in
# 'waterloo_data/final'.
# NOTE(review): 'lower_lim' values were copied by hand from the data set's
# website; presumably tour-length bounds -- confirm the units before comparing.
_COUNTRY_ROWS = (
    ('ar', 'argentina', 837377),
    ('bm', 'burma', 959011),
    ('ca', 'canada', 1290319),
    ('ch', 'china', 4565452),
    ('dj', 'djibouti', 6656),
    ('eg', 'egypt', 172350),
    ('ei', 'ireland', 206171),
    ('fi', 'finland', 520527),
    ('gr', 'greece', 300899),
    ('ho', 'honduras', 176940),
    ('it', 'italy', 557315),
    ('ja', 'japan', 491924),
    ('kz', 'kazakhstan', 1061387),
    ('lu', 'luxembourg', 11340),
    ('mo', 'morocco', 427246),
    ('mu', 'oman', 86891),
    ('nu', 'nicaragua', 96132),
    ('pm', 'panama', 114831),
    ('qa', 'qatar', 9352),
    ('rw', 'rwanda', 26051),
    ('sw', 'sweden', 855597),
    ('tz', 'tanzania', 394609),
    ('vm', 'vietnam', 569288),
    ('wi', 'western sahara', 27603),
    ('ym', 'yemen', 238314),
    ('zi', 'zimbabwe', 95345),
)

# One independent, mutable record per prefix; results are added to these
# records later, so each entry must be its own dict.
country_info = {
    prefix: {'country': name, 'lower_lim': lower_limit}
    for prefix, name, lower_limit in _COUNTRY_ROWS
}

## Run tests

### Test Waterloo Data

In [5]:
# Run the recursive-clustering heuristic on every data set in
# 'waterloo_data/final' and store the outcome on the matching country_info
# entry under the keys 'N', 'rc_path' and 'rc_time'.
data_dir = 'waterloo_data/final'

# K is forwarded unchanged as the second argument of
# rc.implement_recursive_clustering; it does not vary between data sets.
K = 4

# List the directory once instead of once per country prefix.
data_files = os.listdir(data_dir)

for k, country_dict in country_info.items():
    for file in data_files:
        # Data files are matched to a country by their file-name prefix.
        if not file.startswith(k):
            continue

        # Each CSV row is one point; the first two columns are parsed as
        # floats. Blank rows (csv.reader yields [] for them) are skipped
        # rather than raising IndexError.
        with open(os.path.join(data_dir, file), newline='') as f:
            all_nodes = [
                np.array([float(row[0]), float(row[1])])
                for row in csv.reader(f)
                if row
            ]
        N = len(all_nodes)

        country_dict['N'] = N

        # recursive clustering
        # NOTE(review): the first returned value is discarded; the meaning of
        # the second and third is taken from the key names only -- confirm
        # against recursive_clustering.
        _, country_dict['rc_path'], country_dict['rc_time'] = rc.implement_recursive_clustering(N, K, all_nodes)

        # Progress marker: printed once per data set actually processed,
        # not once per directory entry.
        print(k)

KeyboardInterrupt: 

In [None]:
# DataFrame(country_info) uses the country prefixes as columns and the
# per-country fields as the index; transpose so each country becomes a row.
results_by_field = pd.DataFrame(country_info)
df = results_by_field.transpose()

# Persist the table (index included) next to the notebook.
df.to_csv('waterloo_results.csv')

### Compare to SI

In [11]:
# Head-to-head comparison on random instances: for each trial, build N random
# points, solve them with both modules, and record the two reported lengths.
# NOTE(review): the section heading says "SI", but the baseline called here is
# si.nearest_neighbor -- confirm this is the intended comparison.
N = 100
number_of_tests = 1000
test_results = []

for trial in range(number_of_tests):
    # Fresh instance: x is drawn before y for every point, each from
    # random.random().
    all_nodes = []
    for _ in range(N):
        x_coord = random.random()
        y_coord = random.random()
        all_nodes.append([x_coord, y_coord])

    # Only the second value returned by each solver is kept.
    _, rc_length, _ = rc.solve_array(all_nodes, K=4, draw=False)
    _, si_length = si.nearest_neighbor(all_nodes)

    print(trial, end=', ')
    test_results.append([rc_length, si_length])

# Column 0 holds the recursive-clustering lengths, column 1 the baseline's.
t = np.array(test_results)
rc_lengths = t[:, 0]
si_lengths = t[:, 1]
new_algorithm_better = rc_lengths < si_lengths

# Percentage of trials in which recursive clustering was strictly shorter
# (ties count against it). Left as the cell's final expression so the
# notebook displays it.
new_algorithm_better.sum() / number_of_tests * 100

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 

60.0