### Import files

In [1]:
import pandas as pd
from sklearn import preprocessing
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import random
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pylab
import copy

### Set settings:

In [2]:
settings_feature_selection = {'min_correlation_with_y':0.1,
                              'max_correlation_with_other_x':0.75,
                              'max_avg_multicor': 10,
                              'max_vif_value':5,
                              'normalize_before_corr':True}

### Open files

In [3]:
data = pd.read_csv('data/data_after_cleaning.csv')
data['last_start_time']= data['last_start_time']/1000

if settings_feature_selection['normalize_before_corr'] == True:
    x = data.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    data = pd.DataFrame(x_scaled,columns=data.columns)

In [4]:
data = data.drop(columns=['last_start_time'])

In [5]:
data.head()

Unnamed: 0,ETHBTC__technical_analysis_candles__rsi,ETHBTC__technical_analysis_candles__macd,ETHBTC__technical_analysis_candles__signal,ETHBTC__technical_analysis_candles__macdhist,ETHBTC__technical_analysis_candles__sma_5,ETHBTC__technical_analysis_candles__sma_10,ETHBTC__technical_analysis_candles__sma_21,ETHBTC__technical_analysis_candles__sma_50,ETHBTC__technical_analysis_candles__sma_100,ETHBTC__technical_analysis_candles__sma_200,...,general_info__exchange_info__Turkish_Lira,general_info__exchange_info__New_Taiwan_Dollar,general_info__exchange_info__Ukrainian_hryvnia,general_info__exchange_info__Venezuelan_bolivar_fuerte,general_info__exchange_info__Vietnamese_dong,general_info__exchange_info__South_African_Rand,general_info__exchange_info__IMF_Special_Drawing_Rights,general_info__exchange_info__Silver_Troy_Ounce,general_info__exchange_info__Gold_Troy_Ounce,close_price_next_min
0,0.36791,0.438206,0.432895,0.553331,0.186357,0.188278,0.188984,0.189844,0.185346,0.179431,...,0.869085,0.890474,0.882885,0.889371,0.886392,0.874944,0.884926,0.856091,0.89619,0.186183
1,0.364153,0.435847,0.431298,0.556706,0.185377,0.188057,0.188695,0.189741,0.185407,0.179474,...,0.869068,0.890321,0.882856,0.889343,0.886865,0.874677,0.884899,0.856062,0.896434,0.184355
2,0.360068,0.433964,0.429563,0.558169,0.184961,0.187589,0.188488,0.189661,0.185494,0.179521,...,0.86921,0.890461,0.883001,0.889483,0.887005,0.874826,0.885037,0.85621,0.896678,0.184233
3,0.309854,0.431152,0.427495,0.561753,0.184055,0.186839,0.188282,0.189556,0.185611,0.179575,...,0.869416,0.890666,0.883211,0.889687,0.887208,0.875044,0.885238,0.85643,0.896922,0.184111
4,0.306204,0.428967,0.42531,0.56299,0.183296,0.186162,0.188022,0.189468,0.185754,0.17962,...,0.869729,0.890975,0.883529,0.889994,0.887515,0.875372,0.885542,0.856762,0.897167,0.182527


### VIF normal

In [34]:
#to overcome /0 problems
data = data.replace(0,0.000000000000000000000000000000000000000000000000000000000000000000000001)

In [35]:
features = "+".join([column for column in data if "close_price_next_min" not in column])
# get y and X dataframes based on this regression:
y, X = dmatrices('close_price_next_min ~' + features, data, return_type='dataframe')

vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


In [38]:
standard_vif_df = vif.loc[(vif['VIF Factor'] < 5)]

In [25]:
from patsy import dmatrices
import time
from statsmodels.stats.outliers_influence import variance_inflation_factor
columns_to_analyze = list(data.columns) #the columns that have high enough cor

removed_columns = {}
while True:
    start =time.time()
    features = "+".join([column for column in columns_to_analyze if "price_next_min" not in column])
    y, X = dmatrices('close_price_next_min ~' + features, data, return_type='dataframe')
    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif["features"] = X.columns
    vif = vif.loc[vif['features'] !='Intercept']
    x = vif.loc[vif['VIF Factor'].idxmax()]
    if vif['VIF Factor'].max() > settings_feature_selection["max_vif_value"]:
        print(f"remove the column: {x['features']} with a score of: {x['VIF Factor']}")
        removed_columns[x['features']] = x['VIF Factor']
        stop = time.time()
        print(len(columns_to_analyze),'features left |', 'the duration of this iteration is:', (stop-start)/60, 'min')
        print("------------------------")
    else:
        break
    columns_to_analyze.remove(x['features'])
                            

remove the column: general_info__econ_bitcoin_aggregator__V2Tone_4_1_hour with a score of: 218314005883.48907
345 features left | the duration of this iteration is: 11.915785415967305 min
------------------------
remove the column: last_start_time with a score of: 5332859239041.439
344 features left | the duration of this iteration is: 11.738107200463613 min
------------------------
remove the column: ETHBTC__ticker_info__open_time with a score of: inf
343 features left | the duration of this iteration is: 11.706211499373119 min
------------------------
remove the column: ETHBTC__technical_analysis_candles__macd with a score of: inf
342 features left | the duration of this iteration is: 11.540544180075328 min
------------------------
remove the column: ETH__events_aggregator__NumArticles_1_hour with a score of: inf
341 features left | the duration of this iteration is: 11.450629103183747 min
------------------------
remove the column: ETH__events_aggregator__NumArticles_8_hour with a s

remove the column: general_info__exchange_info__IMF_Special_Drawing_Rights with a score of: 124581.64537713803
303 features left | the duration of this iteration is: 8.600137054920197 min
------------------------
remove the column: general_info__exchange_info__Swiss_Franc with a score of: 106824.04755830881
302 features left | the duration of this iteration is: 8.487334458033244 min
------------------------
remove the column: ETHBTC__ticker_info__close_price with a score of: 97551.96903569571
301 features left | the duration of this iteration is: 8.383790330092113 min
------------------------
remove the column: general_info__biggest_coin_streamer__BTCUSDC with a score of: 96178.23106262455
300 features left | the duration of this iteration is: 8.264431794484457 min
------------------------
remove the column: general_info__exchange_info__Sri_Lankan_Rupee with a score of: 79217.54412938944
299 features left | the duration of this iteration is: 8.218907050291698 min
----------------------

remove the column: general_info__exchange_info__Pakistani_Rupee with a score of: 18531.19581173593
262 features left | the duration of this iteration is: 6.229498000939687 min
------------------------
remove the column: general_info__exchange_info__British_Pound_Sterling with a score of: 17332.724626318082
261 features left | the duration of this iteration is: 5.992849508921306 min
------------------------
remove the column: general_info__biggest_coin_streamer__FETUSDT with a score of: 17282.682237644265
260 features left | the duration of this iteration is: 5.910018702348073 min
------------------------
remove the column: general_info__exchange_info__Indian_Rupee with a score of: 16345.997669279412
259 features left | the duration of this iteration is: 5.950861318906148 min
------------------------
remove the column: ETHBTC__technical_analysis_candles__sma_50 with a score of: 16086.732776812027
258 features left | the duration of this iteration is: 5.899356226126353 min
--------------

remove the column: ETHBTC__general_info__market_cap with a score of: 1380.5285470948083
221 features left | the duration of this iteration is: 4.961978880564372 min
------------------------
remove the column: general_info__exchange_info__Norwegian_Krone with a score of: 1352.429606272879
220 features left | the duration of this iteration is: 4.855188306172689 min
------------------------
remove the column: general_info__stock_exchange_index__STI_Index with a score of: 1322.4116336690322
219 features left | the duration of this iteration is: 4.889657886823018 min
------------------------
remove the column: general_info__exchange_info__Ether with a score of: 1241.1267504228226
218 features left | the duration of this iteration is: 4.765245008468628 min
------------------------
remove the column: general_info__biggest_coin_streamer__EOSBTC with a score of: 1157.686391267846
217 features left | the duration of this iteration is: 4.726613847414653 min
------------------------
remove the col

remove the column: general_info__econ_bitcoin_aggregator__V2Tone_4_24_hour with a score of: 217.75794463409818
180 features left | the duration of this iteration is: 3.1442635456720986 min
------------------------
remove the column: general_info__events_aggregator__NumMentions_24_hour with a score of: 209.7441077922177
179 features left | the duration of this iteration is: 3.06132489045461 min
------------------------
remove the column: general_info__biggest_coin_streamer__NEOBTC with a score of: 197.66579767223774
178 features left | the duration of this iteration is: 3.0363722443580627 min
------------------------
remove the column: ETHBTC__ticker_info__taker_buy_asset_volume with a score of: 196.8862610898271
177 features left | the duration of this iteration is: 2.98648677666982 min
------------------------
remove the column: general_info__stock_exchange_index__BSE_SENSEX with a score of: 181.6605689733975
176 features left | the duration of this iteration is: 2.9254308382670087 mi

remove the column: general_info__biggest_coin_streamer__MATICBTC with a score of: 26.889496568314588
139 features left | the duration of this iteration is: 1.7165942986806233 min
------------------------
remove the column: general_info__events_aggregator__AvgTone_7_days with a score of: 26.121012197467824
138 features left | the duration of this iteration is: 1.6703522364298502 min
------------------------
remove the column: general_info__exchange_info__XRP with a score of: 24.955464443519983
137 features left | the duration of this iteration is: 1.627112909158071 min
------------------------
remove the column: general_info__econ_bitcoin_aggregator__V2Tone_5_8_hour with a score of: 22.972347337014146
136 features left | the duration of this iteration is: 1.5684762835502624 min
------------------------
remove the column: general_info__events_aggregator__NumMentions_7_days with a score of: 20.091036660047948
135 features left | the duration of this iteration is: 1.5599518219629924 min
--

In [46]:
list(vif['features'])


['ETHBTC__technical_analysis_candles__rsi',
 'ETHBTC__technical_analysis_candles__signal',
 'ETHBTC__technical_analysis_candles__macdhist',
 'ETHBTC__ticker_info__number_trades',
 'ETHBTC__ticker_info__taker_buy_quote_asset_volume',
 'ETHBTC__ticker_info__bidQty',
 'ETHBTC__ticker_info__askQty',
 'ETHBTC__general_info__market_cap_change_24h',
 'ETH__events_aggregator__AvgTone_1_hour',
 'ETH__events_aggregator__AvgTone_8_hour',
 'ETH__events_aggregator__AvgTone_24_hour',
 'ETH__events_aggregator__AvgTone_7_days',
 'ETH__events_aggregator__NumSources_7_days',
 'ETH__events_aggregator__NumMentions_1_hour',
 'ETH__events_aggregator__NumMentions_8_hour',
 'ETH__events_aggregator__GoldsteinScale_1_hour',
 'ETH__events_aggregator__GoldsteinScale_8_hour',
 'ETH__events_aggregator__GoldsteinScale_24_hour',
 'ETH__events_aggregator__GoldsteinScale_7_days',
 'ETH__events_aggregator__count_1_hour',
 'ETH__events_aggregator__count_8_hour',
 'ETH__events_aggregator__count_24_hour',
 'ETH__twitter_in

<br>
<br>
<br>
<br><br>
<br>
<br>
<br><br>
<br>
<br>


<br><br>
<br>
<br>
<br><br>
<br>
<br>
<br><br>
<br>
<br>
<br>

### Clustering

In [6]:
class solver:
    def __init__(self, G, delta = 1.0/44):
        """
        Args:
        delta: "cleanness" parameter. Defaults to the assumed value of 1/44 given in the paper
        """
        self.__G__ = G
        self.__reset_caches__()
        self.__clusters__ = None
        self.__delta__ = delta
    def __reset_caches__(self):
        self.__G_nodes__ = set(self.__G__.nodes())
        self.__N_plus_cache__ = dict()
    def __remove_cluster__(self, C):
        self.__G__.remove_nodes_from(C)
        self.__reset_caches__()
    def positive_neighbours(self, u):
        """
        Returns N+(u), or {u} U {v : e(u, v) = +}
        Args:
            G: a networkx graph where presence of edges indicates a + edge
            u: a node in G
        """
        if u in self.__N_plus_cache__:
            return self.__N_plus_cache__[u]
        res = set([u])
        for i in self.__G__.neighbors(u):
            res.add(i)
        self.__N_plus_cache__[u] = res
        return res

    def delta_good(self, v, C, delta):
        """
        Returns true if v is delta-good with respect to C, where C is a cluster in
        G
        Args:
            G: a networkx graph
            v: a vertex v in G
            C: a set of vertices in G
            delta: "cleanness" parameter
        """
        Nv = self.positive_neighbours(v)
        return (len(Nv & C) >= (1.0 - delta) * len(C) and
                len(Nv & (self.__G_nodes__ - C)) <= delta * len(C))

    def run(self):
        """
        Runs the "cautious algorithm" from the paper.
        """
        if self.__clusters__ is None:
            self.__clusters__ = []
            while len(self.__G_nodes__) > 0:
                # Make sure we try all the vertices until we run out
                vs = random.sample(self.__G_nodes__, len(self.__G_nodes__))
                Av = None
                for v in vs:
                    Av = self.positive_neighbours(v).copy()
                    # Vertex removal step
                    for x in self.positive_neighbours(v):
                        if not self.delta_good(x, Av, 3 * self.__delta__):
                            Av.remove(x)
                    # Vertex addition step
                    Y = set(y for y in self.__G_nodes__
                              if self.delta_good(y, Av, 7 * self.__delta__))
                    Av = Av | Y
                    if len(Av) > 0:
                        break
               # Second quit condition: all sets Av are empty
                if len(Av) == 0:
                    break
                self.__clusters__.append(Av)
                self.__remove_cluster__(Av)
            # add all remaining vertices as singleton clusters
            for v in self.__G_nodes__:
                self.__clusters__.append(set([v]))
        return self.__clusters__

In [7]:
'''
This prework function starts with dataset, deletes all features with lower correlation than threshold
than turns a correlation matrix into an adjacency matrix for further process. also returns matrix for omvormer
'''
def prework(data, threshold_correlation):
    #create new corr matrix:
    correlations = data.corr(method='pearson').abs()
    #turn corr matrix to numpy
    numpy_correlations = correlations.to_numpy()
    np.fill_diagonal(numpy_correlations,0)
    #turn numpy to adjacency (1 or 0 depanding on threshold)
    adjancency_matrix = np.where(numpy_correlations > threshold_correlation,1,0)
    np.fill_diagonal(adjancency_matrix,0)
    return adjancency_matrix,numpy_correlations,correlations

In [8]:
'''
initiates clustering, takes adjacency matrix an creates a clustered graph
'''
def create_cluster(adjancency_matrix):    
    G=nx.from_numpy_matrix(adjancency_matrix)
    
#     nx.draw(G)
#     pylab.figure(2)
#     pos=nx.spring_layout(G)
#     nx.draw(G,pos, with_labels=True) 
    # nx.draw_edges
#     # show graphs
#     pylab.show()
    
    return G

In [9]:
adj_matrix,numpy_matrix, corr_matrix = prework(data,0.75)

In [10]:
columns,omvormer,counter =corr_matrix.columns, {}, 0
for i in columns:
    omvormer[counter] = i
    counter += 1
# write_as_log2(f'omvormer_dict_index_column_{len(columns)}', omvormer)
# omvormer

In [11]:
# my_df = pd.DataFrame(clusters_omgevormd)


In [12]:
def cautiousAlg(numpy_matrix_local, threshold_local):
    #bouw adj matrix from numpy using thres
    adj_matrix = np.where(numpy_matrix_local > threshold_local,1,0)
    np.fill_diagonal(adj_matrix,0)
    
    G = create_cluster(adj_matrix)
    b = solver(G)
    clusters = b.run()
    clusters_flat = [[i for i in clusters[j]] for j in range(len(clusters))]
#     print(len(clusters_flat))
    return clusters_flat

In [13]:
cluster = cautiousAlg(numpy_matrix,threshold_local=0.75)
cluster

[[145, 18, 114, 146, 24, 42, 111],
 [65],
 [23, 25, 27, 28],
 [194],
 [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 [154, 147, 148, 149],
 [75, 76],
 [67],
 [137],
 [82],
 [77],
 [183],
 [104],
 [81, 85, 89],
 [30],
 [155, 156, 157],
 [72],
 [3],
 [189, 190],
 [136],
 [138],
 [49, 53, 57],
 [130, 134],
 [112, 110],
 [143,
  144,
  151,
  153,
  219,
  158,
  159,
  160,
  161,
  162,
  163,
  164,
  165,
  166,
  167,
  168,
  169,
  170,
  223,
  172,
  173,
  174,
  179],
 [94],
 [225, 222],
 [185],
 [193],
 [50, 54, 58],
 [192],
 [48, 52, 56],
 [73],
 [218],
 [224, 228, 221],
 [210],
 [96],
 [242, 227, 229, 231, 232, 234, 239],
 [99],
 [267, 237, 246, 248],
 [26],
 [202],
 [128, 132, 124],
 [216, 217],
 [91],
 [97],
 [119],
 [105],
 [263],
 [257],
 [63],
 [115, 118],
 [79],
 [139],
 [120],
 [288],
 [182],
 [95],
 [215],
 [188],
 [265],
 [113],
 [108, 109],
 [1, 2],
 [176],
 [44],
 [78],
 [98],
 [101],
 [171],
 [121],
 [274],
 [47],
 [126],
 [100],
 [131, 123, 127],
 [177],
 [

In [14]:
def find_next_merge(clusters_flat_local):
    loc_minMatrix = []   
    supercountert = 0
    for a in clusters_flat_local:
        supercountert += 1
        loc_minVector = []
        countert = 0
        for b in clusters_flat_local:

            loc_min=10^10000
            if b != a:
                countert+=1
                for i in a: #only for A
                    for j in b: #only for B
        #                 print(i,j)
        #                 print(numpy_matrix[i][j]) #take minimum
                        if numpy_matrix[i][j] < loc_min:
                            loc_min = numpy_matrix[i][j] 
        #         print(loc_min)
                loc_minVector.append([loc_min,countert])
        loc_minMatrix.append([max(loc_minVector),supercountert])
    next_merge = [max(loc_minMatrix)[0][1],max(loc_minMatrix)[1],max(loc_minMatrix)[0][0]]
    return next_merge

In [15]:
def merge_clusters(flat_clusters_local,a,b):
    test_clusters = copy.deepcopy(flat_clusters_local)
    for i in test_clusters[b]:
        test_clusters[a].append(i) 
    del test_clusters[b]
    new_clusters = test_clusters

    return new_clusters

In [16]:
def merge_on_thres(clusters_local,thres):
    threshold_loc = 1
    while threshold_loc > thres:
        [a,b,threshold_loc] = find_next_merge(clusters_local)
#         print([a,b,threshold_loc])
        clusters_local = merge_clusters(clusters_local,a,b)
#     print(len(clusters_local))
    return clusters_local

In [17]:
clusters_omgevormd = [] 
for i in range(len(cluster)):
    clusters_omgevormd.append([omvormer.get(item,item) for item in cluster[i]])
len(clusters_omgevormd)

255

In [18]:
omvormer

{0: 'ETHBTC__technical_analysis_candles__rsi',
 1: 'ETHBTC__technical_analysis_candles__macd',
 2: 'ETHBTC__technical_analysis_candles__signal',
 3: 'ETHBTC__technical_analysis_candles__macdhist',
 4: 'ETHBTC__technical_analysis_candles__sma_5',
 5: 'ETHBTC__technical_analysis_candles__sma_10',
 6: 'ETHBTC__technical_analysis_candles__sma_21',
 7: 'ETHBTC__technical_analysis_candles__sma_50',
 8: 'ETHBTC__technical_analysis_candles__sma_100',
 9: 'ETHBTC__technical_analysis_candles__sma_200',
 10: 'ETHBTC__technical_analysis_candles__sma_1440',
 11: 'ETHBTC__technical_analysis_candles__ema_5',
 12: 'ETHBTC__technical_analysis_candles__ema_13',
 13: 'ETHBTC__technical_analysis_candles__ema_20',
 14: 'ETHBTC__technical_analysis_candles__ema_40',
 15: 'ETHBTC__technical_analysis_candles__ema_55',
 16: 'ETHBTC__technical_analysis_candles__ema_200',
 17: 'ETHBTC__technical_analysis_candles__ema_1440',
 18: 'ETHBTC__ticker_info__open_time',
 19: 'ETHBTC__ticker_info__open',
 20: 'ETHBTC__tic

In [19]:
# np.save('omvormer.npy', omvormer) 

In [20]:
'''
output order:
0: correlation threshold
1: len(best_cautious_clusters)
2: len(bijbehordende_greedy_optimalization)
3: klusters:    best_cautious_clusters
4: klusters:    bijbehordende_greedy_optimalization
5: len(bijbehordende_cautious)
6: len(beste_greedy_optimizer)
7: klusters:    bijbehordende_cautious
8: klusters:    beste_greedy_optimizer
'''
clusterMatrix = []
clusterVector = []
for cautiousThres in np.arange(0.75,0.951,0.05):
    print(cautiousThres)
    
    smallest = 10000
    smallest_cautious= 100000
    smallest_cluster=[]
    len_smallest_initial=0
    smallest_cautious_greedy=0    
    for it in range(400):
        initial_cluster = cautiousAlg(numpy_matrix,cautiousThres)
        if len(initial_cluster) < smallest_cautious:
            smallest_cautious = len(initial_cluster)
            smallest_cautious_cluster = initial_cluster
            smallest_cautious_greedy = merge_on_thres(initial_cluster,cautiousThres-0.05)
        final_cluster = merge_on_thres(initial_cluster,cautiousThres-0.05)
        if len(final_cluster) <smallest:
            len_smallest_initial = len(initial_cluster)
            smallest_initial=initial_cluster
            smallest = len(final_cluster)
            smallest_cluster = final_cluster 
        clusterMatrix.append([cautiousThres,it,len(initial_cluster),len(final_cluster),final_cluster])
              
    clusterVector.append([cautiousThres,smallest_cautious,len(smallest_cautious_greedy),smallest_cautious_cluster,smallest_cautious_greedy,len_smallest_initial,smallest,smallest_initial, smallest_cluster])
# print(clusterMatrix[0:2])

0.75
0.8
0.8500000000000001
0.9000000000000001
0.9500000000000002


In [21]:
for i in range(len(clusterVector)):
    print(clusterVector[i][0:3],clusterVector[i][5:7])

[0.75, 179, 112] [179, 112]
[0.8, 167, 127] [185, 125]
[0.8500000000000001, 215, 142] [243, 139]
[0.9000000000000001, 291, 158] [291, 153]
[0.9500000000000002, 299, 174] [299, 161]


In [149]:
'''
output order:
0: correlation threshold
1: len(best_cautious_clusters)
2: len(bijbehordende_greedy_optimalization)
3: klusters:    best_cautious_clusters
4: klusters:    bijbehordende_greedy_optimalization
5: len(bijbehordende_cautious)
6: len(beste_greedy_optimizer)
7: klusters:    bijbehordende_cautious
8: klusters:    beste_greedy_optimizer
'''
clusterMatrix = []
clusterVector = []
for cautiousThres in np.arange(0.75,0.951,0.05):
    print(cautiousThres)
    
    smallest = 10000
    smallest_cautious= 100000
    smallest_cluster=[]
    len_smallest_initial=0
    smallest_cautious_greedy=0    
    for it in range(250):
        initial_cluster = cautiousAlg(numpy_matrix,cautiousThres)
        if len(initial_cluster) < smallest_cautious:
            smallest_cautious = len(initial_cluster)
            smallest_cautious_cluster = initial_cluster
            smallest_cautious_greedy = merge_on_thres(initial_cluster,0.7)
        final_cluster = merge_on_thres(initial_cluster,0.7)
        if len(final_cluster) <smallest:
            len_smallest_initial = len(initial_cluster)
            smallest_initial=initial_cluster
            smallest = len(final_cluster)
            smallest_cluster = final_cluster 
        clusterMatrix.append([cautiousThres,it,len(initial_cluster),len(final_cluster),final_cluster])
              
    clusterVector.append([cautiousThres,smallest_cautious,len(smallest_cautious_greedy),smallest_cautious_cluster,smallest_cautious_greedy,len_smallest_initial,smallest,smallest_initial, smallest_cluster])
# print(clusterMatrix[0:2])

0.7
0.75
0.8
0.8500000000000001
0.9000000000000001
0.9500000000000002


In [162]:
for i in range(len(clusterVector)):
    print(clusterVector[i][0:3],clusterVector[i][5:7])

[0.7, 289, 125] [290, 117]
[0.75, 179, 121] [185, 115]
[0.8, 172, 115] [172, 115]
[0.8500000000000001, 215, 128] [258, 121]
[0.9000000000000001, 292, 125] [292, 117]
[0.9500000000000002, 300, 126] [300, 121]


In [180]:
clusterVector

[[0.7,
  289,
  125,
  [[120],
   [66],
   [130, 134, 285],
   [42, 110, 111, 112, 18, 114, 147, 24],
   [214],
   [100],
   [188],
   [61],
   [288],
   [4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16],
   [218],
   [75, 76],
   [1, 2],
   [38, 39, 40, 41],
   [108, 109],
   [104],
   [105],
   [50, 54, 58],
   [73],
   [62],
   [267, 237, 246, 248],
   [94],
   [146],
   [138],
   [60],
   [23, 25, 27, 28],
   [119],
   [88],
   [79],
   [82],
   [171],
   [257],
   [10],
   [106],
   [260],
   [47],
   [211],
   [262],
   [102],
   [137],
   [131, 123, 127],
   [192],
   [122],
   [57, 53],
   [68],
   [189, 190],
   [0],
   [30],
   [96],
   [93],
   [265],
   [121],
   [32],
   [178, 181],
   [84],
   [139],
   [136],
   [97],
   [51],
   [44],
   [140, 141],
   [180],
   [135],
   [187],
   [182],
   [64],
   [98, 99],
   [71],
   [198],
   [126],
   [184],
   [186],
   [191, 199],
   [195],
   [196],
   [26],
   [95],
   [128, 132, 124],
   [74],
   [90, 86],
   [77],
   [48, 52, 56],

In [165]:
# pd.DataFrame(clusterVector).to_csv("clusterVector.csv",index=False)
clusterdataframe= pd.read_csv('clusterVector.csv')  

In [166]:
clusterdataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.7,289,125,"[[120], [66], [130, 134, 285], [42, 110, 111, ...","[[120], [66], [130, 134, 285], [42, 110, 111, ...",290,117,"[[288], [130, 134, 285], [4, 5, 6, 7, 8, 9, 11...","[[288], [130, 134, 285], [4, 5, 6, 7, 8, 9, 11..."
1,0.75,179,121,"[[51], [183], [42, 111, 145, 18, 114, 146, 147...","[[51], [183], [42, 111, 145, 18, 114, 146, 147...",185,115,"[[145, 18, 146, 147, 148, 149, 24, 154, 155, 1...","[[145, 18, 146, 147, 148, 149, 24, 154, 155, 1..."
2,0.8,172,115,"[[183], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...","[[183], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...",172,115,"[[183], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...","[[183], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,..."
3,0.85,215,128,"[[49, 53, 57], [128, 132], [146], [4, 5, 6, 7,...","[[49, 53, 57], [128, 132], [175, 81], [3], [65...",258,121,"[[182], [144, 143], [145, 146, 147, 148, 149, ...","[[182], [144, 143], [106], [26], [191], [186],..."
4,0.9,292,125,"[[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...","[[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...",292,117,"[[103], [122], [265], [96], [129, 133], [80], ...","[[103], [122], [265], [96], [125], [81], [4, 5..."
5,0.95,300,126,"[[140], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...","[[140], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...",300,121,"[[68], [109], [84], [117], [274], [88], [101],...","[[68], [109], [117], [274], [88], [101], [99],..."
