# Simulation template

In this notebook we run the machine learning analysis of topological phase transitions occurring in both nearest-neighbour SSH models (ssh1) and second-neighbour SSH models (ssh2), as described in the paper [Machine learning topological phases in real space](https://arxiv.org/abs/1901.01963).

## Defining parameters

In [1]:
%%time
%load_ext autoreload
%autoreload 2
from simulation import *

CPU times: user 707 ms, sys: 307 ms, total: 1.01 s
Wall time: 589 ms


In [2]:
%%time
### Default parameters
# Every name assigned here is assigned again by the "# Parameters" cell that
# follows in this notebook, so these values act as defaults only.

# --- Positional arguments of Simulation(...) -------------------------------
csv_path = SSH1_PERIODIC_100_6561_CSV
simulation_dir = SSH1_PERIODIC_100_6561_SIMULATION_DIR
model_name = "DecisionTreeClassifier"
model_kw = {"criterion": "entropy"}
allowed_windings = [0, 1]
val_split = 0.9
features_to_use = None
shuffle_features = False
random_state = 137

# --- Positional arguments of simulation.run_simulation(...) ----------------
n_experiments = 5
start_n = 0
fit_params = None
pred_params = None
shuffle_rows = True
random_features = False

# WARNING: keep this False unless the dataset is small -- setting it to True
# will flood your memory.
store_in_lists = False

# Flags that control what will be saved.
save_eigenvector = True
save_hamiltonian = True
save_accuracy = True
save_models = True

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.48 µs


In [3]:
# Parameters
# Values for this run; each assignment replaces the default of the same name
# set in the previous cell.

# --- Positional arguments of Simulation(...) -------------------------------
csv_path = "/home/rio/ssh_csvs/ssh2/periodic_220_6561.csv"
simulation_dir = "/home/rio/ssh_simulations/ssh2/periodic_220_6561"
model_name = "RandomForestClassifier"
model_kw = {"criterion": "entropy", "n_estimators": 25, "n_jobs": -1}
allowed_windings = [-1, 0, 1, 2]
val_split = 0.5
features_to_use = None
shuffle_features = False
random_state = 401

# --- Positional arguments of simulation.run_simulation(...) ----------------
n_experiments = 5
start_n = 0
fit_params = None
pred_params = None
shuffle_rows = True
random_features = False
store_in_lists = False

# Flags that control what will be saved.
save_eigenvector = True
save_hamiltonian = True
save_accuracy = True
save_models = True


In [4]:
%%time
# Start a Simulation instance from the notebook parameters.
# The estimator is selected through `model_name` / `model_kw`, so no estimator
# object is built in this cell (a previously constructed DecisionTreeClassifier
# was never passed to Simulation and did not match the configured model).
simulation = Simulation(
    csv_path,
    model_name,
    model_kw,
    allowed_windings,
    simulation_dir,
    val_split,
    features_to_use,
    shuffle_features,
    random_state,
)

# Summarise the loaded frame, then show its first rows as the cell output.
print("Info on all data: \n")
simulation.dataframe.info()
simulation.dataframe.head()

  mask |= (ar1 == a)


Info on all data: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1443420 entries, 0 to 1443419
Columns: 228 entries, id to feat219
dtypes: float64(223), int32(3), object(2)
memory usage: 2.4+ GB
CPU times: user 25.9 s, sys: 1.2 s, total: 27.1 s
Wall time: 27.1 s


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat210,feat211,feat212,feat213,feat214,feat215,feat216,feat217,feat218,feat219
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.130846,-0.300844,...,0.080274,0.115552,0.174807,-0.244872,-0.319478,0.059064,0.000568,0.244891,0.300844,-0.130846
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,-0.130846,-0.300844,...,-0.080274,0.115552,-0.174807,-0.244872,0.319478,0.059064,-0.000568,0.244891,-0.300844,-0.130846
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,-0.139243,-0.040832,...,0.177651,-0.085976,-0.069808,-0.109102,-0.140759,0.184935,0.184071,-0.05239,-0.040832,-0.139243
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.048483,0.015611,...,-0.065026,0.04435,0.035714,0.029502,0.046097,-0.070102,-0.070512,0.025241,0.015611,0.048483
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,-0.005059,0.009352,...,-0.013734,0.009909,0.002329,0.00825,0.011761,-0.009332,-0.00272,-0.004584,-0.009352,0.005059


#### Checking initialization

In [5]:
%%time
# Read the three size attributes off the simulation in one go.
n_features, n_hamiltonians, n_ts = (
    simulation.n_features,
    simulation.n_hamiltonians,
    simulation.n_ts,
)

# Echo each value next to its label.
for _label, _value in (
    ("n_features: ", n_features),
    ("n_hamiltonians: ", n_hamiltonians),
    ("n_ts: ", n_ts),
):
    print(_label, _value)

n_features:  220
n_hamiltonians:  6561
n_ts:  2
CPU times: user 130 µs, sys: 7 µs, total: 137 µs
Wall time: 92.3 µs


In [6]:
%%time
# Row-level split sizes.
# One value_counts() pass over the `type_of` column replaces three boolean-mask
# selections, each of which materialised a filtered copy of the frame only to
# take its length. A label that does not occur counts as 0; int() keeps the
# counts plain Python ints.
n_total = len(simulation.dataframe)
rows_per_split = simulation.dataframe.type_of.value_counts()
n_train = int(rows_per_split.get("train", 0))
n_val = int(rows_per_split.get("val", 0))
n_test = int(rows_per_split.get("test", 0))

# Hamiltonian-level split sizes, taken from the id lists held by the simulation.
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

# Fractions of rows per split (printed as fractions of 1, not percentages).
print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.5450388660265203
% val:  0.0
% test:  0.45496113397347965
% train + val + test:  1.0


number of train hamiltonians:  3576
number of val hamiltonians:  0
number of test hamiltonians:  2985
total number of hamiltonians:  6561


train ids:  [41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315

## Running a simulation

In [7]:
%%time
# Run the experiments. Arguments are passed positionally, in the same order as
# before; the trailing save_* flags control what will be saved.
simulation.run_simulation(
    n_experiments,
    start_n,
    fit_params,
    shuffle_rows,
    pred_params,
    random_features,
    store_in_lists,
    save_eigenvector,
    save_hamiltonian,
    save_accuracy,
    save_models,
)

# Show the first ten rows of the frame as the cell output.
simulation.dataframe.head(10)

running experiments:   0%|          | 0/5 [00:00<?, ?it/s]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  20%|██        | 1/5 [01:12<04:49, 72.26s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  40%|████      | 2/5 [02:24<03:36, 72.15s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  60%|██████    | 3/5 [03:36<02:24, 72.19s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  80%|████████  | 4/5 [04:49<01:12, 72.47s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments: 100%|██████████| 5/5 [06:01<00:00, 72.23s/it]

running experiments: 100%|██████████| 5/5 [06:01<00:00, 72.25s/it]

CPU times: user 39min 13s, sys: 1.72 s, total: 39min 15s
Wall time: 6min 1s





Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat210,feat211,feat212,feat213,feat214,feat215,feat216,feat217,feat218,feat219
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.130846,-0.300844,...,0.080274,0.115552,0.174807,-0.244872,-0.319478,0.059064,0.000568,0.244891,0.300844,-0.130846
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-0.130846,-0.300844,...,-0.080274,0.115552,-0.174807,-0.244872,0.319478,0.059064,-0.000568,0.244891,-0.300844,-0.130846
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.139243,-0.040832,...,0.177651,-0.085976,-0.069808,-0.109102,-0.140759,0.184935,0.184071,-0.05239,-0.040832,-0.139243
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.048483,0.015611,...,-0.065026,0.04435,0.035714,0.029502,0.046097,-0.070102,-0.070512,0.025241,0.015611,0.048483
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-0.005059,0.009352,...,-0.013734,0.009909,0.002329,0.00825,0.011761,-0.009332,-0.00272,-0.004584,-0.009352,0.005059
5,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.050074,-0.011117,...,0.049582,0.005255,0.001717,-0.054642,-0.055678,0.050847,0.051142,-0.00284,-0.011117,-0.050074
6,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.009938,-0.026818,...,-0.013334,0.022656,0.023129,-0.013172,-0.023569,-0.006879,-0.004162,0.025408,0.026818,-0.009938
7,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.014007,0.025469,...,-0.039242,0.030036,0.008341,0.021972,0.032295,-0.02702,-0.008046,-0.011692,-0.025469,0.014007
8,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.053983,-0.019696,...,0.075379,-0.071666,-0.05867,-0.017526,-0.045907,0.086976,0.089522,-0.040115,-0.019696,-0.053983
9,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.0687,0.013277,...,-0.056877,-0.023603,-0.014327,0.07824,0.077328,-0.060835,-0.062904,-0.003322,0.013277,0.0687


#### Viewing a random sample

In [8]:
%%time
# Draw 10% of the rows without replacement. The draw is seeded with the
# notebook-level `random_state` so a Restart & Run All shows the same rows.
simulation.dataframe.sample(frac=0.1, replace=False, random_state=random_state)

CPU times: user 259 ms, sys: 2 µs, total: 259 ms
Wall time: 257 ms


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat210,feat211,feat212,feat213,feat214,feat215,feat216,feat217,feat218,feat219
1176204,5346,H_1.3_-2_1.00122.dat,1.30,-2.00,1.001216,1,1,train,-0.028182,-0.033447,...,-0.057627,0.036362,0.010789,-0.092969,0.009844,0.045049,0.002626,-0.017007,-0.033447,-0.028182
1308960,5949,H_1.65_-0.2_-1.00015.dat,1.65,-0.20,-1.000147,999,1,test,0.045592,-0.064836,...,-0.038854,0.012360,0.073321,-0.100417,0.063424,-0.019063,-0.069414,0.081151,-0.064836,0.045592
1242360,5647,H_1.45_0.9_-1.00444.dat,1.45,0.90,-1.004437,999,1,test,-0.072294,-0.070825,...,0.011437,0.021863,-0.018864,-0.014702,-0.061788,-0.039129,-0.060315,-0.059947,-0.070825,-0.072294
694702,3157,H_-0.1_1.95_2.00148.dat,-0.10,1.95,2.001481,999,0,test,-0.003728,0.029943,...,0.042169,0.021503,-0.018465,0.069913,0.002925,-0.071947,0.020517,0.010653,-0.029943,0.003728
1358352,6174,H_1.8_-1.1_1.00052.dat,1.80,-1.10,1.000524,1,1,train,-0.101565,0.083650,...,0.067101,0.009781,-0.095055,0.027602,0.068496,-0.076798,-0.030259,0.145508,0.083650,-0.101565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653576,2970,H_-0.2_0.7_0.000241024.dat,-0.20,0.70,0.000241,0,0,train,0.041364,-0.017479,...,-0.068060,0.030457,0.059334,-0.042547,-0.039904,0.063802,0.031332,-0.067468,-0.017479,0.041364
863655,3925,H_0.4_-0.15_1.00177.dat,0.40,-0.15,1.001775,1,1,val,0.019234,0.046108,...,-0.091416,-0.068488,0.092330,-0.016790,-0.024378,0.093381,-0.044287,-0.093558,0.046108,0.019234
658292,2992,H_-0.2_1.8_2.0013.dat,-0.20,1.80,2.001302,999,0,test,-0.060388,0.034169,...,0.020831,-0.066527,0.071955,-0.006003,0.102664,0.065917,0.026722,0.084717,-0.034169,0.060388
923563,4198,H_0.55_1.35_2.00305.dat,0.55,1.35,2.003050,999,0,test,0.003880,-0.001131,...,-0.021066,-0.025356,-0.015625,-0.020082,-0.010648,-0.014619,-0.003994,-0.009020,0.001131,-0.003880


#### Checking train/val/test splits again

In [9]:
%%time
# Row-level split sizes, recomputed at this point of the notebook.
# One value_counts() pass over the `type_of` column replaces three boolean-mask
# selections, each of which materialised a filtered copy of the frame only to
# take its length. A label that does not occur counts as 0; int() keeps the
# counts plain Python ints.
n_total = len(simulation.dataframe)
rows_per_split = simulation.dataframe.type_of.value_counts()
n_train = int(rows_per_split.get("train", 0))
n_val = int(rows_per_split.get("val", 0))
n_test = int(rows_per_split.get("test", 0))

# Hamiltonian-level split sizes, taken from the id lists held by the simulation.
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

# Fractions of rows per split (printed as fractions of 1, not percentages).
print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.27251943301326015
% val:  0.27251943301326015
% test:  0.45496113397347965
% train + val + test:  1.0


number of train hamiltonians:  1788
number of val hamiltonians:  1788
number of test hamiltonians:  2985
total number of hamiltonians:  6561


train ids:  [287, 3811, 3094, 4949, 3452, 1452, 1179, 4000, 4948, 4650, 4573, 1515, 2799, 4552, 2553, 1286, 2606, 4092, 1751, 79, 4571, 361, 459, 140, 78, 6242, 3615, 1339, 5379, 1270, 222, 1171, 2086, 4304, 618, 2322, 3378, 5202, 1193, 4179, 3379, 5598, 4491, 2637, 1439, 2575, 377, 3270, 4559, 1085, 2801, 2007, 3915, 6492, 2171, 638, 1920, 6243, 3207, 4001, 2246, 2608, 5770, 4227, 939, 51, 5610, 1649, 5843, 1583, 3577, 5120, 807, 403, 1434, 2384, 1581, 1599, 1023, 4399, 2935, 3020, 1615, 2609, 772, 1447, 1688, 5614, 2643, 3216, 2808, 784, 4622, 3505, 74, 2970, 6319, 4376, 3018, 1653, 54, 2443, 887, 5054, 3779, 6252, 3820, 3859, 1268, 3734, 4155, 969, 4172, 779, 4648, 4233, 785, 1030, 4786, 3264, 933, 3249, 1841, 4739, 1440, 3290, 

#### Checking summaries

In [10]:
%%time
# Per-Hamiltonian summary held by the simulation: print its length, then
# display it as the cell output.
ham_summary = simulation.hamiltonian_summary
print("length of ham_summary: ", len(ham_summary))
ham_summary

length of ham_summary:  6561
CPU times: user 125 µs, sys: 0 ns, total: 125 µs
Wall time: 78 µs


Unnamed: 0_level_0,t1,t2,type_of,0,1,phase,pred_phase
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-2.0,-2.00,test,0.618182,0.381818,999,-1
1,-2.0,-1.95,test,0.640909,0.359091,999,-1
2,-2.0,-1.90,test,0.595455,0.404545,999,-1
3,-2.0,-1.85,test,0.631818,0.368182,999,-1
4,-2.0,-1.80,test,0.586364,0.413636,999,-1
...,...,...,...,...,...,...,...
6556,2.0,1.80,test,0.718182,0.281818,999,-1
6557,2.0,1.85,test,0.686364,0.313636,999,-1
6558,2.0,1.90,test,0.763636,0.236364,999,-1
6559,2.0,1.95,test,0.745455,0.254545,999,-1


In [11]:
# Per-eigenvector summary held by the simulation: print its length, then
# display it as the cell output. The label names eigen_summary (it previously
# said "ham_summary", copied from the Hamiltonian summary cell).
eigen_summary = simulation.eigenvector_summary
print("length of eigen_summary: ", len(eigen_summary))
eigen_summary

length of ham_summary:  1443420


Unnamed: 0,id,phase,pred_phase,type_of
0,0,999,0,test
1,0,999,0,test
2,0,999,1,test
3,0,999,0,test
4,0,999,0,test
...,...,...,...,...
1443415,6560,999,1,test
1443416,6560,999,0,test
1443417,6560,999,1,test
1443418,6560,999,0,test


#### Checking accuracies

In [12]:
# Display the accuracy dictionary held by the simulation.
# NOTE(review): in the recorded output the hamiltonian_* entries are ~0 while
# the eigenvector_* entries are high -- confirm this is expected for this run.
simulation.accuracy

{'eigenvector_train': 0.9998932275777913,
 'eigenvector_val': 0.9884507829977629,
 'eigenvector_test': 0.7942513368983958,
 'hamiltonian_train': 0.0,
 'hamiltonian_val': 0.0,
 'hamiltonian_test': 0.004901960784313725}

#### Checking data stored in memory

In [13]:
# Display the list of Hamiltonian summaries kept in memory by the simulation.
# Presumably empty here because store_in_lists is False -- confirm against
# Simulation.run_simulation.
ham_summary_list = simulation.hamiltonian_summary_list
ham_summary_list

[]

In [14]:
# Display the list of eigenvector summaries kept in memory by the simulation.
# Presumably empty here because store_in_lists is False -- confirm against
# Simulation.run_simulation.
eigen_summary_list = simulation.eigenvector_summary_list
eigen_summary_list

[]

In [15]:
# Display the per-key accuracy lists kept in memory by the simulation.
# Presumably every list is empty here because store_in_lists is False --
# confirm against Simulation.run_simulation.
accuracy_list = simulation.accuracy_list
accuracy_list

{'eigenvector_train': [],
 'eigenvector_val': [],
 'eigenvector_test': [],
 'hamiltonian_train': [],
 'hamiltonian_val': [],
 'hamiltonian_test': []}