# Simulation template

In this notebook we run the machine learning analysis of topological phase transitions occurring in both nearest-neighbour SSH models (ssh1) and second-neighbour SSH models (ssh2), as described in the paper [Machine learning topological phases in real space](https://arxiv.org/abs/1901.01963).

## Defining parameters

In [1]:
%%time
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the star import hides where the names used below come from
# (Simulation, DecisionTreeClassifier, SSH1_PERIODIC_100_6561_CSV,
# SSH1_PERIODIC_100_6561_SIMULATION_DIR are presumably all exported by
# `simulation`) — consider explicit imports once the full list is confirmed.
from simulation import *

CPU times: user 765 ms, sys: 266 ms, total: 1.03 s
Wall time: 586 ms


In [2]:
%%time
### Dataset and simulation parameters
csv_path = SSH1_PERIODIC_100_6561_CSV 
model_name = "DecisionTreeClassifier"
model_kw = {"criterion":"entropy"}
allowed_windings = [0,1]
simulation_dir = SSH1_PERIODIC_100_6561_SIMULATION_DIR
val_split = 0.9  
features_to_use = None 
shuffle_features = False
random_state = 137                    

### Running a simulation
n_experiments = 5
start_n = 0
fit_params = None
shuffle_rows = True
pred_params = None
random_features = False
######### DON'T SET THIS TO TRUE UNLESS YOUR DATASET IS SMALL!! WILL FLOOD YOUR MEMORY!!!
store_in_lists = False   
########## BELOW ARE THE PARAMETERS THAT CONTROL WHAT WILL BE SAVED
save_eigenvector=True
save_hamiltonian=True 
save_accuracy=True 
save_models=True

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs


In [3]:
# Parameters
# NOTE(review): this looks like an injected-parameters cell (e.g. written by
# papermill) — confirm before editing by hand. It re-assigns the defaults
# defined in the previous cell.

# Dataset / output locations and model selection
csv_path = "/home/rio/ssh_csvs/ssh1/periodic_140_6561.csv"
simulation_dir = "/home/rio/ssh_simulations/ssh1/periodic_140_6561"
model_name = "DecisionTreeClassifier"
model_kw = {"criterion": "entropy"}

# Data selection and splitting
allowed_windings = [0, 1]
val_split = 0.9
features_to_use = None
shuffle_features = False
random_state = 147

# Experiment loop
n_experiments = 5
start_n = 0
fit_params = None
pred_params = None
shuffle_rows = True
random_features = False
store_in_lists = False

# Saving switches
save_eigenvector = True
save_hamiltonian = True
save_accuracy = True
save_models = True


In [4]:
%%time
#Starting an instance of Simulation with a decision tree
model = DecisionTreeClassifier(criterion="entropy")
#dict_args = {"csv_path":csv_path, "model":model, "allowed_windings":allowed_windings,\
#             "simulation_dir":simulation_dir, "val_split":val_split, "features_to_use":features_to_use,\
#            "shuffle_features":shuffle_features, "random_state":random_state}
simulation = Simulation(csv_path,model_name,model_kw,allowed_windings,simulation_dir,val_split,features_to_use,\
                       shuffle_features,random_state)
#simulation = Simulation(**dict_args)

print("Info on all data: \n")
simulation.dataframe.info()
simulation.dataframe.head()

Info on all data: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 918540 entries, 0 to 918539
Columns: 148 entries, id to feat139
dtypes: float64(143), int32(3), object(2)
memory usage: 1.0+ GB
CPU times: user 11.9 s, sys: 983 ms, total: 12.9 s
Wall time: 15.1 s


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat130,feat131,feat132,feat133,feat134,feat135,feat136,feat137,feat138,feat139
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.084515,0.084515,...,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.119523,0.119403,...,0.107686,0.105251,0.111901,0.109905,0.115216,0.113673,0.117602,0.116526,0.119042,0.118441
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.119523,0.119042,...,0.074521,0.065845,0.090009,0.082598,0.102604,0.096696,0.111901,0.107686,0.117602,0.115216
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,-0.084515,0.084515,...,-0.084515,0.084515,-0.084515,0.084515,-0.084515,0.084515,-0.084515,0.084515,-0.084515,0.084515
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,-0.00203,-0.007389,...,-0.05368,-0.058417,-0.043892,-0.048835,-0.03375,-0.03886,-0.023336,-0.028571,-0.012734,-0.018053


#### Checking initialization

In [5]:
%%time
n_features = simulation.n_features
n_hamiltonians = simulation.n_hamiltonians
n_ts = simulation.n_ts

print("n_features: ", n_features)
print("n_hamiltonians: ", n_hamiltonians)
print("n_ts: ", n_ts)

n_features:  140
n_hamiltonians:  6561
n_ts:  2
CPU times: user 416 µs, sys: 42 µs, total: 458 µs
Wall time: 314 µs


In [6]:
%%time
n_total = len(simulation.dataframe)
n_train = len(simulation.dataframe[simulation.dataframe.type_of == "train"])
n_val = len(simulation.dataframe[simulation.dataframe.type_of == "val"])
n_test = len(simulation.dataframe[simulation.dataframe.type_of == "test"])
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.9358329522938577
% val:  0.0
% test:  0.06416704770614236
% train + val + test:  1.0


number of train hamiltonians:  6140
number of val hamiltonians:  0
number of test hamiltonians:  421
total number of hamiltonians:  6561


train ids:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 

## Running a simulation

In [7]:
%%time
simulation.run_simulation(n_experiments, start_n, fit_params, shuffle_rows, pred_params, random_features, \
                          store_in_lists, save_eigenvector, save_hamiltonian, save_accuracy,\
                          save_models)

simulation.dataframe.head(10)

running experiments:   0%|          | 0/5 [00:00<?, ?it/s]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  20%|██        | 1/5 [00:19<01:18, 19.61s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  40%|████      | 2/5 [00:39<00:58, 19.56s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  60%|██████    | 3/5 [00:58<00:38, 19.38s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  80%|████████  | 4/5 [01:16<00:19, 19.17s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments: 100%|██████████| 5/5 [01:37<00:00, 19.70s/it]

running experiments: 100%|██████████| 5/5 [01:37<00:00, 19.53s/it]

CPU times: user 1min 37s, sys: 241 ms, total: 1min 37s
Wall time: 1min 37s





Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat130,feat131,feat132,feat133,feat134,feat135,feat136,feat137,feat138,feat139
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.084515,0.084515,...,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515,0.084515
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.119523,0.119403,...,0.107686,0.105251,0.111901,0.109905,0.115216,0.113673,0.117602,0.116526,0.119042,0.118441
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.119523,0.119042,...,0.074521,0.065845,0.090009,0.082598,0.102604,0.096696,0.111901,0.107686,0.117602,0.115216
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-0.084515,0.084515,...,-0.084515,0.084515,-0.084515,0.084515,-0.084515,0.084515,-0.084515,0.084515,-0.084515,0.084515
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.00203,-0.007389,...,-0.05368,-0.058417,-0.043892,-0.048835,-0.03375,-0.03886,-0.023336,-0.028571,-0.012734,-0.018053
5,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.003825,-0.014518,...,-0.095784,-0.101807,-0.081479,-0.088989,-0.064555,-0.073312,-0.045557,-0.055278,-0.025094,-0.035468
6,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-0.119522,-0.11851,...,-0.027102,-0.01123,-0.057094,-0.042483,-0.082972,-0.070673,-0.102869,-0.093769,-0.115352,-0.110107
7,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.001198,-0.014856,...,-0.116254,-0.118928,-0.104678,-0.111475,-0.085558,-0.095987,-0.060272,-0.073581,-0.030641,-0.045872
8,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.119493,0.118049,...,-0.023989,-0.044511,0.018684,-0.002696,0.058973,0.039463,0.091742,0.076589,0.112811,0.103947
9,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.002976,-0.024263,...,-0.115828,-0.108701,-0.118804,-0.119232,-0.106629,-0.114557,-0.080856,-0.095273,-0.044771,-0.063839


#### Viewing a random sample

In [8]:
%%time
simulation.dataframe.sample(frac=0.1, replace=False)

CPU times: user 110 ms, sys: 34 µs, total: 111 ms
Wall time: 110 ms


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat130,feat131,feat132,feat133,feat134,feat135,feat136,feat137,feat138,feat139
447855,3198,H_-0.05_-0.05_0.501338.dat,-0.05,-0.05,0.501338,999,1,test,0.000594,0.066340,...,-0.051323,0.016633,-0.119428,-0.102298,-0.042553,-0.097044,0.085979,0.026016,0.110137,0.117494
253578,1811,H_-0.9_-0.55_-0.00170438.dat,-0.90,-0.55,-0.001704,0,0,val,0.000963,-0.030800,...,0.050989,0.077858,-0.032725,-0.001033,-0.100278,-0.079414,-0.118308,-0.118575,-0.077911,-0.099178
383935,2742,H_-0.35_1.45_1.00087.dat,-0.35,1.45,1.000874,1,1,val,-0.003754,-0.105682,...,-0.115633,0.077946,-0.048451,0.119440,0.055216,0.070993,0.117304,-0.030913,0.091060,-0.109541
20221,144,H_-1.95_1.15_0.000402349.dat,-1.95,1.15,0.000402,0,0,val,-0.000461,-0.040408,...,-0.116628,0.100673,-0.105469,0.079906,-0.086709,0.053379,-0.061699,0.023005,-0.032242,-0.009027
512198,3658,H_0.25_-1.35_1.00092.dat,0.25,-1.35,1.000915,1,1,val,-0.119523,-0.085931,...,-0.026599,0.061871,0.107688,0.041376,-0.107685,-0.113467,0.026594,0.100114,0.074523,-0.011374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206655,1476,H_-1.1_-1.1_0.501338.dat,-1.10,-1.10,0.501338,999,1,test,-0.009332,-0.027947,...,0.009332,0.027947,0.077589,-0.045697,0.116210,-0.101886,0.110442,-0.119158,0.062490,-0.090916
486234,3473,H_0.1_1.55_1.00116.dat,0.10,1.55,1.001159,1,1,val,0.119523,0.086369,...,-0.074522,0.010745,-0.119042,-0.078615,-0.090009,-0.119401,-0.005361,-0.086412,0.082598,-0.000031
585169,4179,H_0.55_0.4_-0.00289231.dat,0.55,0.40,-0.002892,0,0,val,-0.002401,0.047448,...,0.094925,0.056184,-0.080431,-0.036445,0.063352,0.015534,-0.044237,0.005877,0.023700,-0.027098
194273,1387,H_-1.15_-1.5_1.00465.dat,-1.15,-1.50,1.004648,1,1,val,0.113012,-0.017917,...,0.063083,0.111223,-0.118703,-0.035130,0.084937,-0.067416,0.012788,0.119196,-0.100883,-0.081220


#### Checking train/val/test splits again

In [9]:
%%time
n_total = len(simulation.dataframe)
n_train = len(simulation.dataframe[simulation.dataframe.type_of == "train"])
n_val = len(simulation.dataframe[simulation.dataframe.type_of == "val"])
n_test = len(simulation.dataframe[simulation.dataframe.type_of == "test"])
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.09358329522938577
% val:  0.8422496570644719
% test:  0.06416704770614236
% train + val + test:  1.0


number of train hamiltonians:  614
number of val hamiltonians:  5526
number of test hamiltonians:  421
total number of hamiltonians:  6561


train ids:  [5469, 3611, 2150, 2002, 1647, 5475, 5812, 1400, 3808, 888, 4185, 395, 2179, 6454, 690, 3477, 2850, 4784, 6266, 1401, 2277, 3506, 3649, 626, 360, 3500, 2673, 1399, 4740, 4484, 5635, 3379, 5542, 1417, 292, 6324, 3146, 317, 1420, 761, 1867, 6223, 5842, 427, 2919, 3460, 1434, 6302, 3388, 3814, 6083, 1490, 3297, 4368, 1821, 4632, 935, 2250, 1694, 845, 710, 5025, 2924, 3160, 5518, 36, 1781, 1543, 4645, 2438, 4290, 375, 6282, 2598, 4110, 6228, 6269, 5006, 1107, 5601, 4900, 2384, 3656, 3828, 1317, 6502, 4714, 4917, 3782, 2195, 4951, 4727, 1813, 1510, 5377, 3400, 5626, 3595, 5147, 4407, 2621, 5224, 3319, 282, 1279, 2093, 2087, 3039, 361, 5784, 3745, 550, 2292, 2644, 1681, 215, 4967, 4292, 5206, 2353, 4231, 5956, 4851, 2070, 5314, 

#### Checking summaries

In [10]:
%%time
# Per-Hamiltonian summary held by the simulation after the run.
# In the recorded output it is indexed by `id` and has the columns
# t1, t2, type_of, 0, 1, phase and pred_phase.
# NOTE(review): columns 0 and 1 presumably hold the share of eigenvectors
# predicted for each allowed winding — confirm against Simulation.
ham_summary = simulation.hamiltonian_summary
print("length of ham_summary: ", len(ham_summary))
# Last expression: rich display of the summary frame.
ham_summary

length of ham_summary:  6561
CPU times: user 116 µs, sys: 1 µs, total: 117 µs
Wall time: 73.2 µs


Unnamed: 0_level_0,t1,t2,type_of,0,1,phase,pred_phase
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-2.0,-2.00,test,0.485714,0.514286,999,1
1,-2.0,-1.95,test,0.657143,0.342857,0,0
2,-2.0,-1.90,test,0.821429,0.178571,0,0
3,-2.0,-1.85,test,0.907143,0.092857,0,0
4,-2.0,-1.80,val,0.928571,0.071429,0,0
...,...,...,...,...,...,...,...
6556,2.0,1.80,val,0.935714,0.064286,0,0
6557,2.0,1.85,test,0.878571,0.121429,0,0
6558,2.0,1.90,test,0.835714,0.164286,0,0
6559,2.0,1.95,test,0.735714,0.264286,0,0


In [11]:
# Per-eigenvector summary held by the simulation after the run
# (columns in the recorded output: id, phase, pred_phase, type_of).
eigen_summary = simulation.eigenvector_summary
# The label now names the object whose length is printed; it previously said
# "ham_summary", a leftover from the cell above.
print("length of eigen_summary: ", len(eigen_summary))
# Last expression: rich display of the summary frame.
eigen_summary

length of ham_summary:  918540


Unnamed: 0,id,phase,pred_phase,type_of
0,0,999,0,test
1,0,999,0,test
2,0,999,0,test
3,0,999,0,test
4,0,999,1,test
...,...,...,...,...
918535,6560,999,1,test
918536,6560,999,1,test
918537,6560,999,1,test
918538,6560,999,0,test


#### Checking accuracies

In [12]:
# Display the accuracy mapping held by the simulation. In the recorded output
# its keys are eigenvector_{train,val,test} and hamiltonian_{train,val,test}.
simulation.accuracy

{'eigenvector_train': 0.9872498836668218,
 'eigenvector_val': 0.9660074453234062,
 'eigenvector_test': 0.7957417582417582,
 'hamiltonian_train': 1.0,
 'hamiltonian_val': 1.0,
 'hamiltonian_test': 1.0}

#### Checking data stored in memory

In [13]:
# In-memory list of Hamiltonian summaries kept by the simulation.
# NOTE(review): it is empty in the recorded output, presumably because
# store_in_lists is False for this run — confirm against Simulation.
ham_summary_list = simulation.hamiltonian_summary_list
ham_summary_list

[]

In [14]:
# In-memory list of eigenvector summaries kept by the simulation.
# NOTE(review): it is empty in the recorded output, presumably because
# store_in_lists is False for this run — confirm against Simulation.
eigen_summary_list = simulation.eigenvector_summary_list
eigen_summary_list

[]

In [15]:
# In-memory accuracy history kept by the simulation: in the recorded output it
# is a dict with one list per accuracy key, all of them empty.
# NOTE(review): the lists are presumably only filled when store_in_lists is
# True — confirm against Simulation.
accuracy_list = simulation.accuracy_list
accuracy_list

{'eigenvector_train': [],
 'eigenvector_val': [],
 'eigenvector_test': [],
 'hamiltonian_train': [],
 'hamiltonian_val': [],
 'hamiltonian_test': []}