# Simulation template

In this notebook we run the machine learning analysis of topological phase transitions occurring in both nearest-neighbour SSH models (ssh1) and second-neighbour SSH models (ssh2), as described in the paper [Machine learning topological phases in real space](https://arxiv.org/abs/1901.01963).

## Defining parameters

In [1]:
%%time
%load_ext autoreload
%autoreload 2
from simulation import *

CPU times: user 764 ms, sys: 274 ms, total: 1.04 s
Wall time: 602 ms


In [2]:
%%time
### Dataset and simulation parameters
csv_path = SSH1_PERIODIC_100_6561_CSV 
model_name = "DecisionTreeClassifier"
model_kw = {"criterion":"entropy"}
allowed_windings = [0,1]
simulation_dir = SSH1_PERIODIC_100_6561_SIMULATION_DIR
val_split = 0.9  
features_to_use = None 
shuffle_features = False
random_state = 137                    

### Running a simulation
n_experiments = 5
start_n = 0
fit_params = None
shuffle_rows = True
pred_params = None
random_features = False
######### DON'T SET THIS TO TRUE UNLESS YOUR DATASET IS SMALL!! WILL FLOOD YOUR MEMORY!!!
store_in_lists = False   
########## BELOW ARE THE PARAMETERS THAT CONTROL WHAT WILL BE SAVED
save_eigenvector=True
save_hamiltonian=True 
save_accuracy=True 
save_models=True

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs


In [3]:
# Parameters
# Values below override the defaults assigned in the previous cell.

# --- dataset / output locations ---
csv_path = "/home/rio/ssh_csvs/ssh1/periodic_220_6561.csv"
simulation_dir = "/home/rio/ssh_simulations/ssh1/periodic_220_6561"

# --- model and data handling ---
model_name = "DecisionTreeClassifier"
model_kw = {"criterion": "entropy"}
allowed_windings = [0, 1]
val_split = 0.9
features_to_use = None
shuffle_features = False
random_state = 383

# --- experiment loop ---
n_experiments = 5
start_n = 0
shuffle_rows = True
random_features = False
fit_params = None
pred_params = None
store_in_lists = False

# --- what to save ---
save_eigenvector = True
save_hamiltonian = True
save_accuracy = True
save_models = True


In [4]:
%%time
#Starting an instance of Simulation with a decision tree
model = DecisionTreeClassifier(criterion="entropy")
#dict_args = {"csv_path":csv_path, "model":model, "allowed_windings":allowed_windings,\
#             "simulation_dir":simulation_dir, "val_split":val_split, "features_to_use":features_to_use,\
#            "shuffle_features":shuffle_features, "random_state":random_state}
simulation = Simulation(csv_path,model_name,model_kw,allowed_windings,simulation_dir,val_split,features_to_use,\
                       shuffle_features,random_state)
#simulation = Simulation(**dict_args)

print("Info on all data: \n")
simulation.dataframe.info()
simulation.dataframe.head()

  mask |= (ar1 == a)


Info on all data: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1443420 entries, 0 to 1443419
Columns: 228 entries, id to feat219
dtypes: float64(223), int32(3), object(2)
memory usage: 2.4+ GB
CPU times: user 29.5 s, sys: 1.86 s, total: 31.4 s
Wall time: 38.4 s


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat210,feat211,feat212,feat213,feat214,feat215,feat216,feat217,feat218,feat219
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.06742,0.06742,...,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.095346,0.095307,...,0.091484,0.09068,0.092868,0.092214,0.09395,0.093447,0.094725,0.094376,0.095191,0.094997
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.095346,0.095191,...,0.08021,0.077137,0.085564,0.083022,0.089801,0.087826,0.092868,0.091484,0.094725,0.09395
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.06742,-0.06742,...,0.06742,-0.06742,0.06742,-0.06742,0.06742,-0.06742,0.06742,-0.06742,0.06742,-0.06742
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.095346,-0.095307,...,0.091484,-0.09068,0.092868,-0.092214,0.09395,-0.093447,0.094725,-0.094376,0.095191,-0.094997


#### Checking initialization

In [5]:
%%time
n_features = simulation.n_features
n_hamiltonians = simulation.n_hamiltonians
n_ts = simulation.n_ts

print("n_features: ", n_features)
print("n_hamiltonians: ", n_hamiltonians)
print("n_ts: ", n_ts)

n_features:  220
n_hamiltonians:  6561
n_ts:  2
CPU times: user 157 µs, sys: 11 µs, total: 168 µs
Wall time: 92.7 µs


In [6]:
%%time
n_total = len(simulation.dataframe)
n_train = len(simulation.dataframe[simulation.dataframe.type_of == "train"])
n_val = len(simulation.dataframe[simulation.dataframe.type_of == "val"])
n_test = len(simulation.dataframe[simulation.dataframe.type_of == "test"])
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.9358329522938577
% val:  0.0
% test:  0.06416704770614236
% train + val + test:  1.0


number of train hamiltonians:  6140
number of val hamiltonians:  0
number of test hamiltonians:  421
total number of hamiltonians:  6561


train ids:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 

## Running a simulation

In [7]:
%%time
simulation.run_simulation(n_experiments, start_n, fit_params, shuffle_rows, pred_params, random_features, \
                          store_in_lists, save_eigenvector, save_hamiltonian, save_accuracy,\
                          save_models)

simulation.dataframe.head(10)

running experiments:   0%|          | 0/5 [00:00<?, ?it/s]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  20%|██        | 1/5 [00:45<03:03, 45.88s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  40%|████      | 2/5 [01:31<02:17, 45.89s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  60%|██████    | 3/5 [02:18<01:32, 46.12s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  80%|████████  | 4/5 [03:05<00:46, 46.40s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments: 100%|██████████| 5/5 [03:52<00:00, 46.61s/it]

running experiments: 100%|██████████| 5/5 [03:52<00:00, 46.52s/it]

CPU times: user 3min 52s, sys: 628 ms, total: 3min 52s
Wall time: 3min 52s





Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat210,feat211,feat212,feat213,feat214,feat215,feat216,feat217,feat218,feat219
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.06742,0.06742,...,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742,0.06742
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.095346,0.095307,...,0.091484,0.09068,0.092868,0.092214,0.09395,0.093447,0.094725,0.094376,0.095191,0.094997
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.095346,0.095191,...,0.08021,0.077137,0.085564,0.083022,0.089801,0.087826,0.092868,0.091484,0.094725,0.09395
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.06742,-0.06742,...,0.06742,-0.06742,0.06742,-0.06742,0.06742,-0.06742,0.06742,-0.06742,0.06742,-0.06742
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.095346,-0.095307,...,0.091484,-0.09068,0.092868,-0.092214,0.09395,-0.093447,0.094725,-0.094376,0.095191,-0.094997
5,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.095346,-0.095191,...,0.08021,-0.077137,0.085564,-0.083022,0.089801,-0.087826,0.092868,-0.091484,0.094725,-0.09395
6,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.095346,0.085564,...,0.013569,0.029464,0.083022,-0.053818,0.087826,-0.095191,0.024239,-0.062439,-0.058223,0.018935
7,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-0.000142,-0.002581,...,0.026726,-0.029329,0.021458,-0.024102,0.016119,-0.018796,0.010728,-0.013429,0.005302,-0.008018
8,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-6.8e-05,-0.005375,...,0.051491,-0.055988,0.042008,-0.046825,0.031977,-0.037053,0.021529,-0.026797,0.010801,-0.016191
9,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.095346,0.094725,...,-0.039608,0.029463,-0.058223,0.049236,-0.073811,0.06645,-0.085563,0.08021,-0.092868,0.089801


#### Viewing a random sample

In [8]:
%%time
simulation.dataframe.sample(frac=0.1, replace=False)

CPU times: user 265 ms, sys: 0 ns, total: 265 ms
Wall time: 264 ms


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat210,feat211,feat212,feat213,feat214,feat215,feat216,feat217,feat218,feat219
707265,3214,H_-0.05_0.75_1.00102.dat,-0.05,0.75,1.001017,1,1,val,0.001477,0.085090,...,-0.086106,-0.003783,-0.079530,-0.083539,0.003968,-0.082495,0.083628,-0.001662,0.082403,0.080779
681901,3099,H_-0.1_-0.95_1.00121.dat,-0.10,-0.95,1.001212,1,1,val,0.095346,-0.080221,...,0.080210,-0.095346,-0.044495,0.083013,-0.002723,-0.049221,0.049237,0.002704,-0.083022,0.044512
1197939,5445,H_1.35_-1.1_0.00048696.dat,1.35,-1.10,0.000487,0,0,val,-0.000194,0.007133,...,-0.071931,-0.076527,0.060205,0.065708,-0.046716,-0.052964,0.031858,0.038669,-0.016067,-0.023241
167636,761,H_-1.55_-0.4_-0.00037725.dat,-1.55,-0.40,-0.000377,0,0,val,-0.017517,0.001583,...,-0.043212,-0.025339,-0.020349,-0.038569,0.074718,0.085055,-0.095335,-0.093120,0.072889,0.059121
1077798,4899,H_1_-0.05_5.16473e-05.dat,1.00,-0.05,0.000052,0,0,train,0.000052,0.002065,...,-0.072024,-0.070689,0.092200,0.091667,-0.093458,-0.093835,0.075537,0.076748,-0.042116,-0.043912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486729,2212,H_-0.65_-0.75_1.00814.dat,-0.65,-0.75,1.008135,1,1,val,0.095346,-0.053703,...,-0.080210,0.087772,0.066450,0.019071,0.053818,-0.095346,-0.087826,0.018799,-0.018935,0.087880
839171,3814,H_0.35_-1.65_1.00089.dat,0.35,-1.65,1.000895,1,1,val,-0.095346,0.027121,...,0.039441,0.071881,-0.089863,0.056261,-0.024061,-0.081510,0.093981,-0.042311,0.007977,0.088752
110536,502,H_-1.7_-1.2_-0.00260307.dat,-1.70,-1.20,-0.002603,0,0,val,0.000096,-0.047847,...,0.026954,-0.069144,-0.081600,0.045916,-0.068443,0.092489,0.046801,0.001109,0.092238,-0.091926
1030068,4682,H_0.85_1.25_1.00339.dat,0.85,1.25,1.003389,1,1,val,-0.000302,-0.040796,...,-0.051294,0.080911,0.016556,0.025503,0.075689,-0.043332,0.094970,-0.089353,0.064248,-0.088327


#### Checking train/val/test splits again

In [9]:
%%time
n_total = len(simulation.dataframe)
n_train = len(simulation.dataframe[simulation.dataframe.type_of == "train"])
n_val = len(simulation.dataframe[simulation.dataframe.type_of == "val"])
n_test = len(simulation.dataframe[simulation.dataframe.type_of == "test"])
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.09358329522938577
% val:  0.8422496570644719
% test:  0.06416704770614236
% train + val + test:  1.0


number of train hamiltonians:  614
number of val hamiltonians:  5526
number of test hamiltonians:  421
total number of hamiltonians:  6561


train ids:  [5291, 342, 4898, 5631, 121, 2638, 4140, 3703, 3825, 4521, 1988, 4830, 2514, 46, 3139, 1500, 3943, 5127, 2652, 2073, 3779, 6254, 5191, 792, 4180, 2522, 5051, 5152, 3934, 3325, 3298, 4009, 968, 1095, 3039, 3392, 3191, 2896, 2417, 3646, 3933, 3185, 3617, 6018, 27, 2129, 1990, 2974, 4402, 210, 3136, 6097, 5431, 4852, 2775, 3217, 827, 3441, 3881, 5190, 2547, 4553, 2643, 2628, 1497, 3478, 3443, 3454, 786, 1686, 4707, 6357, 5006, 6034, 3165, 4435, 4917, 5113, 4068, 3451, 1813, 3140, 5487, 1626, 3581, 3904, 5812, 4661, 6105, 5009, 4568, 2260, 377, 4547, 581, 3661, 304, 4403, 2876, 3796, 4751, 6136, 2423, 4387, 2848, 6405, 3374, 4569, 2018, 231, 3060, 1415, 3683, 5514, 2315, 4280, 5872, 4051, 375, 1644, 940, 6275, 181, 3442, 5935,

#### Checking summaries

In [10]:
%%time
# Per-Hamiltonian summary table held by the simulation after the run.
ham_summary = simulation.hamiltonian_summary
# Number of rows in the summary.
print("length of ham_summary: ", len(ham_summary))
# Bare last expression: rich display of the summary frame.
ham_summary

length of ham_summary:  6561
CPU times: user 184 µs, sys: 2 µs, total: 186 µs
Wall time: 122 µs


Unnamed: 0_level_0,t1,t2,type_of,0,1,phase,pred_phase
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-2.0,-2.00,test,0.540909,0.459091,999,0
1,-2.0,-1.95,test,0.677273,0.322727,0,0
2,-2.0,-1.90,test,0.790909,0.209091,0,0
3,-2.0,-1.85,test,0.840909,0.159091,0,0
4,-2.0,-1.80,val,0.909091,0.090909,0,0
...,...,...,...,...,...,...,...
6556,2.0,1.80,val,0.895455,0.104545,0,0
6557,2.0,1.85,test,0.850000,0.150000,0,0
6558,2.0,1.90,test,0.822727,0.177273,0,0
6559,2.0,1.95,test,0.690909,0.309091,0,0


In [11]:
# Per-eigenvector summary table held by the simulation after the run.
eigen_summary = simulation.eigenvector_summary
# The label names the object actually being measured (it previously said
# "ham_summary", a copy-paste leftover from the Hamiltonian summary cell).
print("length of eigen_summary: ", len(eigen_summary))
# Bare last expression: rich display of the summary frame.
eigen_summary

length of ham_summary:  1443420


Unnamed: 0,id,phase,pred_phase,type_of
0,0,999,0,test
1,0,999,1,test
2,0,999,0,test
3,0,999,0,test
4,0,999,0,test
...,...,...,...,...
1443415,6560,999,1,test
1443416,6560,999,1,test
1443417,6560,999,1,test
1443418,6560,999,0,test


#### Checking accuracies

In [12]:
# Display the accuracy results stored on the simulation object.
simulation.accuracy

{'eigenvector_train': 0.9914050932780575,
 'eigenvector_val': 0.9737069390978186,
 'eigenvector_test': 0.7673426573426574,
 'hamiltonian_train': 1.0,
 'hamiltonian_val': 1.0,
 'hamiltonian_test': 1.0}

#### Checking data stored in memory

In [13]:
# In-memory list of Hamiltonian summaries kept on the simulation.
# NOTE(review): presumably only populated when store_in_lists is True - confirm in Simulation.
ham_summary_list = simulation.hamiltonian_summary_list
ham_summary_list

[]

In [14]:
# In-memory list of eigenvector summaries kept on the simulation.
# NOTE(review): presumably only populated when store_in_lists is True - confirm in Simulation.
eigen_summary_list = simulation.eigenvector_summary_list
eigen_summary_list

[]

In [15]:
# In-memory accuracy history kept on the simulation.
# NOTE(review): presumably only populated when store_in_lists is True - confirm in Simulation.
accuracy_list = simulation.accuracy_list
accuracy_list

{'eigenvector_train': [],
 'eigenvector_val': [],
 'eigenvector_test': [],
 'hamiltonian_train': [],
 'hamiltonian_val': [],
 'hamiltonian_test': []}