# Simulation template

In this notebook we run the machine learning analysis of topological phase transitions occurring  in both nearest-neighbours SSH models (ssh1) and second neighbours models (ssh2) as decribed in the paper [Machine learning topological phases in real space](https://arxiv.org/abs/1901.01963).

## Defining parameters

In [1]:
%%time
%load_ext autoreload
%autoreload 2
from simulation import *

CPU times: user 684 ms, sys: 273 ms, total: 956 ms
Wall time: 582 ms


In [2]:
%%time
### Dataset and simulation parameters
csv_path = SSH1_PERIODIC_100_6561_CSV 
model_name = "DecisionTreeClassifier"
model_kw = {"criterion":"entropy"}
allowed_windings = [0,1]
simulation_dir = SSH1_PERIODIC_100_6561_SIMULATION_DIR
val_split = 0.9  
features_to_use = None 
shuffle_features = False
random_state = 137                    

### Running a simulation
n_experiments = 5
start_n = 0
fit_params = None
shuffle_rows = True
pred_params = None
random_features = False
######### DON'T SET THIS TO TRUE UNLESS YOUR DATASET IS SMALL!! WILL FLOOD YOUR MEMORY!!!
store_in_lists = False   
########## BELOW ARE THE PARAMETERS THAT CONTROL WHAT WILL BE SAVED
save_eigenvector=True
save_hamiltonian=True 
save_accuracy=True 
save_models=True

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


In [3]:
# Parameters
model_kw = {"criterion": "entropy"}
allowed_windings = [0, 1]
val_split = 0.9
features_to_use = None
shuffle_features = False
n_experiments = 5
start_n = 0
fit_params = None
shuffle_rows = True
pred_params = None
random_features = False
store_in_lists = False
save_eigenvector = True
save_hamiltonian = True
save_accuracy = True
save_models = True
csv_path = "/home/rio/ssh_csvs/ssh1/periodic_180_6561.csv"
model_name = "DecisionTreeClassifier"
simulation_dir = "/home/rio/ssh_simulations/ssh1/periodic_180_6561"
random_state = 257


In [4]:
%%time
#Starting an instance of Simulation with a decision tree
model = DecisionTreeClassifier(criterion="entropy")
#dict_args = {"csv_path":csv_path, "model":model, "allowed_windings":allowed_windings,\
#             "simulation_dir":simulation_dir, "val_split":val_split, "features_to_use":features_to_use,\
#            "shuffle_features":shuffle_features, "random_state":random_state}
simulation = Simulation(csv_path,model_name,model_kw,allowed_windings,simulation_dir,val_split,features_to_use,\
                       shuffle_features,random_state)
#simulation = Simulation(**dict_args)

print("Info on all data: \n")
simulation.dataframe.info()
simulation.dataframe.head()

  mask |= (ar1 == a)


Info on all data: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1180980 entries, 0 to 1180979
Columns: 188 entries, id to feat179
dtypes: float64(183), int32(3), object(2)
memory usage: 1.6+ GB
CPU times: user 20.1 s, sys: 1.21 s, total: 21.4 s
Wall time: 26.9 s


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat170,feat171,feat172,feat173,feat174,feat175,feat176,feat177,feat178,feat179
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.074536,0.074536,...,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.105409,0.105345,...,0.099052,0.097734,0.101326,0.10025,0.103106,0.102278,0.104383,0.103808,0.105152,0.104832
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,0.105409,0.105152,...,0.080748,0.075825,0.089392,0.085278,0.096296,0.093071,0.101326,0.099052,0.104383,0.103106
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,-0.074536,0.074536,...,-0.074536,0.074536,-0.074536,0.074536,-0.074536,0.074536,-0.074536,0.074536,-0.074536,0.074536
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,666,test,3e-05,-0.003649,...,-0.036024,-0.039459,-0.029026,-0.032545,-0.021886,-0.025472,-0.01464,-0.018274,-0.007323,-0.010988


#### Checking initialization

In [5]:
%%time
n_features = simulation.n_features
n_hamiltonians = simulation.n_hamiltonians
n_ts = simulation.n_ts

print("n_features: ", n_features)
print("n_hamiltonians: ", n_hamiltonians)
print("n_ts: ", n_ts)

n_features:  180
n_hamiltonians:  6561
n_ts:  2
CPU times: user 740 µs, sys: 53 µs, total: 793 µs
Wall time: 523 µs


In [6]:
%%time
n_total = len(simulation.dataframe)
n_train = len(simulation.dataframe[simulation.dataframe.type_of == "train"])
n_val = len(simulation.dataframe[simulation.dataframe.type_of == "val"])
n_test = len(simulation.dataframe[simulation.dataframe.type_of == "test"])
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.9358329522938577
% val:  0.0
% test:  0.06416704770614236
% train + val + test:  1.0


number of train hamiltonians:  6140
number of val hamiltonians:  0
number of test hamiltonians:  421
total number of hamiltonians:  6561


train ids:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 

## Running a simulation

In [7]:
%%time
simulation.run_simulation(n_experiments, start_n, fit_params, shuffle_rows, pred_params, random_features, \
                          store_in_lists, save_eigenvector, save_hamiltonian, save_accuracy,\
                          save_models)

simulation.dataframe.head(10)

running experiments:   0%|          | 0/5 [00:00<?, ?it/s]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  20%|██        | 1/5 [00:33<02:15, 33.90s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  40%|████      | 2/5 [01:06<01:40, 33.65s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  60%|██████    | 3/5 [01:42<01:08, 34.10s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments:  80%|████████  | 4/5 [02:16<00:34, 34.14s/it]

THIS IS  predict_params:  None
THIS IS  the type of predict_params:  <class 'NoneType'>


running experiments: 100%|██████████| 5/5 [02:47<00:00, 33.22s/it]

running experiments: 100%|██████████| 5/5 [02:47<00:00, 33.49s/it]

CPU times: user 2min 47s, sys: 335 ms, total: 2min 47s
Wall time: 2min 47s





Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat170,feat171,feat172,feat173,feat174,feat175,feat176,feat177,feat178,feat179
0,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.074536,0.074536,...,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536,0.074536
1,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.105409,0.105345,...,0.099052,0.097734,0.101326,0.10025,0.103106,0.102278,0.104383,0.103808,0.105152,0.104832
2,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,0.105409,0.105152,...,0.080748,0.075825,0.089392,0.085278,0.096296,0.093071,0.101326,0.099052,0.104383,0.103106
3,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.074536,0.074536,...,-0.074536,0.074536,-0.074536,0.074536,-0.074536,0.074536,-0.074536,0.074536,-0.074536,0.074536
4,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,3e-05,-0.003649,...,-0.036024,-0.039459,-0.029026,-0.032545,-0.021886,-0.025472,-0.01464,-0.018274,-0.007323,-0.010988
5,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.000365,0.006989,...,0.067476,0.07296,0.055548,0.061662,0.04254,0.049164,0.028703,0.035709,0.014308,0.021558
6,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,1,test,-0.105377,-0.104528,...,-0.050435,-0.040484,-0.068577,-0.059834,-0.083723,-0.076569,-0.095209,-0.089958,-0.102533,-0.099416
7,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,-0.002187,-0.013191,...,-0.092361,-0.097165,-0.079781,-0.086545,-0.063714,-0.072143,-0.044863,-0.054587,-0.02405,-0.034646
8,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.105391,0.104094,...,0.016379,0.001727,0.044446,0.030711,0.06907,0.057316,0.088342,0.07948,0.10077,0.095486
9,0,H_-2_-2_0.501338.dat,-2.0,-2.0,0.501338,999,0,test,0.000845,0.015506,...,0.103951,0.105371,0.095109,0.100508,0.078897,0.087858,0.056573,0.068401,0.029866,0.043644


#### Viewing a random sample

In [8]:
%%time
simulation.dataframe.sample(frac=0.1, replace=False)

CPU times: user 175 ms, sys: 27 µs, total: 175 ms
Wall time: 174 ms


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat170,feat171,feat172,feat173,feat174,feat175,feat176,feat177,feat178,feat179
1038159,5767,H_1.55_-1.2_0.000473277.dat,1.55,-1.20,0.000473,0,0,train,-0.000220,-0.060976,...,-0.091177,0.104951,-0.043074,-0.020732,0.100182,-0.100617,0.022131,0.041767,-0.104809,0.091885
433114,2406,H_-0.55_0.85_1.00066.dat,-0.55,0.85,1.000658,1,1,val,-0.000058,0.026641,...,0.091258,-0.075003,-0.104826,0.098644,0.100268,-0.105228,-0.078373,0.093617,0.042927,-0.065820
311087,1728,H_-0.95_-0.65_-0.00234999.dat,-0.95,-0.65,-0.002350,0,0,val,0.105404,-0.101576,...,-0.099423,0.085817,-0.098671,0.105084,-0.051750,0.075181,0.019386,0.010100,0.081450,-0.059707
391137,2172,H_-0.7_1.3_1.0007.dat,-0.70,1.30,1.000705,1,1,val,0.001298,0.095202,...,-0.001298,-0.095202,-0.063004,-0.050423,-0.100644,0.013617,-0.099841,0.072455,-0.060903,0.103618
158054,878,H_-1.5_1.4_0.000523596.dat,-1.50,1.40,0.000524,0,0,val,-0.001688,0.016006,...,-0.104088,-0.099825,0.103501,0.105384,-0.090431,-0.098232,0.066454,0.079231,-0.034461,-0.050675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119690,664,H_-1.6_-1.2_-0.00325386.dat,-1.60,-1.20,-0.003254,0,0,val,0.105394,-0.068906,...,0.054239,-0.103535,-0.103461,0.083985,0.084219,-0.008860,-0.009245,-0.072129,-0.071846,0.105387
35101,195,H_-1.9_-0.35_-0.000244908.dat,-1.90,-0.35,-0.000245,0,0,val,0.105409,-0.105403,...,0.099052,-0.098655,0.101326,-0.101005,0.103106,-0.102862,0.104383,-0.104218,0.105152,-0.105066
565473,3141,H_-0.1_1.15_1.001.dat,-0.10,1.15,1.000998,1,1,val,0.001540,0.104437,...,0.103529,-0.032204,0.016194,0.101432,-0.104660,0.025124,-0.008889,-0.103186,0.105280,-0.017922
643709,3576,H_0.2_-1.4_1.00095.dat,0.20,-1.40,1.000949,1,1,val,0.105409,-0.062514,...,-0.053043,0.104758,-0.052366,-0.042244,0.105409,-0.062514,-0.053043,0.104758,-0.052366,-0.042244


#### Checking train/val/test splits again

In [9]:
%%time
n_total = len(simulation.dataframe)
n_train = len(simulation.dataframe[simulation.dataframe.type_of == "train"])
n_val = len(simulation.dataframe[simulation.dataframe.type_of == "val"])
n_test = len(simulation.dataframe[simulation.dataframe.type_of == "test"])
n_train_hamiltonians = len(simulation.train_ids)
n_val_hamiltonians = len(simulation.val_ids)
n_test_hamiltonians = len(simulation.test_ids)
n_total_hamiltonians = n_train_hamiltonians + n_val_hamiltonians + n_test_hamiltonians

print("% train: ", n_train/n_total)
print("% val: ",  n_val/n_total)
print("% test: ", n_test/n_total)
print("% train + val + test: ", (n_train+n_val+n_test)/n_total)
print("\n")
print("number of train hamiltonians: ", n_train_hamiltonians)
print("number of val hamiltonians: ", n_val_hamiltonians)
print("number of test hamiltonians: ", n_test_hamiltonians)
print("total number of hamiltonians: ", n_total_hamiltonians)
print("\n")
print("train ids: ", simulation.train_ids)
print("val ids: ", simulation.val_ids)
print("test ids: ", simulation.test_ids)

% train:  0.09358329522938577
% val:  0.8422496570644719
% test:  0.06416704770614236
% train + val + test:  1.0


number of train hamiltonians:  614
number of val hamiltonians:  5526
number of test hamiltonians:  421
total number of hamiltonians:  6561


train ids:  [4175, 3100, 6026, 3821, 556, 2537, 416, 4928, 5216, 5207, 3563, 2247, 4157, 716, 3370, 4286, 2709, 6038, 2243, 4057, 4625, 4105, 6101, 4754, 1934, 1815, 5226, 6449, 5826, 5885, 3057, 1613, 78, 369, 2423, 636, 3329, 697, 2513, 3353, 1300, 1999, 1233, 5050, 444, 154, 3878, 1201, 445, 5794, 5994, 2358, 3108, 1777, 2572, 6198, 2033, 2871, 21, 6271, 1888, 5124, 5716, 1013, 2426, 1817, 3979, 1742, 4053, 5285, 643, 6405, 3157, 1406, 3845, 1497, 2298, 1718, 1122, 3274, 6490, 2516, 363, 3601, 4311, 3591, 2203, 4587, 1907, 2909, 1188, 3101, 5217, 2655, 5999, 612, 3535, 4316, 958, 4406, 5637, 2007, 5111, 5091, 2983, 2344, 1808, 3938, 2614, 1345, 3024, 6237, 5923, 1438, 2975, 2510, 3813, 2904, 4094, 5783, 5876, 5097, 5767, 5092, 4655

#### Checking summaries

In [10]:
%%time
ham_summary = simulation.hamiltonian_summary
print("length of ham_summary: ", len(ham_summary))
ham_summary

length of ham_summary:  6561
CPU times: user 168 µs, sys: 2 µs, total: 170 µs
Wall time: 111 µs


Unnamed: 0_level_0,t1,t2,type_of,0,1,phase,pred_phase
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-2.0,-2.00,test,0.594444,0.405556,999,0
1,-2.0,-1.95,test,0.761111,0.238889,0,0
2,-2.0,-1.90,test,0.833333,0.166667,0,0
3,-2.0,-1.85,test,0.900000,0.100000,0,0
4,-2.0,-1.80,val,0.988889,0.011111,0,0
...,...,...,...,...,...,...,...
6556,2.0,1.80,val,0.922222,0.077778,0,0
6557,2.0,1.85,test,0.905556,0.094444,0,0
6558,2.0,1.90,test,0.877778,0.122222,0,0
6559,2.0,1.95,test,0.738889,0.261111,0,0


In [11]:
eigen_summary = simulation.eigenvector_summary
print("length of ham_summary: ", len(eigen_summary))
eigen_summary

length of ham_summary:  1180980


Unnamed: 0,id,phase,pred_phase,type_of
0,0,999,1,test
1,0,999,1,test
2,0,999,1,test
3,0,999,1,test
4,0,999,0,test
...,...,...,...,...
1180975,6560,999,0,test
1180976,6560,999,0,test
1180977,6560,999,0,test
1180978,6560,999,1,test


#### Checking accuracies

In [12]:
simulation.accuracy

{'eigenvector_train': 0.989657980456026,
 'eigenvector_val': 0.971964853018056,
 'eigenvector_test': 0.7655555555555555,
 'hamiltonian_train': 1.0,
 'hamiltonian_val': 1.0,
 'hamiltonian_test': 1.0}

#### Checking data stored in  memory

In [13]:
ham_summary_list = simulation.hamiltonian_summary_list
ham_summary_list

[]

In [14]:
eigen_summary_list = simulation.eigenvector_summary_list
eigen_summary_list

[]

In [15]:
accuracy_list = simulation.accuracy_list
accuracy_list

{'eigenvector_train': [],
 'eigenvector_val': [],
 'eigenvector_test': [],
 'hamiltonian_train': [],
 'hamiltonian_val': [],
 'hamiltonian_test': []}