# Generate csv files 


This notebook generates .csv files from a directory containing .dat files with Hamiltonian data. Each .dat file is assumed to be named as `"H_t1_t2_winding.dat"`.  

In [6]:
#from IPython.core.display import HTML
#HTML("<script>Jupyter.notebook.kernel.restart()</script>")
%load_ext autoreload
%autoreload 2

from preprocessing import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set the parameters below carefully

In [7]:
%%time
grid_folder = "/home/linneu/ssh_grids/ssh1/periodic_100_6561"
allowed_windings = [0,1]
epsilon = 0.01
csv_name = "/home/linneu/ssh_csvs/ssh1/periodic_100_6561.csv"
to_csv = True


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [8]:
%%time
list_of_hamiltonians, raw_data = load_hamiltonians(grid_folder)
print("shape of raw_data: ", raw_data.shape)
print("len list_of_hamiltonians: ", len(list_of_hamiltonians))

dataframe = make_dataframe(raw_data, list_of_hamiltonians, allowed_windings = allowed_windings, epsilon = epsilon, \
                           csv_name = csv_name, to_csv = to_csv)

loading hamiltonians: 100%|██████████| 6561/6561 [00:11<00:00, 574.05it/s]


shape of raw_data:  (656100, 104)
len list_of_hamiltonians:  6561
CPU times: user 1min 22s, sys: 7.05 s, total: 1min 29s
Wall time: 1min 28s


In [9]:
dataframe.info()
dataframe.head()
#dataframe.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656100 entries, 0 to 656099
Columns: 108 entries, id to feat99
dtypes: float64(3), int32(1), int64(2), object(102)
memory usage: 538.1+ MB


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat90,feat91,feat92,feat93,feat94,feat95,feat96,feat97,feat98,feat99
0,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.141421,0.141142,...,0.114412,0.108967,0.123928,0.119406,0.13149,0.127962,0.136978,0.1345,0.140306,0.138916
2,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.141421,0.140306,...,0.043702,0.0265,0.075777,0.060214,0.103092,0.090145,0.123928,0.114412,0.136978,0.13149
3,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,-0.1,0.1,...,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1,0.1
4,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.000509,-0.008372,...,-0.082713,-0.089753,-0.067684,-0.075347,-0.051587,-0.059753,-0.034677,-0.043217,-0.01722,-0.026


#### Checking t values are row-major sorted

In [10]:
#np.set_printoptions(threshold=np.inf)
t_array = dataframe[["t1", "t2"]].values
print("t_array:\n", t_array)
np.set_printoptions()

t_array:
 [[-2. -2.]
 [-2. -2.]
 [-2. -2.]
 ...
 [ 2.  2.]
 [ 2.  2.]
 [ 2.  2.]]


#### Making a grid

In [11]:
xx = dataframe.t2.values
yy = dataframe.t1.values
print("xx: \n", xx)
print("yy: \n", yy)

xx: 
 [-2. -2. -2. ...  2.  2.  2.]
yy: 
 [-2. -2. -2. ...  2.  2.  2.]


#### Train/test 

In [12]:
n_total = len(dataframe)
n_train = len(dataframe[dataframe.type_of == "train"])
n_test = len(dataframe[dataframe.type_of == "test"])
print("% train: ", n_train/n_total)
print("% test: ", n_test/n_total)
print("% train + test: ", (n_train+n_test)/n_total)

% train:  0.846822130772748
% test:  0.15317786922725193
% train + test:  1.0


In [13]:
# A view of train data
dataframe[dataframe.type_of == "train"].head(1000)

Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat90,feat91,feat92,feat93,feat94,feat95,feat96,feat97,feat98,feat99
900,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.100000,0.100000,...,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
901,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.100000,-0.100000,...,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000
902,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.141209,...,0.114412,0.109682,0.123928,0.120007,0.131490,0.128438,0.136978,0.134844,0.140306,0.139123
903,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.139516,...,-0.043702,-0.065115,0.008880,-0.014328,0.060214,0.038471,0.103092,0.085867,0.131490,0.121203
904,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.140572,...,0.043702,0.028721,0.075777,0.062255,0.103092,0.091879,0.123928,0.115729,0.136978,0.132307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,18,H_-2_-1.1_-0.00325923.dat,-2.0,-1.10,-0.003259,0,666,train,0.141421,0.135641,...,-0.043989,-0.079976,0.076032,0.106469,-0.103298,-0.126273,0.124074,0.138142,-0.137053,-0.141332
1896,18,H_-2_-1.1_-0.00325923.dat,-2.0,-1.10,-0.003259,0,666,train,-0.000125,0.077667,...,-0.134461,-0.136402,0.109047,0.041557,0.017601,0.091867,-0.127909,-0.140007,0.119473,0.058172
1897,18,H_-2_-1.1_-0.00325923.dat,-2.0,-1.10,-0.003259,0,666,train,-0.001253,0.038525,...,0.134882,0.117508,-0.120073,-0.094248,0.097719,0.065065,-0.069226,-0.031794,0.036383,-0.003474
1898,18,H_-2_-1.1_-0.00325923.dat,-2.0,-1.10,-0.003259,0,666,train,0.000341,0.077486,...,0.134394,-0.136459,-0.109184,0.041764,-0.017387,0.091702,0.127816,-0.140037,-0.119588,0.058369


In [14]:
# A view of test data
dataframe[dataframe.type_of == "test"].head(1000)

Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat90,feat91,feat92,feat93,feat94,feat95,feat96,feat97,feat98,feat99
0,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.100000,0.100000,...,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
1,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.141421,0.141142,...,0.114412,0.108967,0.123928,0.119406,0.131490,0.127962,0.136978,0.134500,0.140306,0.138916
2,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.141421,0.140306,...,0.043702,0.026500,0.075777,0.060214,0.103092,0.090145,0.123928,0.114412,0.136978,0.131490
3,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,-0.100000,0.100000,...,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000
4,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.000509,-0.008372,...,-0.082713,-0.089753,-0.067684,-0.075347,-0.051587,-0.059753,-0.034677,-0.043217,-0.017220,-0.026000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7895,78,H_-2_1.9_0.0349338.dat,-2.0,1.9,0.034934,0,666,test,-0.141421,0.115664,...,0.141421,-0.115664,-0.044160,0.113134,-0.114128,0.045743,0.114695,-0.141405,0.043243,0.041650
7896,78,H_-2_1.9_0.0349338.dat,-2.0,1.9,0.034934,0,666,test,0.001979,0.129874,...,-0.001979,-0.129874,0.081516,-0.137969,0.133875,-0.093364,0.135098,-0.013097,0.084718,0.072172
7897,78,H_-2_1.9_0.0349338.dat,-2.0,1.9,0.034934,0,666,test,-0.141421,-0.116087,...,0.141421,0.116087,-0.043458,-0.112689,-0.114563,-0.046441,0.114261,0.141392,0.043945,-0.040944
7898,78,H_-2_1.9_0.0349338.dat,-2.0,1.9,0.034934,0,666,test,0.000057,0.081027,...,-0.000057,-0.081027,-0.134482,-0.085197,0.083171,0.133681,0.083079,0.002577,-0.134517,-0.135274


#### Unique winding values

In [15]:
np.unique(dataframe.phase.values)

array([  0,   1, 999], dtype=int32)