# Generate csv files 


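This notebook generates .csv files from a directory of .dat files containing Hamiltonian data. Each .dat file is assumed to be named `H_t1_t2_winding.dat`, where `t1`, `t2` and `winding` are the numeric values that end up in the dataframe columns of the same names (for example, `H_-2_-1.55_-0.00906169.dat`).  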

In [1]:
#from IPython.core.display import HTML
#HTML("<script>Jupyter.notebook.kernel.restart()</script>")
%load_ext autoreload
%autoreload 2

from preprocessing import *

### Set the parameters below carefully

In [2]:
%%time
#grid_folder = "/home/linneu/ssh_grids/ssh1/periodic_100_6561"
grid_folder = "/home/rio/ssh_grids/ssh1/periodic_100_6561"
allowed_windings = [0,1]
epsilon = 0.01
#csv_name = "/home/linneu/ssh_csvs/ssh1/periodic_100_6561.csv"
csv_dir = "/home/rio/ssh_csvs/ssh1"
csv_name = "periodic_100_6561" 


CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.29 µs


In [3]:
%%time
list_of_hamiltonians, raw_data = load_hamiltonians(grid_folder)
print("shape of raw_data: ", raw_data.shape)
print("len list_of_hamiltonians: ", len(list_of_hamiltonians))

dataframe = make_dataframe(raw_data, list_of_hamiltonians, allowed_windings = allowed_windings, epsilon = epsilon, \
                           csv_dir=csv_dir, csv_name = csv_name)

loading hamiltonians: 100%|██████████| 6561/6561 [00:08<00:00, 781.12it/s] 


shape of raw_data:  (656100, 104)
len list_of_hamiltonians:  6561
CPU times: user 1min 11s, sys: 5.09 s, total: 1min 16s
Wall time: 1min 12s


In [4]:
# Print the column/dtype/memory summary, then show the first five rows as the
# cell result.
dataframe.info()
dataframe.head(5)
#dataframe.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656100 entries, 0 to 656099
Columns: 108 entries, id to feat99
dtypes: float64(3), int32(1), int64(2), object(102)
memory usage: 538.1+ MB


Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat90,feat91,feat92,feat93,feat94,feat95,feat96,feat97,feat98,feat99
0,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.141421,0.141142,...,0.114412,0.108967,0.123928,0.119406,0.13149,0.127962,0.136978,0.1345,0.140306,0.138916
2,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.141421,0.140306,...,0.043702,0.0265,0.075777,0.060214,0.103092,0.090145,0.123928,0.114412,0.136978,0.13149
3,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,-0.1,0.1,...,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1,0.1
4,0,H_-2_-2_0.509296.dat,-2.0,-2.0,0.509296,999,666,test,0.000509,-0.008372,...,-0.082713,-0.089753,-0.067684,-0.075347,-0.051587,-0.059753,-0.034677,-0.043217,-0.01722,-0.026


#### Train/test 

In [5]:
# Count rows per split by summing boolean masks over the `type_of` column.
n_total = len(dataframe)
n_train = int((dataframe["type_of"] == "train").sum())
n_test = int((dataframe["type_of"] == "test").sum())

# NOTE: the printed values are fractions of n_total (between 0 and 1), not
# percentages, despite the "%" in the labels.
print("% train: ", n_train / n_total)
print("% test: ", n_test / n_total)
# Equals 1.0 when every row is labelled either "train" or "test".
print("% train + test: ", (n_train + n_test) / n_total)

% train:  0.846822130772748
% test:  0.15317786922725193
% train + test:  1.0


#### Unique phase values

In [6]:
# Sorted distinct values found in the `phase` column.
np.unique(dataframe["phase"].values)

array([  0,   1, 999], dtype=int32)

#### Checking that the t values are sorted in row-major order

In [7]:
# Disabled manual check: uncomment to print every (t1, t2) pair in row order
# and inspect the ordering by eye. The first line lifts NumPy's print
# truncation so the whole array is shown.
# NOTE(review): the final `np.set_printoptions()` call passes no arguments, so
# it leaves the current settings unchanged; to restore the default truncation
# use `np.set_printoptions(threshold=1000)`.
#np.set_printoptions(threshold=np.inf)
#t_array = dataframe[["t1", "t2"]].values
#print("t_array:\n", t_array)
#np.set_printoptions()

#### Making a grid

In [8]:
# Disabled: uncomment to pull the grid coordinates out of the dataframe
# (t2 as xx, t1 as yy) and print them.
#xx = dataframe.t2.values
#yy = dataframe.t1.values
#print("xx: \n", xx)
#print("yy: \n", yy)

In [9]:
# Peek at the training split: the first 1000 rows whose `type_of` is "train".
dataframe.loc[dataframe["type_of"] == "train"].head(1000)

Unnamed: 0,id,path,t1,t2,winding,phase,pred_phase,type_of,feat0,feat1,...,feat90,feat91,feat92,feat93,feat94,feat95,feat96,feat97,feat98,feat99
900,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.100000,0.100000,...,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
901,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.100000,-0.100000,...,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000,0.100000,-0.100000
902,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.141209,...,0.114412,0.109682,0.123928,0.120007,0.131490,0.128438,0.136978,0.134844,0.140306,0.139123
903,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.139516,...,-0.043702,-0.065115,0.008880,-0.014328,0.060214,0.038471,0.103092,0.085867,0.131490,0.121203
904,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.140572,...,0.043702,0.028721,0.075777,0.062255,0.103092,0.091879,0.123928,0.115729,0.136978,0.132307
905,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,-0.001067,0.006689,...,0.082260,0.088444,0.067193,0.073915,0.051067,0.058221,0.034136,0.041609,0.016666,0.024341
906,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.000071,0.015547,...,0.134522,0.138489,0.119444,0.127013,0.096861,0.107556,0.068193,0.081341,0.035239,0.050015
907,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,-0.000462,0.022678,...,0.134642,0.125751,0.141113,0.140740,0.127765,0.135962,0.096472,0.112088,0.051631,0.072472
908,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,-0.100000,0.100000,...,0.100000,-0.100000,-0.100000,0.100000,0.100000,-0.100000,-0.100000,0.100000,0.100000,-0.100000
909,9,H_-2_-1.55_-0.00906169.dat,-2.0,-1.55,-0.009062,0,666,train,0.141421,0.138018,...,-0.114497,-0.129785,-0.060345,-0.086667,0.008736,-0.022110,0.075655,0.047918,0.123859,0.106091


In [10]:
# A view of test data (disabled; uncomment to show the first 1000 rows whose
# `type_of` is "test").
#dataframe[dataframe.type_of == "test"].head(1000)

#### Clearing variables

In [11]:
# Optional cleanup (disabled): uncomment to delete the large objects created
# above once they are no longer needed in this kernel session.
#del list_of_hamiltonians
#del raw_data
#del dataframe