# Merge all csv files

In [319]:
# CSV Merge
import pandas as pd
import glob
import os

# merging the files
files_joined = os.path.join('../STED', "*.csv")

# Return a list of all joined files
list_files = glob.glob(files_joined)

print("** Merging multiple csv files into a single pandas dataframe **")

# Merge files by joining all files
df = pd.concat(map(pd.read_csv, list_files), ignore_index=False)
print("The number of dataframe: ", len(df))

** Merging multiple csv files into a single pandas dataframe **


  sort=sort,


The number of dataframe:  1265657


In [321]:
modified_file_csv = "merge_chunks.csv"
df.to_csv(modified_file_csv, index=None)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1265657 entries, 0 to 235425
Data columns (total 35 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   network_code                      1265613 non-null  object 
 1   receiver_code                     1265657 non-null  object 
 2   receiver_type                     1265657 non-null  object 
 3   receiver_latitude                 1265657 non-null  float64
 4   receiver_longitude                1265657 non-null  float64
 5   receiver_elevation_m              1265657 non-null  float64
 6   p_arrival_sample                  1030231 non-null  float64
 7   p_status                          1030231 non-null  object 
 8   p_weight                          1030057 non-null  float64
 9   p_travel_sec                      1030231 non-null  float64
 10  s_arrival_sample                  1030231 non-null  float64
 11  s_status                          1030

# Lighten up data

In [1]:
import pandas as pd
import h5py
import numpy as np
import matplotlib.pyplot as plt

file_name = "/home/STED/chunk2.hdf5"
csv_file = "/home/STED/chunk2.csv"

df = pd.read_csv(csv_file)
print(f'total events in csv file: {len(df)}')

# To distinguish between small earthquakes and noise
df = df[((df.trace_category == 'earthquake_local') & (df.source_magnitude <= 2)) | (df.trace_category == 'noise')]
print(f'total events selected: {len(df)}')
df.to_csv("/home/STED/chunk2_modified.csv")

ev_list = df['trace_name'].to_list()

dtfl = h5py.File(file_name, 'r')
processed = h5py.File("/home/STED/chunk2_modified.hdf5", 'w')
grp = processed.create_group("data")

for c, evi in enumerate(ev_list):
    dataset = dtfl.get('data/'+str(evi)) 
    dst = grp.create_dataset(str(evi), data=dataset)
    
    for key, value in dataset.attrs.items():
        dst.attrs[key] = value
        
    if c % 1000 == 0:
        print(str(c) + " / " + str(len(ev_list)))

dtfl.close()

  interactivity=interactivity, compiler=compiler, result=result)


total events in csv file: 200000
total events selected: 165214
0 / 165214
1000 / 165214
2000 / 165214
3000 / 165214
4000 / 165214
5000 / 165214
6000 / 165214
7000 / 165214
8000 / 165214
9000 / 165214
10000 / 165214
11000 / 165214
12000 / 165214
13000 / 165214
14000 / 165214
15000 / 165214
16000 / 165214
17000 / 165214
18000 / 165214
19000 / 165214
20000 / 165214
21000 / 165214
22000 / 165214
23000 / 165214
24000 / 165214
25000 / 165214
26000 / 165214
27000 / 165214
28000 / 165214
29000 / 165214
30000 / 165214
31000 / 165214
32000 / 165214
33000 / 165214
34000 / 165214
35000 / 165214
36000 / 165214
37000 / 165214
38000 / 165214
39000 / 165214
40000 / 165214
41000 / 165214
42000 / 165214
43000 / 165214
44000 / 165214
45000 / 165214
46000 / 165214
47000 / 165214
48000 / 165214
49000 / 165214
50000 / 165214
51000 / 165214
52000 / 165214
53000 / 165214
54000 / 165214
55000 / 165214
56000 / 165214
57000 / 165214
58000 / 165214
59000 / 165214
60000 / 165214
61000 / 165214
62000 / 165214
63000

## 서버 Memory 부족 문제로 데이터셋의 수를 줄임

In [2]:
import pandas as pd
import h5py
import numpy as np
import matplotlib.pyplot as plt
import random
import csv

file_name = "../STED/chunk2_modified.hdf5"
csv_file = "../STED/chunk2_modified.csv"
modified_file_name = "../STED/new_chunk2_modified.hdf5"
modified_file_csv = "../STED/new_chunk2_modified.csv"


df = pd.read_csv(csv_file, index_col=0)

num = list(range(1000))
random.shuffle(num)

new_df = df[0:1]

for c, i in enumerate(num):
    new_df = new_df.append(df[i:i+1])
    if c % 1000 == 0:
        print(str(c) + " / " + str(len(num)))

new_df=new_df[1:]
new_df.to_csv(modified_file_csv)

ev_list = new_df['trace_name'].to_list()

dtfl = h5py.File(file_name, 'r')
processed = h5py.File(modified_file_name, 'w')
grp = processed.create_group("data")

for c, evi in enumerate(ev_list):
    dataset = dtfl.get('data/'+str(evi)) 
    dst = grp.create_dataset(str(evi), data=dataset)
    
    for key, value in dataset.attrs.items():
        dst.attrs[key] = value
        
    if c % 1000 == 0:
        print(str(c) + " / " + str(len(ev_list)))

dtfl.close()
processed.close()

  interactivity=interactivity, compiler=compiler, result=result)


0 / 1000
0 / 1000
