In [1]:
# 4 wells (6507_2-1, 6507_2-2, 6507_2-4, 6507_3-9S):
# - logs (Density, P-wave, Phi, Vcl, Water Saturation)
# - synthetic-scaled
# - attributes (4)


In [2]:
import pandas as pd
import numpy as np

# for plotting 
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# define base color
base_color = sb.color_palette()[0]

import glob
import os
import re

## Read data
### Read Logs data

In [3]:
# Input folders (relative paths).
# Well data: logs, synthetic seismic and attributes from synthetic traces
logs_location = 'data/wells/Logs/'
seismic_logs_location = 'data/wells/Synthetic_seismic/'
att_logs_location = 'data/wells/Attributes_from_synthetic_traces/'

# 2D sections read for exercise 1 and exercise 2
d2_location = 'data/2D_1/'
d2_2_location = 'data/2D_2/'

In [4]:
# logs_1=pd.read_csv(logs_location+'6507_2-1_logs.csv') 
# fails with an error because the file contains extra rows of information in addition to the data

In [5]:
# Read one well-log file:
# - the first 16 rows hold information rather than data, so they are skipped
# - the column names are supplied explicitly
logs_1 = pd.read_csv(
    logs_location + '6507_2-1_logs.csv',
    skiprows=16,
    names=['Time', 'Density', 'P_wave', 'Porosity', 'V_clay', 'Water Saturation'],
)

In [6]:
# Read every well-log CSV found in the logs folder into one dataframe.

log_columns = ['Time', 'Density', 'P_wave', 'Porosity', 'V_clay', 'Water Saturation']
log_suffix = '_logs.csv'

log_frames = []

# sorted() makes the row order independent of the file-system listing order
for file_name in sorted(glob.glob(logs_location + '*.csv')):
    # read file: skip the 16 rows of information, name the columns explicitly
    df = pd.read_csv(file_name, skiprows=16, names=log_columns, low_memory=False)

    # add log ID to new column: the file name without its directory and without
    # the trailing '_logs.csv'. Taken from the base name so that the result
    # does not depend on the length of logs_location.
    df['log_ID'] = os.path.basename(file_name)[:-len(log_suffix)]

    log_frames.append(df)

# concatenate once (instead of growing a dataframe inside the loop) and
# number the rows 0..n-1; an empty folder still yields an empty dataframe
logs = pd.concat(log_frames, axis=0, ignore_index=True) if log_frames else pd.DataFrame()

In [7]:
# -999.2500 stands for a missing value (equivalent to NaN);
# count how often it occurs in each column
logs.eq(-999.2500).sum()

Time                    0
Density               935
P_wave              19293
Porosity            28608
V_clay              16457
Water Saturation    27953
log_ID                  0
dtype: int64

In [8]:
# Drop all rows containing the -999.2500 missing-value marker.
# Every measured curve is checked, including 'Water Saturation', so the marker
# cannot survive in any column and distort later statistics.
null_value = -999.2500
curve_columns = ['Density', 'P_wave', 'Porosity', 'V_clay', 'Water Saturation']

# True for rows where at least one curve holds the marker
has_null = logs[curve_columns].eq(null_value).any(axis=1)

# keep the complete rows and re-index the dataframe
logs = logs.loc[~has_null].reset_index(drop=True)

In [9]:
# number of rows per log_ID
logs['log_ID'].value_counts()

6507_2-1     26600
6507_2-4     20025
6507_2-2     16803
6507_3-9S    11325
6507_6-4A      563
6507_5-5       326
6507_3-6       116
Name: log_ID, dtype: int64

In [10]:
# summary statistics of the numeric columns of logs
logs.describe()

Unnamed: 0,Time,Density,P_wave,Porosity,V_clay,Water Saturation
count,75758.0,75758.0,75758.0,75758.0,75758.0,75758.0
mean,2231.724908,2.259806,2623.788348,0.131551,0.643055,0.99222
std,792.450806,0.254068,625.932406,0.052663,0.139405,0.067032
min,514.8757,1.2542,69.0955,0.0,0.0,0.0
25%,1623.530175,2.0852,2170.0437,0.0882,0.6133,1.0
50%,2295.5115,2.2585,2485.396,0.1213,0.6676,1.0
75%,2866.009375,2.4816,2997.033275,0.1627,0.7179,1.0
max,3693.6333,3.069,6082.1841,0.5738,1.0,1.0


### Read synthetic seismic wells

In [11]:
# Read every synthetic-seismic file into one dataframe with columns
# Time, Seismic and log_ID.
seismic_frames = []

# sorted() makes the row order independent of the file-system listing order
for file_name in sorted(glob.glob(seismic_logs_location + '*')):
    # read file, skip 1st row, specify column names;
    # sep=r'\s+' replaces the deprecated delim_whitespace=True
    df = pd.read_table(file_name, skiprows=1, sep=r'\s+',
                       names=['Time', 'Seismic'])

    # add log ID: the base file name without its last 17 characters.
    # Using the base name keeps this independent of the length of
    # seismic_logs_location.
    df['log_ID'] = os.path.basename(file_name)[:-17]

    seismic_frames.append(df)

# concatenate once and number the rows 0..n-1;
# an empty folder still yields an empty dataframe
seismic = pd.concat(seismic_frames, axis=0, ignore_index=True) if seismic_frames else pd.DataFrame()

# drop columns with NaN
seismic = seismic.dropna(axis=1)

### Read attributes from synthetic traces 

In [12]:
# Patterns applied to each attribute file name:
# p  - the log ID (matches IDs such as '6507_3-9S')
# p1 - the attribute name (first capitalised run of letters)
p = re.compile(r'\d+_\d-\dS*')
p1 = re.compile(r'[A-Z][A-Za-z]+')

# Only the immediate sub-directories of the attributes folder are listed here.
# Walking the top level recursively as well would visit nested directories
# a second time.
_, sub_dirs, _ = next(os.walk(att_logs_location), (att_logs_location, [], []))

att_frames = []

for d in sorted(sub_dirs):
    # attributes found in this sub directory, merged column by column;
    # reset for every sub directory so nothing is carried over from the previous one
    seismic_att1 = None

    for root_1, dirs_1, files_1 in os.walk(os.path.join(att_logs_location, d)):
        for file in sorted(files_1):

            # find attribute name from file_name string
            column_name = p1.findall(file)[0]

            # read file from the directory it was actually found in (root_1);
            # sep=r'\s+' replaces the deprecated delim_whitespace=True
            df = pd.read_table(os.path.join(root_1, file), skiprows=1, sep=r'\s+',
                               names=['Time', column_name])

            # add log ID column
            df['log_ID'] = p.findall(file)[0]

            if seismic_att1 is None:
                seismic_att1 = df
            else:
                seismic_att1 = pd.merge(seismic_att1, df, on=['Time', 'log_ID'])

    # a sub directory without files contributes nothing
    if seismic_att1 is not None:
        att_frames.append(seismic_att1)

# stack the per-directory frames once; no files at all yields an empty dataframe
seismic_att = pd.concat(att_frames, axis=0) if att_frames else pd.DataFrame()
        

In [13]:
# combine the synthetic seismic with its attributes:
# rows are matched on Time and log_ID, only rows present in both are kept
seismic_df = seismic.merge(seismic_att, how='inner', on=['Time', 'log_ID'])

### Read 2D data for exercise 1

In [14]:
# attribute name = first capitalised run of letters in the file name
p1 = re.compile(r'[A-Z][A-Za-z]+')

section_stack = None

# sorted() makes the column order independent of the file-system listing order
for file in sorted(glob.glob(d2_location + '*')):
    # find attribute name from file_name string (base name only, so that
    # the directory part of the path can never be matched)
    column_name = p1.findall(os.path.basename(file))[0]

    # read file; sep=r'\s+' replaces the deprecated delim_whitespace=True
    df = pd.read_table(file, skiprows=0, sep=r'\s+',
                       names=['Time', column_name, 'X', 'Trace'])

    # fill in trace nr: assign the forward-filled column back, because
    # df['Trace'].fillna(..., inplace=True) works on a temporary object and
    # leaves df unchanged when pandas Copy-on-Write is active
    df['Trace'] = df['Trace'].ffill()

    if section_stack is None:
        section_stack = df
    else:
        section_stack = pd.merge(section_stack, df, on=['Time', 'X', 'Trace'])

if section_stack is None:
    raise FileNotFoundError('no 2D section files found in ' + d2_location)

section_stack = (
    # remove rows that contain '-> Trace # XXX'
    section_stack[section_stack['X'] != '#']
    # remove column 'X'
    .drop(columns='X')
    # reset index
    .reset_index(drop=True)
)

### Read 2D data for exercise 2

In [15]:
section_stack2 = None

# sorted() makes the column order independent of the file-system listing order
for file in sorted(glob.glob(d2_2_location + '*')):
    # find attribute name from file_name string: the base file name without
    # its last 8 characters (independent of the length of d2_2_location)
    column_name = os.path.basename(file)[:-8]

    # read file; sep=r'\s+' replaces the deprecated delim_whitespace=True
    df = pd.read_table(file, skiprows=0, sep=r'\s+',
                       names=['Time', column_name, 'X', 'Trace'])

    # fill in trace nr: assign the forward-filled column back, because
    # df['Trace'].fillna(..., inplace=True) works on a temporary object and
    # leaves df unchanged when pandas Copy-on-Write is active
    df['Trace'] = df['Trace'].ffill()

    if section_stack2 is None:
        section_stack2 = df
    else:
        section_stack2 = pd.merge(section_stack2, df, on=['Time', 'X', 'Trace'], how='outer')

if section_stack2 is None:
    raise FileNotFoundError('no 2D section files found in ' + d2_2_location)

section_stack2 = (
    # remove rows that contain '-> Trace # XXX'
    section_stack2[section_stack2['X'] != '#']
    # remove column 'X'
    .drop(columns='X')
    # reset index
    .reset_index(drop=True)
)

## Rename columns

In [16]:
# check if you need to rename some columns: list the columns of every dataframe
for label, frame in (('logs: ', logs),
                     ('seismic_df: ', seismic_df),
                     ('section_stack: ', section_stack),
                     ('section_stack2: ', section_stack2)):
    print(label, frame.columns, '\n')

# find difference in column names: columns of the first dataframe
# that are missing from the second one
for label, left, right in (('log and section_stack: ', logs, section_stack),
                           ('seismic_df and section_stack: ', seismic_df, section_stack),
                           ('log and section_stack2: ', logs, section_stack2),
                           ('section_stack and section_stack2: ', section_stack, section_stack2)):
    print(label, left.columns.difference(right.columns), '\n')

logs:  Index(['Time', 'Density', 'P_wave', 'Porosity', 'V_clay', 'Water Saturation',
       'log_ID'],
      dtype='object') 

seismic_df:  Index(['Time', 'Seismic', 'log_ID', 'QuadTrace', 'SecDerInstAmpl',
       'Intergated', 'AWCP', 'Der', 'AWF', 'DomF', 'InstF', 'SecDer', 'AmpEn',
       'AppPolr'],
      dtype='object') 

section_stack:  Index(['Time', 'DomF', 'Trace', 'AppPolr', 'Seismic', 'Der', 'QuadrTrace',
       'SecDerInstAmpl', 'Integrated', 'AmpEnv', 'Velocity', 'Density', 'AWF',
       'SecDer', 'AWCP', 'InstF'],
      dtype='object') 

section_stack2:  Index(['Time', 'P-imp', 'Trace', 'Porosity', 'Dn', 'Vcl', 'P-wave', 'Seismic'], dtype='object') 

log and section_stack:  Index(['P_wave', 'Porosity', 'V_clay', 'Water Saturation', 'log_ID'], dtype='object') 

seismic_df and section_stack:  Index(['AmpEn', 'Intergated', 'QuadTrace', 'log_ID'], dtype='object') 

log and section_stack2:  Index(['Density', 'P_wave', 'V_clay', 'Water Saturation', 'log_ID'], dtype='object') 



In [17]:
# rename columns so that the same quantity has the same name in every dataframe
section_stack = section_stack.rename(columns={'Velocity': 'P_wave'})

seismic_df = seismic_df.rename(columns={'Intergated': 'Integrated',
                                        'AmpEn': 'AmpEnv',
                                        'QuadTrace': 'QuadrTrace'})

# 'Dn', 'P-wave' and 'Vcl' are columns of section_stack2 (section_stack has none
# of them), so this rename has to be applied to section_stack2
section_stack2 = section_stack2.rename(columns={'Dn': 'Density',
                                                'P-wave': 'P_wave',
                                                'Vcl': 'V_clay'})

## Add horizons
Horizon| 6507/3-9s | 6507/2-4 | 6507/2-2 | 6507/2-1
---|---|---|---|---
2D-wb| 488 |492 |516 | 512
2D-mid | 1835 |1827 |1831| 1831
2D-mid2 | 2714 |2726 |2694 |2830
2D-K65 | 3022 |3082 |3050 |3169
2D-BCU |3241 |3233 |3110 |3257
2D-bottom | Time_max | | |

In [18]:
# largest Time value found in the logs or in the synthetic seismic;
# used as the time of the '2D-bottom' horizon in every well
bottom = max(logs['Time'].max(), seismic_df['Time'].max())

# horizon times per well, one row per horizon.
# The first horizon is '2D-wb', the same label used for the 2D section horizons.
# The frame is built straight from the rows: going through np.array would turn
# every number into a string.
horison_logs = pd.DataFrame(
    [['2D-wb', 488, 492, 516, 512],
     ['2D-mid', 1835, 1827, 1831, 1831],
     ['2D-mid2', 2714, 2726, 2694, 2830],
     ['2D-K65', 3022, 3082, 3050, 3169],
     ['2D-BCU', 3241, 3233, 3110, 3257],
     ['2D-bottom', bottom, bottom, bottom, bottom]],
    columns=['Horison', '6507_3-9S', '6507_2-4', '6507_2-2', '6507_2-1'])

# reshape df (unstack): one row per (Horison, log_ID) pair with its Time
horison_logs = horison_logs.melt(id_vars='Horison', var_name='log_ID', value_name='Time')
horison_logs['Time'] = horison_logs['Time'].astype(float)

In [19]:
# combine logs + horisons

# add horisons: the horizon rows are interleaved with the samples of their well
logs = (
    pd.concat([logs, horison_logs], join='outer')
    .sort_values(['log_ID', 'Time'])
    .reset_index(drop=True)
)

# backward fill horisons missing values: every sample gets the next horizon
# below it. The fill is done per log_ID so that a well can never pick up a
# horizon that belongs to the following well (wells without horizon rows
# keep NaN).
logs['Horison'] = logs.groupby('log_ID')['Horison'].bfill()

# keep only the log samples (the horizon marker rows have no P_wave value)
logs = logs[logs['P_wave'].notna()]

In [20]:
# read the 2D horizons, skipping the first 10 rows of the file;
# the folder is spelled 'data/' like every other path in this notebook
# ('Data/' only resolves on case-insensitive file systems)
horizons = pd.read_csv('data/2D_horizons.csv', skiprows=10)

# the column labels read from the file are fragments of header text;
# map them to the CDP and horizon names
horizons = horizons.rename(columns={'<CDP>': 'CDP',
                                    '<Domain: 2D-': '2D-BCU',
                                    'BCU> <Domain:': '2D-bottom',
                                    '2D-bottom> <Domain': '2D-K65',
                                    ': 2D-K65> <Dom': '2D-mid',
                                    'D-mid> <Doma': '2D-mid2',
                                    'in: 2D-mid2> <': '2D-wb'})

# drop the two remaining header-fragment columns
horizons = horizons.drop(['ain: 2', 'Domain: 2D-wb>'], axis=1)

# add bottom value: a constant 'bottom' horizon at time 5000 for every CDP
# NOTE(review): presumably chosen to lie below all samples - confirm
horizons['bottom'] = 5000

In [21]:
# display the horizons table (one row per CDP, one column per horizon)
horizons

Unnamed: 0,CDP,2D-BCU,2D-bottom,2D-K65,2D-mid,2D-mid2,2D-wb,bottom
0,1,2466.5352,3078.6089,2460.0000,1724.6871,2411.2976,509.44284,5000
1,2,2466.5352,3079.8782,2460.0000,1724.2711,2410.4529,509.44284,5000
2,3,2466.5352,3081.0125,2460.0000,1725.1711,2410.4529,509.44284,5000
3,4,2485.7712,3083.6541,2465.4553,1725.8091,2415.5444,509.08835,5000
4,5,2505.0073,3085.8909,2471.8271,1727.4093,2420.6357,508.96198,5000
...,...,...,...,...,...,...,...,...
789,790,3577.1155,4618.8940,3257.0779,1808.4211,2861.4380,433.26376,5000
790,791,3579.8704,4623.3291,3259.6245,1809.7360,2862.6533,437.00485,5000
791,792,3582.6707,4627.3184,3260.4734,1805.0125,2864.1213,439.36432,5000
792,793,3584.2341,4629.3965,3262.8542,1804.6062,2864.9619,440.02606,5000


In [22]:
# reshape the wide horizons table into long form, one row per (trace, horizon):
# Trace 1 - hor 1 - value
# Trace 1 - hor 2 - value
horison_unstacked = (
    horizons
    .set_index('CDP')
    .stack()
    # name the two index levels and the values directly
    .rename_axis(['Trace', 'Horison'])
    .rename('Time')
    .reset_index()
    # sort by Trace name and Time
    .sort_values(['Trace', 'Time'])
    .reset_index(drop=True)
)

In [23]:
# convert the Time column of the section to float
section_stack = section_stack.assign(Time=section_stack['Time'].astype(float))

In [24]:
# add horisons: the horizon rows are interleaved with the samples of their trace
section_stack = (
    pd.merge(section_stack, horison_unstacked, how='outer', on=['Trace', 'Time'])
    .sort_values(['Trace', 'Time'])
)

# backward fill horisons missing values: every sample gets the next horizon
# below it. The fill is done per Trace so that samples can never pick up a
# horizon that belongs to the following trace.
section_stack['Horison'] = section_stack.groupby('Trace')['Horison'].bfill()

# drop all missing values: rows without a Density value
# (the horizon marker rows added by the merge) are removed
section_stack = section_stack[section_stack['Density'].notna()]

## Save dataframes to csv files

In [25]:
# save the logs dataframe without the index column
logs.to_csv('data/logs.csv',index=False)

In [26]:
# save the seismic_df dataframe without the index column
seismic_df.to_csv('data/seismic_logs.csv',index=False)

In [27]:
# save the section_stack dataframe without the index column
section_stack.to_csv('data/section_2D_1.csv',index=False)

In [28]:
# save the section_stack2 dataframe without the index column
section_stack2.to_csv('data/section_2D_2.csv',index=False)