# Quato data

`csv file` containing filename entries, folders and other metadata

We start by loading this file:

In [7]:
import pandas as pd
import numpy as np
import os
df = pd.read_csv('data/largeDoseOfTurbiniumForQuato_reduced.csv')
pd.set_option('display.max_colwidth', -1)

### show first few lines

In [8]:
df.head()

Unnamed: 0,file,directory,date_logged,log_name,contractor,data_type,log_service,log_activity,log_type,log_job,log_run,log_pass,casing_size_manual,section_size,station_number,station_depth,mfc_correction,interpretation_number,date_stamp
0,CONTENT-MUD_LOG_1.DLIS-112806.TXT,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2T2,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:42:19
1,MUD_LOG_1_INF_1-112807.ASC,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2T2,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:43:09
2,CONTENT-MUD_LOG_1.DLIS-112797.TXT,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:31:48
3,MUD_LOG_1_INF_1-112798.ASC,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:32:29
4,CONTENT-MUD_LOG_1.DLIS-112715.TXT,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_A,2014/03/31,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:22:39:44


### Features

file directory

### Labels

date_logged	log_name	contractor	data_type	log_service	log_activity	log_type	log_job	log_run	log_pass	casing_size_manual	section_size	station_number	station_depth	mfc_correction	interpretation_number	date_stamp

### Clean and rename columns for clarity sake

In [9]:
COLUMN_MAPPING = {
        'directory': 'file_directory',
        'file': 'filename',
        'contractor': 'company',
        'log_service': 'service',
        'log_activity': 'activity',
    }
df.columns = [col.strip() for col in df.columns]
 # column names have lots of surrounding whitepaces
df.columns = [col.strip() for col in df.columns]

# Ensure every string is stripped of random whitepaces.
df = df.applymap(lambda val: val.strip() if hasattr(val, 'strip') else val)

# Rename columns
df = df.rename(columns=COLUMN_MAPPING)

# Fill remaining NaNs with empty target
df = df.fillna(value='')

In [10]:
df.head()

Unnamed: 0,filename,file_directory,date_logged,log_name,company,data_type,service,activity,log_type,log_job,log_run,log_pass,casing_size_manual,section_size,station_number,station_depth,mfc_correction,interpretation_number,date_stamp
0,CONTENT-MUD_LOG_1.DLIS-112806.TXT,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2T2,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:42:19
1,MUD_LOG_1_INF_1-112807.ASC,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2T2,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:43:09
2,CONTENT-MUD_LOG_1.DLIS-112797.TXT,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:31:48
3,MUD_LOG_1_INF_1-112798.ASC,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_EY2,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:32:29
4,CONTENT-MUD_LOG_1.DLIS-112715.TXT,/project/recall/prod/arc/norway_a/files/GRANE/NO_25_11_G_19_A,2014/03/31,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:22:39:44


### Test and train data split

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

### Join filename and filedirectory into a single string

In [12]:
#a new dataframe with just the feature data
X_train = pd.DataFrame()
X_test = pd.DataFrame()
X_train['filename'] = train.apply(lambda line: os.path.join(line['file_directory'], line['filename']), axis=1)
X_test['filename'] = test.apply(lambda line: os.path.join(line['file_directory'], line['filename']), axis=1)

### Get company as label

In [13]:
#y_train =train['data_type'].astype(str)
#y_test = test['data_type'].astype(str)
# y_train =train['company'].astype(str)
# y_test = test['company'].astype(str)
# y_train = train.drop(labels = ['file_directory','filename'], axis=1)
# y_test = train.drop(labels = ['file_directory','filename'], axis=1)
labels = ['company','data_type','log_name']
y_train = train.loc[:, labels]
y_test = test.loc[:, labels]
y_train = y_train.astype(str)
y_test = y_test.astype(str)
print(X_train.head(), y_train.head())
y_train.head()

                                                                                                                              filename
7034  /project/recall/prod/arc/norway_a/tapes/STATFJORD/NO_33_12_B_34_A/WL_RAW_BHPR-GR-MECH_TIME_MWD_1-54609.las                      
5901  /project/recall/prod/arc/norway_a/films/STATFJORD/NO_33_9_C_38_D/Basic_Channel_Description-54296.pdf                            
9738  /project/recall/prod/arc/norway_a/films/GULLFAKS_S_R/NO_34_10_O_2_H/Gullfaks_Sor_34_10-O-2_H_Bit_#2_17.5x20in_303_803m-70588.pdf
8948  /project/recall/prod/arc/norway_a/films/GULLFAKS/NO_34_10_A_18_AT2/WL_RAW_BHPR-GR-MECH_TIME_MWD_PLOT_6-80637.Pdf                
8255  /project/recall/prod/arc/norway_a/films/GULLFAKS/NO_34_10_C_41_A/WL_RAW_PROD_CAL-FLOW-CAS-GEOM_2015-10-20_PLOT_1-51219.PDF                  company data_type   log_name
7034  SCHLUMBERGER  RAW       AUTOMATIC
5901  SCHLUMBERGER  RAW       AUTOMATIC
9738  GEOSERVICES   RAW       AUTOMATIC
8948  SCHLUMBERGER  RAW       

Unnamed: 0,company,data_type,log_name
7034,SCHLUMBERGER,RAW,AUTOMATIC
5901,SCHLUMBERGER,RAW,AUTOMATIC
9738,GEOSERVICES,RAW,AUTOMATIC
8948,SCHLUMBERGER,RAW,AUTOMATIC
8255,SCHLUMBERGER,RAW,AUTOMATIC


### Process filename inputs (for both train and test data)

In [17]:
# define clearing regex
import re
REGEX_LAST_NUM = re.compile('([-|_]\d+\.)')
REGEX_FIRST_EXCLAMATION = re.compile('(^\d+!\d+[_|-]+)')
REGEX_OTHER_EXCLAMATION = re.compile('([_|-]+\d+!\d+[_|-]+)')
REGEX_STAT = re.compile(r'(?<!STAT)_', re.I)

In [18]:
#split filename into independent chunks
def file_path_formatter(string):
        # remove last number before extension
        # 6506!12-n-4 ah stat_cpi-117029.las -> 6506!12-n-4 ah stat_cpi.las
        string = REGEX_LAST_NUM.sub('.', string)

        # remove first number containing exclamation mark
        # 6506!12-n-4 ah stat_cpi.las -> n-4 ah stat_cpi.las
        string = REGEX_FIRST_EXCLAMATION.sub('', string)

        # remove other number containing exclamation mark
        # n-4 ah stat_cpi.las -> n 4 ah stat_cpi las
        string = REGEX_OTHER_EXCLAMATION.sub(' ', string)

        string = string.replace('-', ' ').replace('.', ' ').replace(',', ' ').replace('/', ' ').replace('_',' ')
        return string

#X_train['filename'] = X_train['filename'].apply(file_path_formatter)
#X_test['filename'] = X_test['filename'].apply(file_path_formatter)


### Extract features using CountVectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

feat_t = Pipeline([
    ('ft',CountVectorizer(ngram_range=(1, 3), 
                                    lowercase=True,
                                    preprocessor=file_path_formatter,
                                    min_df=int(2),
                                    analyzer='word')),
    ('tfid',TfidfTransformer())])
count_vectorizer = feat_t

In [20]:
# creates the document term matrix
count_train = count_vectorizer.fit_transform(X_train['filename'])
count_test = count_vectorizer.transform(X_test['filename'])

### Define ML model that currently is RandomForest as it performs the best

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
# Parameters for the model
max_depth=None
n_estimators=82
min_samples_split=10
max_features='sqrt'
rf_classifier = RandomForestClassifier(max_depth=None, 
                                       n_jobs=-1,
                                       n_estimators=n_estimators,
                                       min_samples_split=min_samples_split,
                                       max_features=max_features)
mor = MultiOutputRegressor(rf_classifier)

# Train ....

In [22]:
print(count_train.shape)
mor.fit(count_train, y_train)

(11832, 13135)


MultiOutputRegressor(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=82, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=None)

# Predict on test data

In [23]:
pred = mor.predict(count_test)

In [24]:
from sklearn.metrics import accuracy_score
# print('Test accuracy is {}'.format(accuracy_score(y_test, pred)))
pred

array([['GEOSERVICES', 'RAW', 'AUTOMATIC'],
       ['BAKER HUGHES', 'RAW', 'AUTOMATIC'],
       ['BAKER HUGHES', 'RAW', 'AUTOMATIC'],
       ...,
       ['BAKER HUGHES', 'RAW', 'AUTOMATIC'],
       ['SCHLUMBERGER', 'RAW', 'AUTOMATIC'],
       ['SCHLUMBERGER', 'RAW', 'AUTOMATIC']], dtype=object)

In [25]:
# y_test.value_counts()

# Show some results

In [26]:
test_set = pd.DataFrame(data={'filename':[
    '/project/recall/prod/arc/norway_a/tapes/GRANE/NO_25_11_G_31/WLC_RAW_BHPR-GR-MECH-REMP_MWD_1-69463.DLIS',
    '/project/recall/prod/arc/norway_a/tapes/GRANE/NO_25_11_G_25_AY2/WLC_COMPOSITE_1-52573.DLIS',
    '/project/recall/prod/arc/norway_a/tapes/GRANE/NO_25_11_G_22_BY1T2/WLC_PETROPHYSICAL_COMPOSITE_1-66518.DLIS',
    'WLC_RAW_BHPR-GR-MECH-REMP_MWD_1-69463.DLIS',
    'WLC_COMPOSITE_1-52573.DLIS',
    'WLC_PETROPHYSICAL_COMPOSITE_1-66518.DLIS',
            ]})

In [27]:
#test_set['filename'] = test_set['filename'].apply(file_path_formatter)
test_set = count_vectorizer.transform(test_set['filename'])
test_pred = mor.predict(test_set)
print(test_pred)

[['BAKER HUGHES' 'INTERPRETED' 'COMPOSITE']
 ['LOGTEK' 'INTERPRETED' 'COMPOSITE']
 ['LOGTEK' 'INTERPRETED' 'PETROPHYSICAL_COMPOSITE']
 ['SCHLUMBERGER' 'RAW' 'AUTOMATIC']
 ['SCHLUMBERGER' 'INTERPRETED' 'COMPOSITE']
 ['LOGTEK' 'INTERPRETED' 'PETROPHYSICAL_COMPOSITE']]


In [29]:
mor.estimator[-1][-1].classes

AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

In [30]:
 pd.options.display.max_colwidth = 300
print(test['filename'][:3])

11066    Gullfaks_Sor_NO_34_10-M-4_BY1HT3_GRL-70730.pdf
2111           WL_STATOIL_ALL_RAW_DATA_MWD_1-98927.DLIS
14510          WL_STATOIL_ALL_RAW_DATA_MWD_2-70969.DLIS
Name: filename, dtype: object


In [31]:
print(pred[:3])

[['GEOSERVICES' 'RAW' 'AUTOMATIC']
 ['BAKER HUGHES' 'RAW' 'AUTOMATIC']
 ['BAKER HUGHES' 'RAW' 'AUTOMATIC']]


In [32]:
print(test['data_type'][:3])

11066    RAW
2111     RAW
14510    RAW
Name: data_type, dtype: object
