# Quato data

The data is a `csv file` containing filename entries, their folders and other metadata.

We start by loading this file:

In [1]:
import pandas as pd
import numpy as np
import os
# Load the file-metadata table into a DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like a
# saved row index — consider read_csv(..., index_col=0) if so; confirm against the CSV.
df = pd.read_csv('data/largeDoseOfTurbiniumForQuato_reduced.csv')

### show first few lines

In [2]:
# Preview the first rows of the table exactly as loaded (before any cleaning).
df.head()

Unnamed: 0,file,directory,date_logged,log_name,contractor,data_type,log_service,log_activity,log_type,log_job,log_run,log_pass,casing_size_manual,section_size,station_number,station_depth,mfc_correction,interpretation_number,date_stamp
0,CONTENT-MUD_LOG_1.DLIS-112806.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:42:19
1,MUD_LOG_1_INF_1-112807.ASC,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:43:09
2,CONTENT-MUD_LOG_1.DLIS-112797.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:31:48
3,MUD_LOG_1_INF_1-112798.ASC,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:32:29
4,CONTENT-MUD_LOG_1.DLIS-112715.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/03/31,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:22:39:44


### Features

file directory

### Labels

date_logged	log_name	contractor	data_type	log_service	log_activity	log_type	log_job	log_run	log_pass	casing_size_manual	section_size	station_number	station_depth	mfc_correction	interpretation_number	date_stamp

### Clean and rename columns for clarity

In [3]:
# Original column name -> clearer name used by the rest of the notebook.
COLUMN_MAPPING = {
    'directory': 'file_directory',
    'file': 'filename',
    'contractor': 'company',
    'log_service': 'service',
    'log_activity': 'activity',
}


def _strip_if_string(val):
    """Return `val` without surrounding whitespace if it has `.strip()`; otherwise return it unchanged."""
    return val.strip() if hasattr(val, 'strip') else val


# Column names may carry surrounding whitespace; strip them (once is enough).
df.columns = [col.strip() for col in df.columns]

# Ensure every string cell is stripped of stray whitespace.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map, so use
# whichever element-wise method the installed pandas provides. The check is done on the
# class so that a column that happens to be called 'map' cannot shadow the method.
_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = _elementwise(_strip_if_string)

# Rename columns
df = df.rename(columns=COLUMN_MAPPING)

# Fill remaining NaNs with an empty string
df = df.fillna(value='')

In [4]:
# Preview the cleaned table: columns renamed via COLUMN_MAPPING, NaNs replaced by ''.
df.head()

Unnamed: 0,filename,file_directory,date_logged,log_name,company,data_type,service,activity,log_type,log_job,log_run,log_pass,casing_size_manual,section_size,station_number,station_depth,mfc_correction,interpretation_number,date_stamp
0,CONTENT-MUD_LOG_1.DLIS-112806.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:42:19
1,MUD_LOG_1_INF_1-112807.ASC,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:43:09
2,CONTENT-MUD_LOG_1.DLIS-112797.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:31:48
3,MUD_LOG_1_INF_1-112798.ASC,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:32:29
4,CONTENT-MUD_LOG_1.DLIS-112715.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/03/31,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:22:39:44


### Test and train data split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split — and every
# result computed from it — reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)

### Join file directory and filename into a single path string

In [6]:
def _full_path(row):
    """Join one row's 'file_directory' and 'filename' into a single path string."""
    return os.path.join(row['file_directory'], row['filename'])


# New dataframes holding only the feature data: one 'filename' column with the joined path.
X_train = pd.DataFrame()
X_test = pd.DataFrame()
X_train['filename'] = train.apply(_full_path, axis=1)
X_test['filename'] = test.apply(_full_path, axis=1)

### Get company and data type as labels

In [7]:
# Target columns; each split keeps only these columns, cast to str.
labels = ['company', 'data_type']
y_train, y_test = (split[labels].astype(str) for split in (train, test))

In [8]:
# Show the first feature rows and the matching labels. An explicit separator keeps the
# second frame's header from being appended to the last line of the first frame.
print(X_train.head(), y_train.head(), sep='\n\n')

                                                filename
11083  /project/recall/prod/arc/norway_a/films/GULLFA...
10699  /project/recall/prod/arc/norway_a/tapes/GULLFA...
490    /project/recall/prod/arc/norway_a/films/GRANE/...
13027  /project/recall/prod/arc/norway_a/films/STATFJ...
11183  /project/recall/prod/arc/norway_a/tapes/GULLFA...             company data_type
11083  SCHLUMBERGER       RAW
10699  SCHLUMBERGER       RAW
490    BAKER HUGHES       RAW
13027   GEOSERVICES       RAW
11183  SCHLUMBERGER       RAW


### Process filename inputs (for both train and test data)

In [9]:
def file_path_formatter(string):
    """Split a file path into independent chunks.

    Every '_', '-', '.', ',' and '/' in `string` is replaced by a single space.
    The resulting string is returned; no other characters are changed.
    """
    for separator in ('_', '-', '.', ',', '/'):
        string = string.replace(separator, ' ')
    return string

### Extract features using CountVectorizer and TfidfTransformer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
def _format_and_lowercase(path):
    """Apply `file_path_formatter` to `path` and lowercase the result."""
    return file_path_formatter(path).lower()


# CountVectorizer ignores its `lowercase` option whenever a custom `preprocessor` is
# supplied (the preprocessor replaces the built-in strip-accents/lowercase stage), so
# lowercasing has to happen inside the preprocessor itself.
steps = [
    # Word unigrams and bigrams counted over the separator-split, lowercased path.
    ('cv', CountVectorizer(ngram_range=(1, 2),
                           preprocessor=_format_and_lowercase,
                           analyzer='word')),
    # Re-weight the raw counts with TF-IDF.
    ('tfidf', TfidfTransformer())
]
feature = Pipeline(steps)

In [11]:
# Fit the feature pipeline (vocabulary and TF-IDF weights) on the training paths and build
# the training document-term matrix; the test paths are only transformed with the
# already-fitted pipeline, so nothing is learned from the test data.
count_train = feature.fit_transform(X_train['filename'])
count_test = feature.transform(X_test['filename'])

### Define the ML model: currently a RandomForest, as it performs the best

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
# Parameters for the model
max_depth = None
n_estimators = 82
min_samples_split = 10
max_features = 'sqrt'
# Every tunable above is passed by name (max_depth was previously hard-coded in the call,
# so editing the variable had no effect). A fixed random_state makes training repeatable.
rf_classifier = RandomForestClassifier(max_depth=max_depth,
                                       n_jobs=-1,
                                       n_estimators=n_estimators,
                                       min_samples_split=min_samples_split,
                                       max_features=max_features,
                                       random_state=42)
# Wrap the forest so that one classifier is fitted per label column.
moc = MultiOutputClassifier(rf_classifier)

# Train ....

In [None]:
# Print the shape of the training feature matrix, then fit the multi-output
# classifier on it against the label columns in y_train.
print(count_train.shape)
moc.fit(count_train, y_train)

# Predict on test data

In [None]:
# pred: predicted labels for the test matrix.
# preda: predicted class probabilities for the test matrix; it is indexed per label column
# further down (preda[0] is used for the first one).
pred = moc.predict(count_test)
preda = moc.predict_proba(count_test)

# Show some results

In [None]:
# Print the class values learned by each fitted estimator in moc.estimators_.
for estimator in moc.estimators_:
    print(estimator.classes_)

In [None]:
 pd.options.display.max_colwidth = 300
print(test['filename'][:3])
print(y_train['company'].value_counts())

In [None]:
# Predicted labels for the first three test rows.
print(pred[:3])
# Index of the highest-probability class for the first three rows, taken from preda[0].
print(preda[0][:3].argmax(axis=1))
# Sorted unique company values in the training labels, for comparison with the indices above.
print(sorted(y_train['company'].unique()))
# Names of the label columns, in the order they were passed to fit.
for cn in y_train.columns:
    print(cn)

In [None]:
# Actual company values of the first three test rows, to compare with the predictions.
print(test['company'][:3])