# Quato data

`csv file` containing filename entries, folders and other metadata

We start by loading this file:

In [2]:
import pandas as pd
import numpy as np
import os
df = pd.read_csv('data/largeDoseOfTurbiniumForQuato_reduced.csv')

### show first few lines

In [3]:
df.head()

Unnamed: 0,file,directory,date_logged,log_name,contractor,data_type,log_service,log_activity,log_type,log_job,log_run,log_pass,casing_size_manual,section_size,station_number,station_depth,mfc_correction,interpretation_number,date_stamp
0,CONTENT-MUD_LOG_1.DLIS-112806.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:42:19
1,MUD_LOG_1_INF_1-112807.ASC,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/18,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:43:09
2,CONTENT-MUD_LOG_1.DLIS-112797.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:31:48
3,MUD_LOG_1_INF_1-112798.ASC,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/08/17,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:14:32:29
4,CONTENT-MUD_LOG_1.DLIS-112715.TXT,/project/recall/prod/arc/norway_a/files/GRANE/...,2014/03/31,AUTOMATIC,BAKER HUGHES,RAW,,,FINAL,,,,,All sections,,,,,2018/04/03:22:39:44


### Features

file directory

### Labels

date_logged	log_name	contractor	data_type	log_service	log_activity	log_type	log_job	log_run	log_pass	casing_size_manual	section_size	station_number	station_depth	mfc_correction	interpretation_number	date_stamp

### Clean and rename columns for clarity sake

In [4]:
COLUMN_MAPPING = {
        'directory': 'file_directory',
        'file': 'filename',
        'contractor': 'company',
        'log_service': 'service',
        'log_activity': 'activity',
    }
df.columns = [col.strip() for col in df.columns]
 # column names have lots of surrounding whitepaces
df.columns = [col.strip() for col in df.columns]

# Ensure every string is stripped of random whitepaces.
df = df.applymap(lambda val: val.strip() if hasattr(val, 'strip') else val)

# Rename columns
df = df.rename(columns=COLUMN_MAPPING)

# Fill remaining NaNs with empty target
df = df.fillna(value='')

### Test and train data split

In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

### Join filename and filedirectory into a single string

In [6]:
#a new dataframe with just the feature data
X_train = pd.DataFrame()
X_test = pd.DataFrame()
X_train['filename'] = train.apply(lambda line: os.path.join(line['file_directory'], line['filename']), axis=1)
X_test['filename'] = test.apply(lambda line: os.path.join(line['file_directory'], line['filename']), axis=1)

### Get company as label

In [15]:
y_train =train['company'].astype(str)
y_test = test['company'].astype(str)

In [16]:
pd.options.display.max_colwidth = 300
X_train.head().values

array([['/project/recall/prod/arc/norway_a/films/VISUND/NO_34_8_A_16_H/Basic_Channel_Description-50341.pdf'],
       ['/project/recall/prod/arc/norway_a/tapes/GULLFAKS/NO_34_10_A_42/Statoil_GFA_34_10-A-42_2-7_8in_HSD_Run_2_Correlation_Pass_ConCu_R4L7Up-31833.las'],
       ['/project/recall/prod/arc/norway_a/tapes/GRANE/NO_25_11_G_6_AY1T3/WL_STATOIL_ALL_RAW_DATA_MWD_6-74449.DLIS'],
       ['/project/recall/prod/arc/norway_a/files/GULLFAKS_S_R/NO_34_10_M_4_BY1HT2/MUD_LOG-70677.TXT'],
       ['/project/recall/prod/arc/norway_a/films/GRANE/NO_25_11_G_22_A/WL_RAW_GR-REMP_MD_MWD_PLOT_3-66214.PDF']],
      dtype=object)

In [17]:
y_test.head().values

array(['STATOIL', 'SCHLUMBERGER', 'BAKER HUGHES', 'SCHLUMBERGER',
       'SCHLUMBERGER'], dtype=object)

### Process filename inputs (for both train and test data)

In [18]:
#split filename into independent chunks
def file_path_formatter(string):
        string = string.replace('_', ' ').replace('-', ' ').replace('.', ' ').replace(',', ' ').replace('/', ' ')
        return string

X_train['filename'] = X_train['filename'].apply(file_path_formatter)
X_test['filename'] = X_test['filename'].apply(file_path_formatter)

In [19]:
X_train.head().values

array([[' project recall prod arc norway a films VISUND NO 34 8 A 16 H Basic Channel Description 50341 pdf'],
       [' project recall prod arc norway a tapes GULLFAKS NO 34 10 A 42 Statoil GFA 34 10 A 42 2 7 8in HSD Run 2 Correlation Pass ConCu R4L7Up 31833 las'],
       [' project recall prod arc norway a tapes GRANE NO 25 11 G 6 AY1T3 WL STATOIL ALL RAW DATA MWD 6 74449 DLIS'],
       [' project recall prod arc norway a files GULLFAKS S R NO 34 10 M 4 BY1HT2 MUD LOG 70677 TXT'],
       [' project recall prod arc norway a films GRANE NO 25 11 G 22 A WL RAW GR REMP MD MWD PLOT 3 66214 PDF']],
      dtype=object)

### Extract features using CountVectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1, 2), 
                                    lowercase=True,
                                    analyzer='word')

In [25]:
# creates the document term matrix
count_train = count_vectorizer.fit_transform(X_train['filename'])
# print(count_vectorizer.get_feature_names())
count_test = count_vectorizer.transform(X_test['filename'])

### Define ML model that currently is RandomForest as it perform the best

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Parameters for the model
max_depth=None
n_estimators=82
min_samples_split=10
max_features='sqrt'
rf_classifier = RandomForestClassifier(max_depth=None, 
                                       n_jobs=-1,
                                       n_estimators=n_estimators,
                                       min_samples_split=min_samples_split,
                                       max_features=max_features)

# Train ....

In [14]:
print(count_train.shape)
rf_classifier.fit(count_train, y_train)

(11832, 44289)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=82, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Predict on test data

In [15]:
pred = rf_classifier.predict(count_test)

# Show some results

In [16]:
print(test['filename'][:3])

9645                   WL_RAW_GR-REMP_MWD_MD_PLOT1-33201.pdf
6394    WL_RAW_CALI-DEN-GR-NEU-REMP_TVD_MWD_PLOT_1-54266.PDF
5754                            WL_RAW_EIMG_MWD_1-99420.DLIS
Name: filename, dtype: object


In [17]:
print(pred[:3])

['SCHLUMBERGER' 'SCHLUMBERGER' 'BAKER HUGHES']


In [18]:
print(test['company'][:3])

9645    SCHLUMBERGER
6394    SCHLUMBERGER
5754    BAKER HUGHES
Name: company, dtype: object
