# Quato data

A `csv` file containing file name entries, folders and other metadata.

We start by loading this file:

In [None]:
import pandas as pd
import numpy as np
import os
# Load the metadata CSV into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('data/largeDoseOfTurbiniumForQuato_reduced.csv')

### Show the first few lines

In [None]:
# Preview the first rows of the raw data as loaded.
df.head()

### Features

file directory

### Labels

date_logged	log_name	contractor	data_type	log_service	log_activity	log_type	log_job	log_run	log_pass	casing_size_manual	section_size	station_number	station_depth	mfc_correction	interpretation_number	date_stamp

### Clean and rename columns for clarity's sake

In [None]:
# Maps the original column names (after whitespace stripping) to clearer ones.
COLUMN_MAPPING = {
    'directory': 'file_directory',
    'file': 'filename',
    'contractor': 'company',
    'log_service': 'service',
    'log_activity': 'activity',
}

# Column names have lots of surrounding whitespace; strip them (once) so they
# match the keys of COLUMN_MAPPING.
df.columns = [col.strip() for col in df.columns]

# Ensure every string cell is stripped of stray whitespace. Values without a
# ``strip`` method (e.g. numbers, NaN) are passed through unchanged.
# ``DataFrame.applymap`` was renamed to ``DataFrame.map`` in pandas 2.1, so
# use whichever element-wise method this pandas version provides.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = elementwise(lambda val: val.strip() if hasattr(val, 'strip') else val)

# Rename columns
df = df.rename(columns=COLUMN_MAPPING)

# Fill remaining NaNs with an empty string
df = df.fillna(value='')

In [None]:
# Preview the first rows again, now with cleaned values and renamed columns.
df.head()

### Test and train data split

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in the train and test sets on every run;
# without it the split is reshuffled each time the cell is executed.
SPLIT_RANDOM_STATE = 42

# Hold out 20% of the rows for testing.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=SPLIT_RANDOM_STATE,
)

### Join filename and file directory into a single string

In [None]:
# Build one feature frame per split. Each holds a single 'filename' column
# containing the row's directory and file name joined into one path string.
X_train, X_test = (
    pd.DataFrame({
        'filename': subset.apply(
            lambda row: os.path.join(row['file_directory'], row['filename']),
            axis=1,
        ),
    })
    for subset in (train, test)
)

### Get company as label

In [None]:
# Labels for each split: the 'company' column, cast to str.
y_train, y_test = (subset['company'].astype(str) for subset in (train, test))

In [None]:
# Sanity check: first rows of the training features and of the training labels.
print(X_train.head(), y_train.head())

### Process filename inputs (for both train and test data)

In [None]:
#split filename into independent chunks
def file_path_formatter(string):
    """Return *string* with each ``_``, ``-``, ``.``, ``,`` and ``/`` replaced by a space.

    Every occurrence is replaced one-for-one, so consecutive separators yield
    consecutive spaces. All other characters are left untouched.
    """
    separators_to_space = str.maketrans(dict.fromkeys('_-.,/', ' '))
    return string.translate(separators_to_space)

# Apply the same separator clean-up to both splits so train and test paths
# are formatted identically.
X_train['filename'] = X_train['filename'].apply(file_path_formatter)
X_test['filename'] = X_test['filename'].apply(file_path_formatter)

# Preview the formatted training paths. (This previously referenced an
# undefined name ``X``, which raised NameError.)
X_train.head()

### Extract features using CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level count features over unigrams and bigrams, lower-casing the text.
count_vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    ngram_range=(1, 2),
)

In [None]:
# Create the document-term matrices. The vectorizer is fitted on the training
# paths only; the test paths are then encoded with that already-fitted
# vectorizer, so this order must be kept.
count_train = count_vectorizer.fit_transform(X_train['filename'])
count_test = count_vectorizer.transform(X_test['filename'])

### Define the ML model, currently RandomForest as it performs the best

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Parameters for the model
max_depth = None
n_estimators = 82
min_samples_split = 10
max_features = 'sqrt'

# Every tunable above is passed through by name. ``max_depth`` used to be
# hard-coded to None in the call, so changing the variable had no effect.
rf_classifier = RandomForestClassifier(
    max_depth=max_depth,
    n_jobs=-1,
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_features=max_features,
)

# Train ....

In [None]:
# Show the shape of the training matrix, then fit the classifier on the
# training features and labels.
print(count_train.shape)
rf_classifier.fit(count_train, y_train)

# Predict on test data

In [None]:
# Predicted labels for the held-out test matrix.
pred = rf_classifier.predict(count_test)

# Show some results

In [None]:
 pd.options.display.max_colwidth = 300
print(test['filename'][:3])

In [None]:
# Predictions for the first three test rows.
print(pred[:3])

In [None]:
# 'company' values of the first three test rows, for comparison with the
# predictions.
print(test['company'][:3])