# Simple model training

TODOs:
1. 

In [1]:
import os
import numpy as np
import pandas as pd

## Import data

In [2]:
# Absolute, symlink-resolved path of the parent of the current working directory;
# the data file paths below are joined onto it.
dir_path = os.path.realpath(os.pardir)

In [3]:
# Read the training CSV located under dir_path. The first row of the file
# provides the column names and the first column is used as the index.
# NOTE(review): this reads from data/raw/ while the test set in this notebook is
# read from data/processed/ — confirm the two are meant to differ.
path = 'data/raw/train.csv'
full_path = os.path.join(dir_path, path)

train_df = pd.read_csv(full_path, index_col=0, header=0)

# Report the size of what was loaded.
n_rows, n_cols = train_df.shape
print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))

Dataset has 95851 rows, 7 columns.


In [4]:
# Read the test CSV located under the parent of the working directory. The first
# row of the file provides the column names and the first column is the index.
path = 'data/processed/test.csv'
dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

test_df = pd.read_csv(full_path, index_col=0, header=0)

# Report the size of what was loaded.
n_rows, n_cols = test_df.shape
print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))

Dataset has 226998 rows, 1 columns.


## Train model

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score

In [7]:
# Names of the label columns selected from the training frame.
target = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# The comment text column is the model input for both sets; y_train holds the
# label columns listed above.
X_train, y_train = train_df['comment_text'], train_df[target]
X_test = test_df['comment_text']

In [8]:
%%time
transformer = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

X_train_dtm = transformer.fit_transform(X_train)
X_test_dtm = transformer.transform(X_test)

CPU times: user 28.8 s, sys: 642 ms, total: 29.5 s
Wall time: 29.6 s


In [None]:
# TODO: export the fitted model (not implemented yet)

## Predict

In [16]:
%%time
submission = pd.DataFrame(index=test_df.index, columns=target)

for label in target:
    print('... Processing {}'.format(label))
    y = y_train[label]
    
    # train the model using X_dtm & y
    clf = MultinomialNB()
    clf.fit(X_train_dtm, y)
    
    # compute the training accuracy
    y_pred = clf.predict(X_train_dtm)
    print('Training accuracy is {}'.format(accuracy_score(y, y_pred)))
    
    # compute the predicted probabilities for X_test_dtm
    y_pred_proba = clf.predict_proba(X_test_dtm)[:,1]
    submission[label] = y_pred_proba

... Processing toxic
Training accuracy is 0.9204703132987658
... Processing severe_toxic
Training accuracy is 0.9898488278682539
... Processing obscene
Training accuracy is 0.9518419213153749
... Processing threat
Training accuracy is 0.9967866793252027
... Processing insult
Training accuracy is 0.9522488028293915
... Processing identity_hate
Training accuracy is 0.991434622486985
CPU times: user 1.1 s, sys: 106 ms, total: 1.21 s
Wall time: 1.2 s


In [18]:
# Write the per-label predictions to a CSV under the parent of the working
# directory, including both the header row and the index column.
path = 'data/processed/submission.csv'
dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
submission.to_csv(full_path, index=True, header=True)