In [1]:
!pip install python-pachyderm==6.1.0 -q
!pip install scikit-learn==0.24.2 -q
!pip install pandas==1.2.4 -q
!pip install psycopg2-binary~=2.7.7 -q
!pip install nltk~=3.6.2 -q

In [2]:
import logging
import python_pachyderm
import torch
from torch.utils.data import Dataset
import psycopg2
from psycopg2 import sql
from urllib.parse import urlparse
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
def connect_pachy(host='10.64.140.44', port=650):
    """Create a Pachyderm client.

    Parameters
    ----------
    host : str
        Address of the Pachyderm daemon. Defaults to the cluster address the
        notebook was written against.
    port : int
        Port of the Pachyderm daemon.

    Returns
    -------
    python_pachyderm.Client or None
        The client, or ``None`` when the client could not be created (the
        error is printed, matching the original best-effort behaviour).
    """
    try:
        connection = python_pachyderm.Client(host=host, port=port)
    except Exception as error:
        # Report the failure and return None; do not claim success below.
        print(error)
        return None

    # NOTE(review): this only shows that the client object was constructed;
    # whether the constructor actually contacts the server is not visible
    # here - confirm before relying on this message as a health check.
    print("Pachyderm connection successful")
    return connection


def connect_postgres():
    """Open a connection to the ``bbc-news`` PostgreSQL database.

    Connection settings can be overridden through the standard libpq
    environment variables (``PGUSER``, ``PGPASSWORD``, ``PGHOST``, ``PGPORT``,
    ``PGDATABASE``); the previous hard-coded values remain as fallbacks so
    existing setups keep working.

    Returns
    -------
    psycopg2 connection or None
        The open connection, or ``None`` when connecting failed (the error is
        printed, matching the original best-effort behaviour).
    """
    import os  # local import: the notebook's import cell does not import os

    try:
        connection = psycopg2.connect(user=os.environ.get("PGUSER", "postgres"),
                                      password=os.environ.get("PGPASSWORD", "postgres"),
                                      host=os.environ.get("PGHOST", "172.23.76.93"),
                                      port=os.environ.get("PGPORT", "5432"),
                                      database=os.environ.get("PGDATABASE", "bbc-news"))
    except Exception as error:
        # Report the failure and return None; do not claim success below.
        print(error)
        return None

    print("Database connection successful")
    return connection

In [4]:
class NewsDataset(Dataset):
    """Bag-of-words dataset of BBC news articles.

    Labels and file locations are read from PostgreSQL (``bbc_train`` /
    ``bbc_test`` tables); the article text itself is fetched from Pachyderm.
    Texts are vectorized with a stemming ``CountVectorizer`` fitted on the
    local ``bbc.vocab`` file, and labels are encoded with a ``LabelEncoder``
    fitted on the local ``bbc.classes`` file.

    Parameters
    ----------
    mode : str
        One of ``'train'``, ``'validate'`` or ``'test'``. ``'train'`` and
        ``'validate'`` are the two sides of a fixed 80/20 split of the
        training table.
    label_source_bucket : str
        Bucket name; combined with ``branch`` as ``f'{branch}.{bucket}'`` to
        form the value matched against the ``branch`` column.
    branch : str
        Branch name, see ``label_source_bucket``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """

    vectorizer = None
    label_encoder = None

    train_query = sql.SQL("SELECT label_id, label, filepath FROM bbc_train WHERE branch = %s")
    test_query = sql.SQL("SELECT label_id, label, filepath FROM bbc_test WHERE branch = %s")

    VALID_MODES = ('train', 'validate', 'test')

    # train branch repo, test branch repo
    def __init__(self, mode: str = 'train', label_source_bucket: str = 'bbc-train-validate', branch: str = "master"):
        # Fail fast: previously an unknown mode only printed a message and
        # then crashed with AttributeError on the missing data_frame.
        if mode not in self.VALID_MODES:
            raise ValueError(f"invalid mode {mode!r}; expected one of {self.VALID_MODES}")

        self.bucket = f'{branch}.{label_source_bucket}'

        # define the labeling encoding
        self.le = self._fit_label_encoder()
        # keep the declared class-level attribute in sync with ``le``
        self.label_encoder = self.le

        # define the text vectorizer
        self.vectorizer = self._fit_vectorizer()

        if mode == 'test':
            self.data_frame, self.labels = self.load_test_dataset()
        else:
            train_x, val_x, train_y, val_y = self.load_train_dataset()
            if mode == 'train':
                self.data_frame, self.labels = train_x, train_y
            else:  # 'validate'
                self.data_frame, self.labels = val_x, val_y

        print("Dataset generated")
        print(f'Dataset size in mode {mode}: {self.data_frame.shape}, labels {self.labels.shape}')

    @staticmethod
    def _fit_label_encoder():
        """Fit a LabelEncoder on the first column of the local ``bbc.classes`` file."""
        bbc_classes = pd.read_csv("bbc.classes", header=None)
        class_names = bbc_classes.iloc[:, 0].values.astype('U')
        print(class_names)
        encoder = LabelEncoder()
        encoder.fit(class_names)
        return encoder

    @staticmethod
    def _fit_vectorizer():
        """Fit a stemming CountVectorizer on the first column of the local ``bbc.vocab`` file."""
        voc = pd.read_csv("bbc.vocab", header=None)
        stemmer = PorterStemmer()
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words(doc):
            # tokenize with the default analyzer, then stem every token
            return [stemmer.stem(w) for w in analyzer(doc)]

        stem_vectorizer = CountVectorizer(analyzer=stemmed_words)
        stem_vectorizer.fit(voc.iloc[:, 0].values.astype('U'))
        print(f'Fitted vectorizer with feature length {len(stem_vectorizer.get_feature_names())}')
        return stem_vectorizer

    def load_train_dataset(self):
        """Load the training table and split it 80/20 into train and validation tensors.

        Returns
        -------
        tuple
            ``(train_x, val_x, train_y, val_y)`` where the ``x`` entries are
            float tensors of vectorized text and the ``y`` entries are tensors
            of encoded labels.
        """
        print("Loading training data and labels..")
        train_df = self.get_data_set(self.train_query, self.bucket)
        print("Data loaded")
        print("Splitting data")
        # fixed random_state keeps the train/validate split reproducible
        train_x_df, val_x_df, train_y_df, val_y_df = train_test_split(train_df['text'], train_df['label'],
                                                                      test_size=0.2, random_state=42)

        train_y = torch.tensor(self.le.transform(train_y_df))
        val_y = torch.tensor(self.le.transform(val_y_df))

        train_x = torch.tensor(self.vectorizer.transform(train_x_df).toarray()).float()
        val_x = torch.tensor(self.vectorizer.transform(val_x_df).toarray()).float()

        return train_x, val_x, train_y, val_y

    def load_test_dataset(self):
        """Load the test table and return ``(test_x, test_y)`` tensors."""
        print("Loading test data and labels..")
        test_df = self.get_data_set(self.test_query, self.bucket)
        print("Data loaded")

        test_y = torch.tensor(self.le.transform(test_df['label']))
        test_x = torch.tensor(self.vectorizer.transform(test_df['text']).toarray()).float()

        return test_x, test_y

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.data_frame[idx], self.labels[idx]

    def get_data_set(self, query, bucket):
        """Run ``query`` for ``bucket`` and attach the article text to every row.

        Parameters
        ----------
        query : psycopg2.sql.SQL
            Query with a single ``%s`` placeholder for the branch value.
        bucket : str
            Value bound to the query placeholder.

        Returns
        -------
        pandas.DataFrame
            Columns ``label_id``, ``label``, ``filepath`` and ``text``. Text
            loading is best-effort: on the first file that fails the error is
            printed and the remaining rows keep an empty ``text``.

        Raises
        ------
        RuntimeError
            If the PostgreSQL connection could not be established.
        """
        pg = connect_postgres()
        if pg is None:
            raise RuntimeError("Could not connect to PostgreSQL; cannot load dataset")

        try:
            pc = connect_pachy()
            # Let a failing query propagate its own error; the cursor and the
            # connection are always released.
            with pg.cursor() as cursor:
                cursor.execute(query, (bucket, ))
                tupples = cursor.fetchall()
        finally:
            pg.close()

        column_names = ["label_id", "label", "filepath"]
        df = pd.DataFrame(tupples, columns=column_names)
        df['text'] = ''

        total = len(df.index)
        loaded = 0
        try:
            for index in df.index:
                # filepath looks like <scheme>://<branch>.<repo>/<file>
                parsed = urlparse(df.at[index, 'filepath'])
                file_branch, file_repo = parsed.netloc.split(".")
                _, file = parsed.path.split("/")
                train_source = pc.get_file(f'{file_repo}/{file_branch}', file)
                df.at[index, 'text'] = train_source.read().decode(errors='replace')
                loaded += 1
                print(f'{index}/{total}', end='\r')

        except Exception as error:
            print(error)
            # make the partial load visible instead of silently returning empty texts
            print(f'Warning: loaded text for {loaded}/{total} files; remaining rows have empty text')
        return df

    def get_text_vectorizer(self):
        """Return the fitted text vectorizer."""
        return self.vectorizer

    def get_label_encoder(self):
        """Return the fitted label encoder."""
        return self.le

In [8]:
ds = NewsDataset(mode='train', label_source_bucket='bbc-train-validate', branch='master')

['business' 'entertainment' 'politics' 'sport' 'tech']
Fitted vectorizer with feature length 9474
Loading training data and labels..
Database connection successful
Pachyderm connection successful
Data loaded
Splitting data
Dataset generated
Dataset size in mode train: torch.Size([1424, 9474]), labels torch.Size([1424])


In [None]:
torch.set_printoptions(edgeitems=2000)
it = ds.__getitem__(8)
print(it[0])

In [5]:
!det model list

 Name                  | Creation Time               | Last Updated Time           | Metadata
-----------------------+-----------------------------+-----------------------------+--------------------------------------------
 digit-classifier      | 2021-04-30T14:06:09.274685Z | 2021-04-30T14:06:09.274685Z | {}
 fashion-classifier    | 2021-05-01T08:23:07.977009Z | 2021-05-01T12:49:21.861159Z | {
                       |                             |                             |   "environment": "production"
                       |                             |                             | }
 cifar10-torch         | 2021-05-05T13:46:12.021633Z | 2021-05-05T13:46:12.021633Z | {}
 fashion-classifier-tf | 2021-05-01T13:15:33.106873Z | 2021-05-12T16:50:03.683927Z | {
                       |                             |                             |   "metrics": {
                       |                             |                             |     "test_accuracy": "0.8699",
         

In [6]:
!det model describe news-classifier

 Name            | Description   | Creation Time               | Last Updated Time           | Metadata
-----------------+---------------+-----------------------------+-----------------------------+------------------------------------------------
 news-classifier |               | 2021-05-14T11:35:48.931291Z | 2021-05-22T06:46:37.807147Z | {
                 |               |                             |                             |     "metrics": {
                 |               |                             |                             |         "test_accuracy": "0.9617977528089887",
                 |               |                             |                             |         "test_loss": 5
                 |               |                             |                             |     }
                 |               |                             |                             | }


   Version # |   Trial ID |   Batch # | Checkpoint UUID                      | Valid

In [15]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
from torch.utils.data import DataLoader, Dataset
from determined.experimental import Determined

class TestModelAccuracy():
    """Evaluate the latest registered ``news-classifier`` version on the test set.

    The constructor loads the latest model version from the Determined master;
    ``test_model_accuracy`` computes accuracy on the test dataset and writes it
    to the model and checkpoint metadata.
    """

    DET_MASTER = "http://10.64.140.43:8080"
    MODEL_NAME = "news-classifier"
    ACCURACY_THRESHOLD = 0.6

    def __init__(self):
        """Connect to the Determined master and load the latest model checkpoint."""
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        logging.info("Testing model %s with for accuracy", self.MODEL_NAME)
        logging.info("Testing accuracy threshold at %f", self.ACCURACY_THRESHOLD)

        # get latest version from registered model
        self.det = Determined(master=self.DET_MASTER)
        self.model_latest = self.det.get_model(self.MODEL_NAME).get_version()
        self.trial = self.det.get_trial(self.model_latest.trial_id)
        self.checkpoint = self.trial.select_checkpoint(uuid=self.model_latest.uuid)
        self.model = self.checkpoint.load(map_location=self.device)

        logging.info("Connected to Determined master at %s", self.DET_MASTER)
        logging.info("Loaded model %s version %s", self.model_latest.model_name,
                     self.model_latest.model_version)
        logging.info("Corresponding experiment %s", self.model_latest.experiment_id)
        logging.info("Loaded model from checkpoint")

    def test_model_accuracy(self):
        """Compute test accuracy and record it in the model/checkpoint metadata.

        Raises
        ------
        ValueError
            If the test dataset is empty, since accuracy is undefined then.
        """
        # load testing data from pachyderm
        test_ds = NewsDataset(mode='test', label_source_bucket='bbc-test-validate', branch='master')
        testloader = DataLoader(test_ds)

        correct = 0
        total = 0
        # since we're not training, we don't need to calculate the gradients for our outputs
        with torch.no_grad():
            for texts, labels in testloader:
                # the model was loaded onto self.device, so the batch must live there too
                texts = texts.to(self.device)
                labels = labels.to(self.device)
                # calculate outputs by running the vectorized texts through the network
                outputs = self.model.model(texts)
                # the class with the highest score is what we choose as prediction
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                # count samples, not batches, so the result holds for any batch size
                total += labels.size(0)

        if total == 0:
            raise ValueError("Test dataset is empty; cannot compute accuracy")

        test_acc = correct / total
        # NOTE: "test_loss" is a fixed placeholder value; no loss is computed here.
        metrics = {"metrics": {"test_accuracy": str(test_acc),
                               "test_loss": 5}}

        # write the test results to model metadata
        self.det.get_model(self.MODEL_NAME).add_metadata(metrics)

        # also write the test descriptions to the model checkpoint
        self.checkpoint.add_metadata(metrics)

        logging.info('Accuracy of the network on the %d test articles: %.2f', total, test_acc)
        logging.info('Accuracy threshold set at %.2f', self.ACCURACY_THRESHOLD)

        # The threshold is reported but not enforced (no assertion is raised);
        # make a miss visible in the log.
        if not test_acc > self.ACCURACY_THRESHOLD:
            logging.warning("Model accuracy lower than threshold")

        #self.assertTrue(test_acc > self.ACCURACY_THRESHOLD, "Model accuracy lower than threshold")


In [16]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
ta = TestModelAccuracy()

INFO:root:Testing model news-classifier with for accuracy
INFO:root:Testing accuracy threshold at 0.600000
INFO:root:Loading Trial implementation with entrypoint model_def:BBCTrial.
INFO:root:Connected to Determined master at http://10.64.140.43:8080
INFO:root:Loaded model news-classifier version 3
INFO:root:Corresponding experiment 98
INFO:root:Loaded model from checkpoint


In [17]:
ta.test_model_accuracy()

['business' 'entertainment' 'politics' 'sport' 'tech']
Fitted vectorizer with feature length 9474
Loading test data and labels..
Database connection successful
Pachyderm connection successful
Data loaded
Dataset generated
Dataset size in mode test: torch.Size([445, 9474]), labels torch.Size([445])


INFO:root:Accuracy of the network on the 10000 test images: 0.97
INFO:root:Accuracy threshold set at 0.60


In [18]:
!det model describe 'news-classifier'

 Name            | Description   | Creation Time               | Last Updated Time           | Metadata
-----------------+---------------+-----------------------------+-----------------------------+------------------------------------------------
 news-classifier |               | 2021-05-14T11:35:48.931291Z | 2021-05-22T13:26:59.168328Z | {
                 |               |                             |                             |     "metrics": {
                 |               |                             |                             |         "test_accuracy": "0.9730337078651685",
                 |               |                             |                             |         "test_loss": 5
                 |               |                             |                             |     }
                 |               |                             |                             | }


   Version # |   Trial ID |   Batch # | Checkpoint UUID                      | Valid

In [42]:
from determined.experimental import Determined
import logging
import torch
import numpy as np
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
import os
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

class NewsModel(object):
    """
    Serving wrapper around the registered news classifier.

    Loads the latest version of a model from the Determined model registry,
    together with the vocabulary and class files stored in the checkpoint's
    ``code`` directory, and classifies raw article text.
    """

    def __init__(self, det_master="http://10.64.140.43:8080/", model_name="news-classifier"):
        """
        Load the model and fit the label encoder and text vectorizer.

        Parameters
        ----------
        det_master : str
            URL of the Determined master.
        model_name : str
            Name of the registered model whose latest version is loaded.

        These parameters can be passed at runtime from the graph definition
        parameters defined in the seldondeployment kubernetes resource manifest.
        """

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.det = Determined(master=det_master)
        self.model_latest = self.det.get_model(model_name).get_version()
        self.trial = self.det.get_trial(self.model_latest.trial_id)
        self.checkpoint = self.trial.select_checkpoint(uuid=self.model_latest.uuid)
        self.model = self.checkpoint.load(map_location=self.device)

        logging.info("Connected to Determined master at %s", det_master)
        logging.info("Loaded model %s version %s", self.model_latest.model_name,
                     self.model_latest.model_version)
        logging.info("Corresponding experiment %s", self.model_latest.experiment_id)
        logging.info("Loaded model from checkpoint")

        # Download the checkpoint and use the path it was written to; fall
        # back to the conventional location if no path is returned.
        uuid = self.checkpoint.uuid
        checkpoint_dir = self.checkpoint.download() or f'checkpoints/{uuid}'
        code_dir = os.path.join(checkpoint_dir, 'code')

        # Vocabulary and classes both come from the checkpoint so that they
        # match the loaded model (the classes were previously overwritten by
        # a bbc.classes file from the working directory).
        voc = pd.read_csv(os.path.join(code_dir, 'bbc.vocab'), header=None)
        bbc_classes = pd.read_csv(os.path.join(code_dir, 'bbc.classes'), header=None)

        le = LabelEncoder()
        logging.info(bbc_classes.iloc[:, 0].values.astype('U'))
        le.fit(bbc_classes.iloc[:, 0].values.astype('U'))
        self.le = le

        stemmer = PorterStemmer()
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words(doc):
            # tokenize with the default analyzer, then stem every token
            return [stemmer.stem(w) for w in analyzer(doc)]

        stem_vectorizer = CountVectorizer(analyzer=stemmed_words)
        stem_vectorizer.fit(voc.iloc[:, 0].values.astype('U'))
        print(f'Fitted vectorizer with feature length {len(stem_vectorizer.get_feature_names())}')
        self.vectorizer = stem_vectorizer

    def predict(self, X, features_names=None):
        """
        Return the predicted news class for each input text.

        Parameters
        ----------
        X : str or iterable of str
            Raw article text(s). A single string is treated as one document.
        features_names : optional
            Unused; accepted for compatibility with the serving interface.

        Returns
        -------
        numpy.ndarray
            Predicted class label for every input document, in input order.
        """

        logging.info("Predict called")

        # the vectorizer expects an iterable of documents, not a bare string
        if isinstance(X, str):
            X = [X]

        features = torch.tensor(self.vectorizer.transform(X).toarray()).float()
        # the model was loaded onto self.device, so the input must live there too
        features = features.to(self.device)

        with torch.no_grad():
            pred = self.model.model(features)

        # assumes the model returns one row of class scores per document
        label_ids = pred.argmax(dim=1).cpu().numpy()
        labels = self.le.inverse_transform(label_ids)
        for label in labels:
            logging.info("Predicted news class: %s", label)

        return labels

In [43]:
model = NewsModel()

INFO:root:Loading Trial implementation with entrypoint model_def:BBCTrial.
INFO:root:Connected to Determined master at http://10.64.140.43:8080/
INFO:root:Loaded model news-classifier version 3
INFO:root:Corresponding experiment 98
INFO:root:Loaded model from checkpoint
INFO:root:['business' 'entertainment' 'politics' 'sport' 'tech']


Fitted vectorizer with feature length 9474


In [50]:
#actual class: tech
text = """
Apple boss Tim Cook took the witness stand for the first time in his company's major legal battle with Epic Games over an alleged monopoly.
Epic, maker of the hit video game Fortnite, claims Apple's tight control over iPhone apps hurts competition.
During his appearance, Mr Cook argued that keeping control of the App Store helped keep iPhones secure.
He also said he did not know if the App Store made a profit, telling the court Apple did not break down the figures.
Facing questions about the level of profit the App Store generates from the 30% commission it takes on sales, he said: "We don't have a separate profit and loss statement for the App Store."
Instead, he said that he had a "feeling" that it was profitable - but could not share figures with the court.
Mr Cook was being questioned about his oversight of top-level decisions around the App Store's policies.
Judge Yvonne Gonzalez Rogers also questioned Mr Cook, asking about a survey that showed 39% of developers are dissatisfied with the app store.
"""

In [51]:
model.predict([text])

INFO:root:Predict called
INFO:root:Predicted news class: tech


In [55]:
#actual class: business
text2 = """
Consumers are still sceptical about electric cars and switching from petrol and diesel remains "a real challenge", the boss of Ford UK has told the BBC.
Lisa Brankin said more government support for the electric car market would be needed ahead of a proposed ban on new petrol and diesel sales in 2030.
Research from energy regulator Ofgem suggests 6.5 million households plan to buy electric cars by 2030.
But the number of electric vehicles (EVs) currently in use remains low.
Research from the Society of Motor Manufacturers and Traders (SMMT) found EVs account for just over 1% of the 35 million vehicles on UK roads.
However, numbers are increasing, with sales of battery-powered vehicles more than doubling last year while the number of plug-in hybrids also grew by more than a third.
"""

In [56]:
model.predict([text2])

INFO:root:Predict called
INFO:root:Predicted news class: business
