In [1]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import HashingVectorizer
import sys
import os

import pickle
import re

import warnings

import kfp
import kfp.components as components
import kfp.dsl as dsl
import kubeflow.fairing.utils

from kfp.components import InputPath, OutputPath

from kubernetes import client, config


import urllib.request


import requests

import os
import boto3
from botocore.client import Config

import pickle
from io import StringIO
from kale.common.serveutils import serve
from text_cleaner_2000.text_cleaner import TextCleaner

# Install a blanket "ignore" filter so no warnings are shown by later cells.
warnings.filterwarnings(action='ignore')
# Kubernetes namespace reported by Kubeflow Fairing; later cells use it for
# the kubectl secret lookup and as the upload bucket name.
NAMESPACE = kubeflow.fairing.utils.get_current_k8s_namespace()

In [29]:
# Download the shuffled ratings CSV from the in-cluster MinIO service and
# parse it into a DataFrame.
url = "http://minio-service.kubeflow.svc.cluster.local:9000/hatterasuser/data/ratings_shuffled.csv" #minio server

# The timeout keeps the cell from hanging indefinitely if MinIO is unreachable.
req = requests.get(url, timeout=60)
# Stop here on a 4xx/5xx response instead of parsing an error body as CSV.
req.raise_for_status()
content = req.content
# The raw bytes are decoded explicitly as UTF-8 before being handed to pandas.
df = pd.read_csv(StringIO(str(content, 'utf-8')))

In [30]:
# Preview the first rows of the frame as loaded (review text is still raw here).
df.head()

Unnamed: 0,review,sentiment
0,"First of all, I have to start this comment by ...",1
1,The brilliance of this story delivers at least...,1
2,Spheeris debut must be one of the best music d...,1
3,I have to admit that I had low expectations fo...,1
4,"Ying, a Chinese girl who speaks Czech, invited...",1


In [None]:
# Text cleaner instance used by the two preprocessing cells on df['review'].
cleaner = TextCleaner()

In [35]:
# Replace the review column with the output of alpha_iterator, with emoticon
# removal switched off (remove_emoticon=False).
# NOTE(review): presumably this lowercases and strips non-alphabetic
# characters — confirm against text_cleaner_2000.
# NOTE(review): the column is overwritten in place, so re-running this cell
# feeds already-processed text back into the cleaner.
df['review'] = cleaner.alpha_iterator(df['review'], remove_emoticon=False)

In [36]:
# Replace the review column with the output of stop_word_iterator.
# NOTE(review): presumably this drops stop words from each review — confirm
# against text_cleaner_2000.
# NOTE(review): the column is overwritten in place; the pre-filter text is no
# longer available after this cell runs.
df['review'] = cleaner.stop_word_iterator(df['review'])

In [40]:
# Features are the review strings; labels are the sentiment column.
x = df['review'].values
y = df['sentiment'].values
# Hold out 15% of the rows for testing. The split is seeded so that a
# Restart & Run All produces the same train/test partition every time.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.15, random_state=42
)
# Hashing vectorizer with 2**20 feature buckets; undecodable bytes are
# ignored and the default preprocessor/tokenizer are kept. The following
# cells call transform() on it directly, without a fit step.
vect = HashingVectorizer(decode_error = 'ignore',
                        n_features = 2**20,
                        preprocessor = None,
                        tokenizer = None)

In [41]:
# Vectorize the training reviews with the hashing vectorizer.
# NOTE(review): xtrain is rebound from raw text to the transformed features,
# so this cell cannot be re-run without re-running the split cell first.
xtrain = vect.transform(xtrain)

In [42]:
# Vectorize the held-out reviews with the same hashing vectorizer.
# NOTE(review): xtest is rebound from raw text to the transformed features,
# so this cell cannot be re-run without re-running the split cell first.
xtest = vect.transform(xtest)

In [43]:
# Vectorize every review (train and test rows together); x is rebound from
# the raw text array to the transformed features.
x = vect.transform(x) # for final model

In [37]:
# Preview the first rows again to inspect the review text after cleaning.
df.head()

Unnamed: 0,review,sentiment
0,start comment saying huge nightmare elm street...,1
1,brilliance story delivers skillfully crafted m...,1
2,spheeris debut best music documentaries time f...,1
3,admit low expectations movie surprised enterta...,1
4,ying chinese girl speaks czech invited screeni...,1


In [44]:
# Logistic-loss SGD classifier trained on the training split only.
clf = SGDClassifier(loss="log")
# fit() returns the estimator itself; rebinding keeps the cell output empty.
clf = clf.fit(xtrain, ytrain)

In [45]:
# Final model: this is the estimator that gets pickled and uploaded, so it is
# trained on the complete vectorized dataset (x, y) rather than on the 15%
# held-out split, which would discard most of the labelled data.
# NOTE(review): scikit-learn 1.1 renamed loss="log" to "log_loss"; switch the
# name once the environment's scikit-learn version is confirmed.
model = SGDClassifier(loss="log").fit(x, y)

In [46]:
# Serialize the final model to model.pkl in the working directory.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)

In [2]:
# Read the MinIO credentials from the mlpipeline-minio-artifact secret in the
# current namespace by shelling out to kubectl.
base = f'kubectl -n {NAMESPACE} get secret mlpipeline-minio-artifact '


def _read_minio_secret(field):
    """Return the base64-decoded value of one data field of the MinIO secret.

    Args:
        field: Key under the secret's ``data`` map, e.g. ``'accesskey'``.

    Returns:
        The decoded value exactly as printed by the shell pipeline.

    Raises:
        RuntimeError: If the pipeline produced no output, which happens when
            kubectl fails or the field does not exist.
    """
    command = base + '-o jsonpath="{.data.' + field + '}" | base64 -d'
    # The context manager guarantees the pipe is closed after reading.
    with os.popen(command) as stream:
        value = stream.read()
    # The exit status of the pipeline is that of base64, which succeeds on
    # empty input, so an empty result is the only reliable failure signal.
    if not value:
        raise RuntimeError(
            f"could not read '{field}' from secret mlpipeline-minio-artifact"
        )
    return value


aws_access_key_id = _read_minio_secret('accesskey')
aws_secret_access_key = _read_minio_secret('secretkey')


In [3]:
# S3-compatible client pointed at the in-cluster MinIO endpoint, using the
# credentials read from the Kubernetes secret and "s3v4" request signing.
s3_signing = Config(signature_version="s3v4")
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio-service.kubeflow.svc.cluster.local:9000",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    config=s3_signing,
)


In [4]:
# Name and version that together form the object key of the uploaded model.
model_name = "sentiment-model"
model_version = "1"

# Upload the pickled model into the bucket named after the current namespace,
# with a public-read ACL on the object.
s3.upload_file(
    Filename="model.pkl",
    Bucket=NAMESPACE,
    Key=f"models/{model_name}-{model_version}.pkl",
    ExtraArgs={"ACL": "public-read"},
)