In [15]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import HashingVectorizer
import sys

import pickle
import re

import warnings

import kfp
import kfp.components as components
import kfp.dsl as dsl
import kubeflow.fairing.utils

from kfp.components import InputPath, OutputPath

import urllib.request
import os

import requests

import os
import boto3
from botocore.client import Config

import pickle

from kale.common.serveutils import serve
from text_cleaner_2000.text_cleaner import TextCleaner

# Silence every warning for the rest of the session. Note that this also hides
# any pandas / scikit-learn warnings raised by the cells further down.
warnings.filterwarnings('ignore')
# Kubernetes namespace reported by kubeflow fairing; it is used later in the
# notebook as the bucket name for the model upload.
NAMESPACE = kubeflow.fairing.utils.get_current_k8s_namespace()

In [2]:
# Show the resolved namespace as the cell output.
NAMESPACE

'hatterasuser'

In [3]:
# Download the shuffled ratings CSV from the in-cluster MinIO server.
url = "http://minio-service.kubeflow.svc.cluster.local:9000/hatterasuser/data/ratings_shuffled.csv" #minio server

# The timeout keeps the cell from hanging indefinitely when MinIO is
# unreachable; raise_for_status() aborts on 4xx/5xx responses so that an HTTP
# error page is never passed on and saved as if it were the dataset.
req = requests.get(url, timeout=60)
req.raise_for_status()
content = req.content

In [4]:
# Persist the downloaded bytes locally so pandas can read them from disk.
# open() does not create missing parent directories, so make sure data/ exists
# first (a fresh checkout or pod would otherwise fail with FileNotFoundError).
os.makedirs('data', exist_ok=True)
with open('data/ratings_shuffled.csv', 'wb') as csv_file:
    csv_file.write(content)

In [5]:
# Load the locally saved ratings file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/ratings_shuffled.csv')

In [6]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,review,sentiment
0,"First of all, I have to start this comment by ...",1
1,The brilliance of this story delivers at least...,1
2,Spheeris debut must be one of the best music d...,1
3,I have to admit that I had low expectations fo...,1
4,"Ying, a Chinese girl who speaks Czech, invited...",1


In [7]:
# Set four rows (positions 5 through 8) aside as a holdout sample.
holdout_positions = [5, 6, 7, 8]
holdout = df.iloc[holdout_positions]

In [8]:
# Collect every index label of df that does not belong to the holdout rows,
# preserving df's original order.
holdout_labels = set(holdout.index)
indexes = [label for label in df.index if label not in holdout_labels]

In [9]:
# Everything except the holdout rows is used for modelling.
# `indexes` holds index *labels* (taken from df.index), so select with .loc
# rather than the positional .iloc; the two only coincide for a default
# RangeIndex. The explicit .copy() makes model_data an independent frame, so
# the later in-place assignments to its 'review' column do not operate on a
# selection of df (which pandas flags with SettingWithCopyWarning).
model_data = df.loc[indexes].copy()

In [10]:
# Write the holdout rows to disk without the DataFrame index column.
holdout.to_csv(path_or_buf="data/holdout.csv", index=False)

In [16]:
# Text-cleaning helper from text_cleaner_2000, built with its default
# arguments; its alpha_iterator and stop_word_iterator methods are applied to
# the review column in the following cells.
cleaner = TextCleaner()

In [17]:
# Run the cleaner's alpha_iterator over the review column (emoticon removal
# switched off) and store the result back in the same column.
# NOTE(review): the exact normalisation is defined by text_cleaner_2000 - confirm there.
reviews_before_alpha = model_data['review']
model_data['review'] = cleaner.alpha_iterator(reviews_before_alpha, remove_emoticon=False)

In [18]:
# Pass the review column through the cleaner's stop_word_iterator and store
# the result back in the same column.
# NOTE(review): the stop-word list is defined by text_cleaner_2000 - confirm there.
reviews_before_stop_words = model_data['review']
model_data['review'] = cleaner.stop_word_iterator(reviews_before_stop_words)

In [19]:
# Preview the first five rows after cleaning.
model_data.head(n=5)

Unnamed: 0,review,sentiment
0,start comment saying huge nightmare elm street...,1
1,brilliance story delivers skillfully crafted m...,1
2,spheeris debut best music documentaries time f...,1
3,admit low expectations movie surprised enterta...,1
4,ying chinese girl speaks czech invited screeni...,1


In [20]:
# Pull the cleaned review texts (features) and the sentiment labels (targets)
# out of the frame as plain arrays.
x, y = model_data['review'].values, model_data['sentiment'].values

In [21]:
# Hold out 15% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so a kernel restart followed by "Run All" yields the
# same train/test partition instead of a different one on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [22]:
# Hashing vectorizer configuration: 2**20 hashed feature columns, undecodable
# bytes ignored, and no custom preprocessor or tokenizer.
vectorizer_options = {
    'decode_error': 'ignore',
    'n_features': 2**20,
    'preprocessor': None,
    'tokenizer': None,
}
vect = HashingVectorizer(**vectorizer_options)

In [23]:
# Vectorise the training reviews. The name xtrain is rebound from the raw
# texts to the transformed output, so re-running this cell on its own would
# pass already-transformed data back into vect.transform.
xtrain = vect.transform(xtrain)

In [24]:
# Vectorise the test reviews with the same vectorizer. As with the training
# data, xtest is rebound in place, so this cell should only be run once per
# train/test split.
xtest = vect.transform(xtest)

In [25]:
# Vectorise the complete (train + test) review array. x is rebound from the
# raw texts to the transformed output, so this cell should only be run once
# after x has been extracted from model_data.
x = vect.transform(x) # for final model

In [29]:
# Train an SGD classifier with the "log" loss on the training split only.
# NOTE(review): newer scikit-learn releases name this loss "log_loss" - confirm
# against the installed version.
clf = SGDClassifier(loss="log")
clf = clf.fit(xtrain, ytrain)

In [34]:
# Final model that gets pickled and uploaded. It is fitted on the complete
# vectorised dataset (x, y) rather than on the 15% test split: the test split
# is the smallest slice of the data, and x was vectorised specifically for
# this final fit.
model = SGDClassifier(loss="log").fit(x, y)

In [36]:
# Serialise the trained model to model.pkl in the working directory.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [39]:
# Read the MinIO credentials from a local file so they are not hard-coded in
# the notebook. The file is expected to hold the access key followed by the
# secret key, separated by whitespace (spaces or newlines).
with open("credentials.txt", "r") as f:
    creds = f.read()

# Split once and validate up front: a file with fewer than two tokens would
# otherwise surface as an unexplained IndexError.
credential_fields = creds.split()
if len(credential_fields) < 2:
    raise ValueError(
        "credentials.txt must contain an access key followed by a secret key"
    )
aws_access_key_id, aws_secret_access_key = credential_fields[:2]

In [52]:
# S3 client pointed at the in-cluster MinIO endpoint, authenticated with the
# credentials loaded above and configured for s3v4 request signing.
signing_config = Config(signature_version="s3v4")
s3 = boto3.client(
    service_name="s3",
    endpoint_url="http://minio-service.kubeflow.svc.cluster.local:9000",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    config=signing_config,
)


In [53]:
# Name and version that together form the object key of the uploaded model.
model_name = "sentiment-model"
model_version = "1"

# Upload the pickled model into the bucket named after the current namespace,
# with a public-read ACL on the object.
s3.upload_file(
    Filename="model.pkl",
    Bucket=NAMESPACE,
    Key=f"models/{model_name}-{model_version}.pkl",
    ExtraArgs={"ACL": "public-read"},
)