## Test Pipeline Model

In [1]:
import pickle
import pandas as pd
import numpy as np

# Data handling and processing #
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

# TextBlob #
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

# Model preparation #
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Start from NLTK's English stopword list, with 'not' taken out.
stop = stopwords.words('english')
stop.remove('not')

# Strip punctuation from every stopword (e.g. "you're" -> "youre") so the
# entries match review text whose punctuation has already been removed.
strip_punctuation = str.maketrans('', '', string.punctuation)
new_stopwords = [item.translate(strip_punctuation) for item in stop]

# Extra informal spellings to filter out as well.
new_stopwords.extend(["im", "i'm"])
new_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre']

In [3]:
# Clothing-related terms. Not referenced by any other cell visible in this notebook.
clothes = 'top dress blouse shirt skirt jeans jean jumpsuit color camisole'.split(' ')

In [4]:
# Cleaning Text #

def clean_process(text):
    """Turn a raw review string into a list of lemmatized word tokens.

    Steps, in order: lowercase, drop punctuation and digits, strip
    leading/trailing whitespace, spell-correct with TextBlob, remove
    stopwords, split on whitespace, lemmatize each token with WordNet.

    The loaded pipeline's CountVectorizer uses this function as its
    ``analyzer``, so the tokens returned here must stay consistent with the
    ones produced when the pipeline was fitted.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The lemmatized tokens that survive cleaning (may be empty).

    Notes
    -----
    NOTE(review): the notebook removes 'not' from ``stop`` before building
    ``new_stopwords``, but the full NLTK English list is also applied here,
    so 'not' is still filtered out. Behaviour is kept as-is because the
    pickled model depends on this tokenization; confirm against the training
    notebook before changing it.
    """
    # Make lowercase, then remove punctuation and numbers in a single pass.
    unwanted_chars = set(string.punctuation) | set(string.digits)
    clean_text = ''.join(char for char in text.lower() if char not in unwanted_chars)

    # Remove extra whitespace at the start/end of the review.
    clean_text = clean_text.strip()

    # Spelling correction. The result is a TextBlob, so the .split(' ')
    # below is TextBlob's split rather than str.split.
    clean_text = TextBlob(clean_text).correct()

    # Remove stopwords. The NLTK list is fetched once here instead of once
    # per word, and both lists become sets for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    custom_stopwords = set(new_stopwords)
    kept_words = [
        word for word in clean_text.split(' ')
        if word not in english_stopwords and word not in custom_stopwords
    ]

    # Re-join and tokenize on any whitespace; this also discards the empty
    # strings that split(' ') yields for consecutive spaces.
    tokens = ' '.join(kept_words).split()

    # Lemmatize every token.
    return [lemmatizer.lemmatize(word) for word in tokens]

In [5]:
# Load the fitted pipeline. Its CountVectorizer refers to `clean_process` as
# the analyzer, so that function must already be defined in this notebook.
# The context manager closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pipeline.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [6]:
# Display the pipeline's (name, estimator) stages to confirm what was loaded.
model.steps

[('bow',
  CountVectorizer(analyzer=<function clean_process at 0x000001ED8A46F5E8>,
                  binary=False, decode_error='strict',
                  dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                  lowercase=True, max_df=0.9, max_features=None, min_df=0.005,
                  ngram_range=(1, 2), preprocessor=None, stop_words=None,
                  strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                  tokenizer=None, vocabulary=None)),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('classifier',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min

In [7]:
# Load the second saved pipeline; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pipeline_1.sav', 'rb') as model_file:
    model_1 = pickle.load(model_file)

In [8]:
# Load the saved pipeline stored as 'pipeline_3.sav' (note: the variable is
# model_2 while the file is numbered 3). The context manager closes the file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pipeline_3.sav', 'rb') as model_file:
    model_2 = pickle.load(model_file)

In [9]:
# Load the saved pipeline stored as 'pipeline_4.sav' (note: the variable is
# model_3 while the file is numbered 4). The context manager closes the file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pipeline_4.sav', 'rb') as model_file:
    model_3 = pickle.load(model_file)

In [10]:
# Hand-written sample reviews used to probe the loaded models.
tes1, tes2, tes3, tes4, tes5 = (
    'I love this dress, so pretty!!',
    "The dress is weird and not fit, disappointed",
    "Size doesn't fit, too small",
    "Pattern is lovely, but the dress size is wrong",
    "I don't like the dress, returned it",
)

In [25]:
# Wrap the sample review under test in a single-row DataFrame; the models
# below are fed the 'col' column. Change `tes2` here to try another sample.
d = dict(col=tes2)
df_res = pd.DataFrame(d, index=[0])
df_res

Unnamed: 0,col
0,"The dress is weird and not fit, disappointed"


In [12]:
# Human-readable names for the predicted class labels.
isian = {
    0: 'Not Recommended',
    1: 'Recommended',
}

Test Model: Membandingkan Pipeline, Pipeline 1, Pipeline 3, dan Pipeline 4 (decision tree)

In [26]:
# Predict the label, then report the probability of that same label.
# predict_proba columns follow the order of `classes_`, so look up the column
# position of the predicted label instead of using the label value as an index.
result = model.predict(df_res['col'])[0]
prob_num = model.predict_proba(df_res['col'])
class_position = list(model.classes_).index(result)
prob = (prob_num[0][class_position])*100
print(f"{isian[result]} dengan probability {round(prob, 2)} %")

Not Recommended dengan probability 59.0 %


In [27]:
# Predict the label, then report the probability of that same label.
# predict_proba columns follow the order of `classes_`, so look up the column
# position of the predicted label instead of using the label value as an index.
result = model_1.predict(df_res['col'])[0]
prob_num = model_1.predict_proba(df_res['col'])
class_position = list(model_1.classes_).index(result)
prob = (prob_num[0][class_position])*100
print(f"{isian[result]} dengan probability {round(prob, 2)} %")

Not Recommended dengan probability 68.0 %


In [28]:
# Predict the label, then report the probability of that same label.
# predict_proba columns follow the order of `classes_`, so look up the column
# position of the predicted label instead of using the label value as an index.
result = model_2.predict(df_res['col'])[0]
prob_num = model_2.predict_proba(df_res['col'])
class_position = list(model_2.classes_).index(result)
prob = (prob_num[0][class_position])*100
print(f"{isian[result]} dengan probability {round(prob, 2)} %")

Not Recommended dengan probability 81.0 %


In [29]:
# Decision tree pipeline.
# Predict the label, then report the probability of that same label.
# predict_proba columns follow the order of `classes_`, so look up the column
# position of the predicted label instead of using the label value as an index.
result = model_3.predict(df_res['col'])[0]
prob_num = model_3.predict_proba(df_res['col'])
class_position = list(model_3.classes_).index(result)
prob = (prob_num[0][class_position])*100
print(f"{isian[result]} dengan probability {round(prob, 2)} %")

Recommended dengan probability 67.31 %


Ketika model dites dengan contoh review, model yang paling stabil adalah Pipeline (menggunakan RFC percobaan pertama). Meskipun probabilitasnya tidak terlalu tinggi saat menebak kelas 0 (hanya sekitar 50-60%), model ini hampir selalu bisa menebak kelasnya, dibandingkan decision tree yang kadang bisa menebak tes5 dengan probabilitas 90% tetapi gagal di tes2.

### Done.