In [1]:
import kfp
import kfp.dsl as dsl
import kfp.components as components
from kfp.components import func_to_container_op, InputPath, OutputPath
from typing import NamedTuple
import os


In [2]:
def read_data(data_path:InputPath,HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str,log_folder:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Load the raw reviews CSV, pickle it into ``log_folder`` and render a preview.

    Args:
        data_path: Location of the reviews CSV; handed straight to ``pandas.read_csv``.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        TEXT_HTML_TEMPLATE: ``str.format`` template that wraps a block of plain text.
        log_folder: Directory that receives ``raw_reviews.pkl``.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    # Imports stay inside the body; presumably so the function remains self-contained
    # when wrapped as a pipeline component (see func_to_container_op in the first cell).
    import io
    import json
    from collections import namedtuple

    import joblib
    import pandas as pd

    raw_reviews = pd.read_csv(data_path)
    print ("The shape of the  data is (row, column):"+ str(raw_reviews.shape))

    # DataFrame.info() writes to a stream and returns None, so capture the text once
    # and reuse it for both the log and the HTML report.
    buffer = io.StringIO()
    raw_reviews.info(buf=buffer)
    df_info = buffer.getvalue()
    print(df_info)

    joblib.dump(raw_reviews, log_folder + '/raw_reviews.pkl')

    # '%' binds tighter than '+': build the complete body first so the info section
    # ends up inside the template instead of being appended after it.
    preview_table = raw_reviews.head().to_html(classes='table table-striped')
    html_content = HTML_TEMPLATE % (preview_table + TEXT_HTML_TEMPLATE.format(df_info))

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content,
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', ['logdir' ,'mlpipeline_ui_metadata'])
    return visualization_output(log_folder,json.dumps(metadata))
    
    #return ([(log_folder),()])

In [3]:
# import pandas as pd

# tr = pd.read_csv("Musical_instruments_reviews.csv")


In [4]:
def data_preprocessing(log_folder:str,HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Fill missing review text and merge text and summary into one ``reviews`` column.

    Reads ``raw_reviews.pkl`` from ``log_folder`` and writes ``process_reviews.pkl``
    next to it.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        TEXT_HTML_TEMPLATE: ``str.format`` template that wraps a block of plain text.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    import json
    import os
    from collections import namedtuple

    import joblib

    # Passing the path (not an open handle) lets joblib open and close the file itself.
    raw_reviews = joblib.load(os.path.join(log_folder, 'raw_reviews.pkl'))
    # Work on a copy so the loaded frame is left untouched.
    process_reviews = raw_reviews.copy()

    null_counts_before = process_reviews.isnull().sum()
    print(null_counts_before)
    before = TEXT_HTML_TEMPLATE.format("Statistics before filling null values: \n"+ str(null_counts_before))

    process_reviews['reviewText'] = process_reviews['reviewText'].fillna('Missing')
    # NOTE(review): text and summary are joined without a separator, so the last word
    # of the text and the first word of the summary fuse together — confirm intended.
    process_reviews['reviews'] = process_reviews['reviewText'] + process_reviews['summary']
    process_reviews = process_reviews.drop(['reviewText', 'summary'], axis=1)

    # This section reports the state AFTER filling, and is labelled as such.
    after = TEXT_HTML_TEMPLATE.format("Statistics after filling null values: \n"+ str(process_reviews.isnull().sum()))
    print(process_reviews.head())
    joblib.dump(process_reviews, os.path.join(log_folder, 'process_reviews.pkl'))

    # '%' binds tighter than '+': assemble the full body before filling the template.
    preview_table = process_reviews.head().to_html(classes='table table-striped')
    html_content = HTML_TEMPLATE % (preview_table + before + after)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['logdir' ,'mlpipeline_ui_metadata'])
    return visualization_output(log_folder,json.dumps(metadata))

In [5]:
def feature_creation(log_folder:str,HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Derive a ``sentiment`` label from the ``overall`` rating of every review.

    Ratings 1 and 2 become ``'Negative'``, 3 becomes ``'Neutral'``, 4 and 5 become
    ``'Positive'``; any other value is labelled ``-1``. ``process_reviews.pkl`` in
    ``log_folder`` is read and overwritten with the extra column.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        TEXT_HTML_TEMPLATE: ``str.format`` template that wraps a block of plain text.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    import json
    import os
    from collections import namedtuple

    import joblib

    reviews_path = os.path.join(log_folder, 'process_reviews.pkl')
    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(reviews_path)
    print(process_reviews['overall'].value_counts())

    sentiment_by_rating = {
        1.0: 'Negative',
        2.0: 'Negative',
        3.0: 'Neutral',
        4.0: 'Positive',
        5.0: 'Positive',
    }
    # Only the rating column is needed, so map over that single column rather than
    # materialising every row with DataFrame.apply(axis=1).
    process_reviews['sentiment'] = process_reviews['overall'].map(
        lambda rating: sentiment_by_rating.get(rating, -1)
    )
    print(process_reviews.head())

    sentiment_counts = process_reviews['sentiment'].value_counts()
    senti = TEXT_HTML_TEMPLATE.format("Sentiments values counts: \n"+ str(sentiment_counts))
    print(sentiment_counts)

    joblib.dump(process_reviews, reviews_path)

    # '%' binds tighter than '+': assemble the full body before filling the template.
    preview_table = process_reviews.head().to_html(classes='table table-striped')
    html_content = HTML_TEMPLATE % (preview_table + senti)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['logdir' ,'mlpipeline_ui_metadata'])
    return visualization_output(log_folder,json.dumps(metadata))

In [6]:
def time_related_feature_creation(log_folder:str,HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Split ``reviewTime`` into separate ``year``, ``month`` and ``day`` columns.

    ``reviewTime`` is split on its first comma into a date part and a year part, and
    the date part is split on its first space into month and day. The source column
    and the temporary ``date`` column are dropped. ``process_reviews.pkl`` in
    ``log_folder`` is read and overwritten.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        TEXT_HTML_TEMPLATE: Not used by this step; kept so all steps share one signature.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    import json
    import os
    from collections import namedtuple

    import joblib

    reviews_path = os.path.join(log_folder, 'process_reviews.pkl')
    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(reviews_path)

    # "<date>, <year>" -> column 0 holds the date, column 1 the year.
    date_and_year = process_reviews["reviewTime"].str.split(",", n = 1, expand = True)
    process_reviews["date"] = date_and_year[0]
    # Splitting on the comma leaves the blank that followed it glued to the year
    # (" 2014"); strip it so the column holds the bare year text.
    process_reviews["year"] = date_and_year[1].str.strip()
    process_reviews = process_reviews.drop(['reviewTime'], axis=1)

    # First table of the report: state after the date/year split.
    df = process_reviews.head().to_html(classes='table table-striped')

    # "<month> <day>" -> column 0 holds the month, column 1 the day.
    month_and_day = process_reviews["date"].str.split(" ", n = 1, expand = True)
    process_reviews["month"] = month_and_day[0]
    process_reviews["day"] = month_and_day[1]
    process_reviews = process_reviews.drop(['date'], axis=1)

    # Second table of the report: state after the month/day split.
    df += process_reviews.head().to_html(classes='table table-striped')
    joblib.dump(process_reviews, reviews_path)

    html_content = HTML_TEMPLATE % df

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['logdir' ,'mlpipeline_ui_metadata'])
    return visualization_output(log_folder,json.dumps(metadata))

In [7]:
def feature_extraction(log_folder:str,HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Replace the ``helpful`` column with a numeric ``helpful_rate``.

    Each ``helpful`` value is parsed as two integers and the rate is the first
    divided by the second, rounded to two decimals; a zero denominator yields 0.
    ``process_reviews.pkl`` in ``log_folder`` is read and overwritten.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        TEXT_HTML_TEMPLATE: ``str.format`` template that wraps a block of plain text.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    import json
    import os
    from collections import namedtuple

    import joblib
    import numpy as np

    reviews_path = os.path.join(log_folder, 'process_reviews.pkl')
    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(reviews_path)

    # Assumes every value is text shaped like "[3, 4]" — TODO confirm against the CSV.
    # Drop the brackets, then split on the first comma into the two counts.
    vote_parts = (
        process_reviews["helpful"]
        .str.strip()
        .str.strip("[]")
        .str.split(",", n = 1, expand = True)
    )
    first_count = vote_parts[0].str.strip().astype(int)
    second_count = vote_parts[1].str.strip().astype(int)

    # Integer Series division never raises ZeroDivisionError: 0/0 gives NaN and x/0
    # gives +/-inf, so both are mapped to 0 explicitly. The original row index is
    # kept throughout so the rate lines up with its review on assignment.
    helpful_rate = (
        (first_count / second_count)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .round(2)
    )

    process_reviews['helpful_rate'] = helpful_rate
    process_reviews = process_reviews.drop(['helpful'], axis=1)

    preview_table = process_reviews.head().to_html(classes='table table-striped')
    rate_summary = TEXT_HTML_TEMPLATE.format("Helpful rate value count")
    rate_summary += TEXT_HTML_TEMPLATE.format(process_reviews['helpful_rate'].value_counts())

    joblib.dump(process_reviews, reviews_path)

    # '%' binds tighter than '+': assemble the full body before filling the template.
    html_content = HTML_TEMPLATE % (preview_table + rate_summary)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['logdir' ,'mlpipeline_ui_metadata'])
    return visualization_output(log_folder,json.dumps(metadata))
    

In [8]:
def data_cleaning(log_folder:str,HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Normalise the review text and strip stop words.

    Drops ``reviewerName`` and ``unixReviewTime``, cleans every entry of ``reviews``
    and removes stop words from it. ``process_reviews.pkl`` in ``log_folder`` is read
    and overwritten, and the stop-word list is written to ``stop_words.pkl``.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        TEXT_HTML_TEMPLATE: ``str.format`` template that wraps a block of plain text.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    import json
    import os
    import re
    import string
    from collections import namedtuple

    import joblib
    import pandas as pd

    reviews_path = os.path.join(log_folder, 'process_reviews.pkl')
    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(reviews_path)

    #Removing unnecessary columns
    process_reviews = process_reviews.drop(['reviewerName','unixReviewTime'], axis=1)

    # Built once instead of on every call; matches any single punctuation character.
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    def review_cleaning(text):
        '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.'''
        # Patterns are raw strings so the backslashes reach the regex engine untouched.
        text = str(text).lower()
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = punctuation_re.sub('', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\w*\d\w*', '', text)
        return text

    process_reviews['reviews'] = process_reviews['reviews'].apply(review_cleaning)
    print(process_reviews.head())

    stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each', 
                 'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
                 'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above', 
                 'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't", 
                 'very', 'should', 'any', 'y', 'isn', 'who',  'a', 'they', 'to', 'too', "should've", 'has', 'before',
                 'into', 'yours', "it's", 'do', 'against', 'on',  'now', 'her', 've', 'd', 'by', 'am', 'from', 
                 'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
                 'his', 'himself', 'ourselves',  'was', 'through', 'out', 'below', 'own', 'myself', 'theirs', 
                 'me', 'why', 'once',  'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
                 'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
                 'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']
    # The list is what gets reported and pickled; the set is only for fast lookups.
    stop_word_lookup = frozenset(stop_words)

    words = TEXT_HTML_TEMPLATE.format("stop_words \n "+str(stop_words))
    process_reviews['reviews'] = process_reviews['reviews'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_word_lookup)
    )
    print(process_reviews.head())
    words += process_reviews.head().to_html()

    mean_rate_by_sentiment = pd.DataFrame(process_reviews.groupby('sentiment')['helpful_rate'].mean())
    words += TEXT_HTML_TEMPLATE.format(mean_rate_by_sentiment)
    print(mean_rate_by_sentiment)

    joblib.dump(process_reviews, reviews_path)
    joblib.dump(stop_words, os.path.join(log_folder, 'stop_words.pkl'))

    html_content = HTML_TEMPLATE % words

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['logdir' ,'mlpipeline_ui_metadata'])
    return visualization_output(log_folder,json.dumps(metadata))

In [9]:
def sentiment_categories_visualize_data(log_folder:str,IMAGE_HTML_TEMPLATE:str,HTML_TEMPLATE:str)-> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render a violin plot of helpful rate per sentiment category as an inline page.

    Reads ``process_reviews.pkl`` from ``log_folder``; rows whose ``helpful_rate`` is
    exactly 0 are left out of the plot.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        IMAGE_HTML_TEMPLATE: ``str.format`` template filled with (heading, base64 PNG).
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.

    Returns:
        A one-field named tuple whose ``mlpipeline_ui_metadata`` is a JSON string
        describing a single inline ``web-app`` output.
    """
    import os
    # Best-effort installs of the only plotting packages this step uses; os.system
    # does not raise, so a failure surfaces at the imports below.
    os.system('pip install seaborn')
    os.system('pip install matplotlib')

    import base64
    import json
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(os.path.join(log_folder, 'process_reviews.pkl'))

    #plot layout
    plt.rcParams.update({'font.size': 18, 'figure.figsize': (16, 9)})

    # Keep only the two plotted columns and drop reviews with a zero helpful rate.
    senti_help = process_reviews[['sentiment', 'helpful_rate']]
    senti_help = senti_help[senti_help['helpful_rate'] != 0.00]

    #Plotting phase
    ax = sns.violinplot(x=senti_help["sentiment"], y=senti_help["helpful_rate"])
    ax.set_xlabel('Sentiment categories')
    ax.set_ylabel('helpful rate')

    fig = ax.get_figure()
    tmpfile = BytesIO()
    fig.savefig(tmpfile, format='png')
    # The PNG bytes are already captured; release the figure.
    plt.close(fig)
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')

    html_content = HTML_TEMPLATE % IMAGE_HTML_TEMPLATE.format('Sentiment vs Helpfulness',encoded)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(metadata))
            
   

In [10]:
 def sentiment_count_visualize_data(log_folder:str,IMAGE_HTML_TEMPLATE:str,HTML_TEMPLATE:str)-> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render a line chart of review counts per year, one line per sentiment.

    Reads ``process_reviews.pkl`` from ``log_folder`` and groups it by ``year`` and
    ``sentiment``.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        IMAGE_HTML_TEMPLATE: ``str.format`` template filled with (heading, base64 PNG).
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.

    Returns:
        A one-field named tuple whose ``mlpipeline_ui_metadata`` is a JSON string
        describing a single inline ``web-app`` output.
    """
    import os
    # Best-effort install of the only plotting package this step uses; os.system
    # does not raise, so a failure surfaces at the import below.
    os.system('pip install matplotlib')

    import base64
    import json
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import matplotlib.pyplot as plt

    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(os.path.join(log_folder, 'process_reviews.pkl'))

    #plot layout
    plt.rcParams.update({'font.size': 18, 'figure.figsize': (16, 9)})

    # One column per sentiment, one row per year, cell = number of reviews.
    counts_by_year = process_reviews.groupby(['year','sentiment'])['sentiment'].count().unstack()
    ax = counts_by_year.plot(legend=True)
    ax.set_title('Year and Sentiment count')
    ax.set_xlabel('Year')
    ax.set_ylabel('Sentiment count')
    fig = ax.get_figure()

    tmpfile = BytesIO()
    fig.savefig(tmpfile, format='png')
    # The PNG bytes are already captured; release the figure.
    plt.close(fig)
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')
    html_content = HTML_TEMPLATE % IMAGE_HTML_TEMPLATE.format('Year and Sentiment count',encoded)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(metadata))

In [11]:
    
    #Creating a dataframe
    

In [12]:
def reviews_count_visualize_data(log_folder:str,IMAGE_HTML_TEMPLATE:str,HTML_TEMPLATE:str)-> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render a bar chart of the number of reviews per day of the month.

    Reads ``process_reviews.pkl`` from ``log_folder`` and counts ``reviews`` per
    ``day`` value.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        IMAGE_HTML_TEMPLATE: ``str.format`` template filled with (heading, base64 PNG).
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.

    Returns:
        A one-field named tuple whose ``mlpipeline_ui_metadata`` is a JSON string
        describing a single inline ``web-app`` output.
    """
    import os
    # Best-effort installs of the only plotting packages this step uses; os.system
    # does not raise, so a failure surfaces at the imports below.
    os.system('pip install seaborn')
    os.system('pip install matplotlib')

    import base64
    import json
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(os.path.join(log_folder, 'process_reviews.pkl'))

    #plot layout
    plt.rcParams.update({'font.size': 18, 'figure.figsize': (16, 9)})

    day = process_reviews.groupby('day')['reviews'].count().reset_index()
    # The day is cast to an integer so it orders numerically rather than as text.
    day['day'] = day['day'].astype('int64')
    # sort_values returns a new frame; keep it, otherwise the sort is lost.
    day = day.sort_values(by=['day'])

    #Plotting the graph
    ax = sns.barplot(x="day", y="reviews", data=day)
    ax.set_title('Day vs Reviews count')
    ax.set_xlabel('Day')
    ax.set_ylabel('Reviews count')
    fig = ax.get_figure()

    tmpfile = BytesIO()
    fig.savefig(tmpfile, format='png')
    # The PNG bytes are already captured; release the figure.
    plt.close(fig)
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')
    # The heading matches this chart rather than the year/sentiment one.
    html_content = HTML_TEMPLATE % IMAGE_HTML_TEMPLATE.format('Day vs Reviews count',encoded)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(metadata))
    

In [13]:
def new_features(log_folder:str)-> NamedTuple('Outputs', [('logdir',str)]):
    """Add ``polarity``, ``review_len`` and ``word_count`` columns to the reviews.

    ``polarity`` is TextBlob's sentiment polarity of each review, ``review_len`` its
    character count and ``word_count`` its number of whitespace-separated tokens.
    ``process_reviews.pkl`` in ``log_folder`` is read and overwritten.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.

    Returns:
        A one-field named tuple whose ``logdir`` is ``log_folder`` unchanged.
    """
    import os
    # Best-effort install; os.system does not raise, so a failure surfaces at the
    # textblob import below.
    os.system('pip install textblob')

    from collections import namedtuple

    import joblib
    from textblob import TextBlob

    reviews_path = os.path.join(log_folder, 'process_reviews.pkl')
    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(reviews_path)

    process_reviews['polarity'] = process_reviews['reviews'].map(lambda text: TextBlob(text).sentiment.polarity)
    process_reviews['review_len'] = process_reviews['reviews'].astype(str).apply(len)
    process_reviews['word_count'] = process_reviews['reviews'].apply(lambda x: len(str(x).split()))
    print(process_reviews.head())
    print(process_reviews.columns)

    joblib.dump(process_reviews, reviews_path)

    # Return a named tuple, as the annotation declares and the other steps do;
    # positional access (result[0]) keeps working.
    outputs = namedtuple('Outputs', ['logdir'])
    return outputs(log_folder)

In [14]:
def polarity_visualization(log_folder:str,HTML_TEMPLATE:str,IMAGE_HTML_TEMPLATE:str)-> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render histograms of polarity, rating, review length and word count.

    Reads ``process_reviews.pkl`` from ``log_folder`` and draws one plotly histogram
    for each of the ``polarity``, ``overall``, ``review_len`` and ``word_count``
    columns, concatenated into one page.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.
        IMAGE_HTML_TEMPLATE: Not used by this step; kept so the signature stays unchanged.

    Returns:
        A one-field named tuple whose ``mlpipeline_ui_metadata`` is a JSON string
        describing a single inline ``web-app`` output.
    """
    import os
    # Best-effort install of the only plotting package this step uses; os.system
    # does not raise, so a failure surfaces at the import below.
    os.system('pip install plotly')

    import json
    from collections import namedtuple

    import joblib
    import plotly.graph_objs as go

    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(os.path.join(log_folder, 'process_reviews.pkl'))

    def histogram_html(column, title, x_title):
        """Return a standalone HTML page holding a histogram of one column."""
        # All y-axis settings live in one dict: when a layout receives both a
        # "yaxis" entry in its dict argument and a yaxis= keyword, the keyword takes
        # precedence and the "count" title would be dropped.
        layout = go.Layout(
            title=title,
            xaxis={"title": x_title},
            yaxis={"title": "count", "tickmode": "array", "automargin": True},
            showlegend=False,
        )
        fig = go.Figure(data=[go.Histogram(x=process_reviews[column])], layout=layout)
        return fig.to_html(full_html=True)

    polarity = histogram_html('polarity', 'Sentiment Polarity Distribution', 'polarity')
    rating = histogram_html('overall', 'Review Rating Distribution', 'Rating')
    text = histogram_html('review_len', 'Review Text Length Distribution', 'review length')
    word = histogram_html('word_count', 'Review Text Word Count Distribution', 'word count')

    # '%' binds tighter than '+': assemble the full body before filling the template.
    html_content = HTML_TEMPLATE % (polarity + rating + text + word)
    print(html_content[:100])

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(metadata))
    

In [15]:
def ngram_analysis(log_folder:str,HTML_TEMPLATE:str)-> NamedTuple('Outputs', [('logdir',str),('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Chart the most frequent 1-, 2- and 3-grams per sentiment class.

    Reads ``process_reviews.pkl`` from ``log_folder``, splits it into positive,
    neutral and negative reviews (rows containing any missing value are dropped) and
    writes the three subsets to ``review_pos.pkl``, ``review_neu.pkl`` and
    ``review_neg.pkl``. For each n-gram size a three-row plotly figure shows the 25
    most frequent n-grams of every class.

    Args:
        log_folder: Directory holding the pickled intermediate data frames.
        HTML_TEMPLATE: ``%``-style template that receives the whole page body as one string.

    Returns:
        A ``(logdir, mlpipeline_ui_metadata)`` named tuple: ``log_folder`` unchanged and a
        JSON string describing a single inline ``web-app`` output.
    """
    import os
    # Best-effort installs of the packages this step uses (wordcloud supplies the
    # STOPWORDS set); os.system does not raise, so a failure surfaces at the imports.
    os.system('pip install plotly')
    os.system('pip install wordcloud')

    import json
    from collections import defaultdict, namedtuple

    import joblib
    import pandas as pd
    import plotly.graph_objs as go
    from plotly.subplots import make_subplots
    from wordcloud import STOPWORDS

    # Passing the path (not an open handle) lets joblib open and close the file itself.
    process_reviews = joblib.load(os.path.join(log_folder, 'process_reviews.pkl'))

    review_pos = process_reviews[process_reviews["sentiment"]=='Positive'].dropna()
    review_neu = process_reviews[process_reviews["sentiment"]=='Neutral'].dropna()
    review_neg = process_reviews[process_reviews["sentiment"]=='Negative'].dropna()

    print("Positive: ",review_pos.shape)
    print("Neutral: ",review_neu.shape)
    print("Negative: ",review_neg.shape)

    def generate_ngrams(text, n_gram=1):
        """Return the n-grams of ``text`` after dropping empty tokens and stop words."""
        tokens = [token for token in text.lower().split(" ") if token != "" and token not in STOPWORDS]
        ngrams = zip(*[tokens[i:] for i in range(n_gram)])
        return [" ".join(ngram) for ngram in ngrams]

    def top_ngrams_trace(reviews, n_gram, color, top_n=25):
        """Count n-grams over ``reviews`` and return a horizontal bar trace of the top ones."""
        # A fresh counter per call: counts never carry over between classes or sizes.
        freq_dict = defaultdict(int)
        for sent in reviews:
            for word in generate_ngrams(sent, n_gram):
                freq_dict[word] += 1
        # Naming the columns at construction also works when no n-gram was found.
        fd_sorted = pd.DataFrame(
            sorted(freq_dict.items(), key=lambda item: item[1])[::-1],
            columns=["word", "wordcount"],
        )
        top = fd_sorted.head(top_n)
        # Reversed so the most frequent n-gram ends up at the top of the chart.
        return go.Bar(
            y=top["word"].values[::-1],
            x=top["wordcount"].values[::-1],
            showlegend=False,
            orientation='h',
            marker=dict(color=color),
        )

    def ngram_figure_html(n_gram, colors, subplot_titles, layout, horizontal_spacing=None):
        """Build the three-row figure (positive, neutral, negative) and return it as HTML."""
        fig = make_subplots(rows=3, cols=1, vertical_spacing=0.04,
                            horizontal_spacing=horizontal_spacing,
                            subplot_titles=subplot_titles)
        class_reviews = (review_pos["reviews"], review_neu["reviews"], review_neg["reviews"])
        for row, (reviews, color) in enumerate(zip(class_reviews, colors), start=1):
            fig.add_trace(top_ngrams_trace(reviews, n_gram, color), row=row, col=1)
        fig['layout'].update(**layout)
        return fig.to_html(full_html=True)

    print("monogram started")
    print(review_pos.head())
    monogram = ngram_figure_html(
        1, ('green', 'grey', 'red'),
        ["Frequent words of positive reviews", "Frequent words of neutral reviews",
         "Frequent words of negative reviews"],
        dict(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots"),
    )
    print("monogram ended")

    print("bigram started")
    bigram = ngram_figure_html(
        2, ('green', 'grey', 'brown'),
        ["Bigram plots of Positive reviews",
         "Bigram plots of Neutral reviews",
         "Bigram plots of Negative reviews"],
        dict(height=1000, width=800, paper_bgcolor='rgb(233,233,233)', title="Bigram Plots"),
        horizontal_spacing=0.25,
    )
    print("bigram ended")

    print("trigram started")
    trigram = ngram_figure_html(
        3, ('green', 'grey', 'red'),
        ["Tri-gram plots of Positive reviews",
         "Tri-gram plots of Neutral reviews",
         "Tri-gram plots of Negative reviews"],
        dict(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Trigram Count Plots"),
        horizontal_spacing=0.05,
    )
    print("trigram ended")

    # '%' binds tighter than '+': assemble the full body before filling the template.
    html_content = HTML_TEMPLATE % (monogram + bigram + trigram)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage':'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', ['logdir', 'mlpipeline_ui_metadata'])

    joblib.dump(review_pos, os.path.join(log_folder, 'review_pos.pkl'))
    joblib.dump(review_neu, os.path.join(log_folder, 'review_neu.pkl'))
    joblib.dump(review_neg, os.path.join(log_folder, 'review_neg.pkl'))

    return visualization_output(log_folder,json.dumps(metadata))

In [16]:
def word_cloud_positive_reviews(log_folder:str,HTML_TEMPLATE:str,IMAGE_HTML_TEMPLATE:str)-> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render a word cloud of the positive reviews as an inline web-app visualization.

    Loads ``review_pos.pkl`` from ``log_folder``, builds a word cloud from its
    ``reviews`` column and embeds the resulting PNG (base64) in the UI metadata.

    Args:
        log_folder: Directory that contains ``review_pos.pkl``.
        HTML_TEMPLATE: Page template filled with the ``%`` operator (one ``%s`` slot).
        IMAGE_HTML_TEMPLATE: ``str.format`` template receiving (title, encoded image).

    Returns:
        A one-field namedtuple; ``mlpipeline_ui_metadata`` is the JSON-encoded metadata.
    """
    # Imports stay inside the body: the pipeline wraps this function with
    # func_to_container_op, so it has to be usable on its own.
    import os
    os.system('pip install matplotlib')
    os.system('pip install wordcloud')
    import base64
    import json
    import urllib.parse  # a bare `import urllib` does not guarantee the `parse` submodule is loaded
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,STOPWORDS

    # Passing the path lets joblib open and close the file itself.
    review_pos = joblib.load(log_folder + '/review_pos.pkl')

    # Join every review into one document. str(Series) would only yield pandas'
    # abbreviated repr (index labels, "...", dtype footer) rather than the text.
    text = ' '.join(review_pos["reviews"].astype(str))
    wordcloud = WordCloud(
        width = 300,
        height = 200,
        background_color = 'black',
        stopwords = STOPWORDS).generate(text)

    fig = plt.figure(
        figsize = (20, 20),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    # Axis and layout tweaks must happen before savefig to show up in the PNG.
    plt.axis('off')
    plt.tight_layout(pad=0)

    tmpfile = BytesIO()
    plt.savefig(tmpfile, format='png')
    plt.close(fig)  # release the figure once it has been rendered
    string = base64.b64encode(tmpfile.getvalue())

    positive = IMAGE_HTML_TEMPLATE.format("Positive Word Cloud",urllib.parse.quote(string))
    html_content = HTML_TEMPLATE % positive

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])

    return visualization_output(json.dumps(metadata))
    

In [17]:
def word_cloud_neutral_reviews(log_folder:str,HTML_TEMPLATE:str,IMAGE_HTML_TEMPLATE:str)-> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render a word cloud of the neutral reviews as an inline web-app visualization.

    Loads ``review_neu.pkl`` from ``log_folder``, builds a word cloud from its
    ``reviews`` column and embeds the resulting PNG (base64) in the UI metadata.

    Args:
        log_folder: Directory that contains ``review_neu.pkl``.
        HTML_TEMPLATE: Page template filled with the ``%`` operator (one ``%s`` slot).
        IMAGE_HTML_TEMPLATE: ``str.format`` template receiving (title, encoded image).

    Returns:
        A one-field namedtuple; ``mlpipeline_ui_metadata`` is the JSON-encoded metadata.
    """
    # Imports stay inside the body: the pipeline wraps this function with
    # func_to_container_op, so it has to be usable on its own.
    import os
    os.system('pip install matplotlib')
    os.system('pip install wordcloud')
    import base64
    import json
    import urllib.parse  # a bare `import urllib` does not guarantee the `parse` submodule is loaded
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,STOPWORDS

    # Passing the path lets joblib open and close the file itself.
    review_neu = joblib.load(log_folder + '/review_neu.pkl')

    # Join every review into one document. str(Series) would only yield pandas'
    # abbreviated repr (index labels, "...", dtype footer) rather than the text.
    text = ' '.join(review_neu["reviews"].astype(str))
    wordcloud = WordCloud(
        width = 300,
        height = 200,
        background_color = 'black',
        stopwords = STOPWORDS).generate(text)

    fig = plt.figure(
        figsize = (20, 20),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    # Axis and layout tweaks must happen before savefig to show up in the PNG.
    plt.axis('off')
    plt.tight_layout(pad=0)

    tmpfile = BytesIO()
    plt.savefig(tmpfile, format='png')
    plt.close(fig)  # release the figure once it has been rendered
    string = base64.b64encode(tmpfile.getvalue())

    neutral = IMAGE_HTML_TEMPLATE.format("Neutal word Cloud ",urllib.parse.quote(string))
    html_content = HTML_TEMPLATE % neutral

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])

    return visualization_output(json.dumps(metadata))

In [18]:
def word_cloud_negative_reviews(log_folder:str,HTML_TEMPLATE:str,IMAGE_HTML_TEMPLATE:str)-> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Render a word cloud of the negative reviews as an inline web-app visualization.

    Loads ``review_neg.pkl`` and ``stop_words.pkl`` from ``log_folder``, builds a
    word cloud from the ``reviews`` column (excluding the loaded stop words) and
    embeds the resulting PNG (base64) in the UI metadata.

    Args:
        log_folder: Directory that contains ``review_neg.pkl`` and ``stop_words.pkl``.
        HTML_TEMPLATE: Page template filled with the ``%`` operator (one ``%s`` slot).
        IMAGE_HTML_TEMPLATE: ``str.format`` template receiving (title, encoded image).

    Returns:
        A one-field namedtuple; ``mlpipeline_ui_metadata`` is the JSON-encoded metadata.
    """
    # Imports stay inside the body: the pipeline wraps this function with
    # func_to_container_op, so it has to be usable on its own.
    import os
    os.system('pip install matplotlib')
    os.system('pip install wordcloud')
    import base64
    import json
    import urllib.parse  # a bare `import urllib` does not guarantee the `parse` submodule is loaded
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # Passing the path lets joblib open and close the file itself.
    review_neg = joblib.load(log_folder + '/review_neg.pkl')
    stop_words = joblib.load(log_folder + '/stop_words.pkl')

    # Join every review into one document. str(Series) would only yield pandas'
    # abbreviated repr (index labels, "...", dtype footer) rather than the text.
    text = ' '.join(review_neg["reviews"].astype(str))
    wordcloud = WordCloud(
        width = 300,
        height = 200,
        background_color = 'black',
        stopwords = stop_words).generate(text)

    # The cloud has to be drawn onto a figure before saving; without this the
    # PNG written below would be an empty default figure.
    fig = plt.figure(
        figsize = (20, 20),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    # Axis and layout tweaks must happen before savefig to show up in the PNG.
    plt.axis('off')
    plt.tight_layout(pad=0)

    tmpfile = BytesIO()
    plt.savefig(tmpfile, format='png')
    plt.close(fig)  # release the figure once it has been rendered
    string = base64.b64encode(tmpfile.getvalue())

    negative = IMAGE_HTML_TEMPLATE.format("Negative word cloud ",urllib.parse.quote(string))
    html_content = HTML_TEMPLATE % negative

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])

    return visualization_output(json.dumps(metadata))

In [19]:
def target_encoding(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Label-encode the ``sentiment`` column and write the frame back to disk.

    Loads ``process_reviews.pkl`` from ``log_folder``, replaces its ``sentiment``
    column with the integer codes produced by ``LabelEncoder`` and overwrites
    the same pickle.

    Args:
        log_folder: Directory that contains ``process_reviews.pkl``.

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import joblib
    from sklearn.preprocessing import LabelEncoder

    reviews_path = log_folder + '/process_reviews.pkl'
    # Passing the path lets joblib open and close the file itself.
    process_reviews = joblib.load(reviews_path)

    # Encode labels in column 'sentiment'.
    label_encoder = LabelEncoder()
    process_reviews['sentiment'] = label_encoder.fit_transform(process_reviews['sentiment'])

    print(process_reviews['sentiment'].unique())
    print(process_reviews['sentiment'].value_counts())

    joblib.dump(process_reviews, reviews_path)

    return ([log_folder])

In [20]:
def stemming(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Stem the review texts and persist them as ``review_features.pkl``.

    Loads ``process_reviews.pkl`` and ``stop_words.pkl`` from ``log_folder``.
    Each review is reduced to alphabetic tokens, stop words are dropped, the
    remaining tokens are Porter-stemmed and re-joined with single spaces.

    The stemmed text is stored in the ``reviews`` column of the saved feature
    frame. Previously the stemmed corpus was built and then thrown away, so the
    raw reviews were persisted and this step had no effect.

    Args:
        log_folder: Directory that contains the input pickles and receives
            ``review_features.pkl`` (``process_reviews.pkl`` is re-written unchanged).

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import re

    import joblib
    from nltk.stem.porter import PorterStemmer

    # Passing the path lets joblib open and close the file itself.
    process_reviews = joblib.load(log_folder + '/process_reviews.pkl')
    stop_words = joblib.load(log_folder + '/stop_words.pkl')

    # Extracting 'reviews' for processing
    review_features = process_reviews[['reviews']].reset_index(drop=True).copy()
    print(review_features.head())

    ps = PorterStemmer()
    # Set membership keeps the per-token stop-word check O(1).
    stop_set = set(stop_words)

    # splitting and adding the stemmed words except stopwords
    corpus = []
    for raw_text in review_features['reviews']:
        tokens = re.sub('[^a-zA-Z]', ' ', raw_text).split()
        corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_set))

    # Hand the stemmed text to the downstream steps that read review_features.pkl.
    review_features['reviews'] = corpus

    joblib.dump(process_reviews, log_folder + '/process_reviews.pkl')
    joblib.dump(review_features, log_folder + '/review_features.pkl')
    return ([log_folder])

In [21]:
def tfidf(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Vectorise the reviews with bigram TF-IDF and save features and target.

    Loads ``review_features.pkl`` and ``process_reviews.pkl`` from ``log_folder``,
    fits a ``TfidfVectorizer`` (bigrams only, at most 5000 features) on the
    ``reviews`` column and writes the feature matrix to ``X.pkl`` and the
    ``sentiment`` column to ``y.pkl``.

    Args:
        log_folder: Directory used for both the input and the output pickles.

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import joblib
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Passing the path lets joblib open and close the file itself.
    review_features = joblib.load(log_folder + '/review_features.pkl')
    process_reviews = joblib.load(log_folder + '/process_reviews.pkl')

    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(2, 2))
    # TF-IDF feature matrix
    X = tfidf_vectorizer.fit_transform(review_features['reviews'])
    print(X.shape)
    # Getting the target variable(encoded)
    y = process_reviews['sentiment']

    joblib.dump(X, log_folder + '/X.pkl')
    joblib.dump(y, log_folder + '/y.pkl')
    return ([log_folder])

In [22]:
def handling_imbalance_data(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Oversample the feature/target pair with SMOTE.

    Loads ``X.pkl`` and ``y.pkl`` from ``log_folder``, resamples them with
    ``SMOTE(random_state=42)`` and writes ``X_res.pkl`` and ``y_res.pkl``.
    Class counts before and after resampling are printed.

    Args:
        log_folder: Directory used for both the input and the output pickles.

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import os
    from collections import Counter

    import joblib
    # Installed at run time, before the import below needs it.
    os.system('pip install imblearn')
    from imblearn.over_sampling import SMOTE

    # Passing the path lets joblib open and close the file itself.
    X = joblib.load(log_folder + '/X.pkl')
    y = joblib.load(log_folder + '/y.pkl')
    print(f'Original dataset shape : {Counter(y)}')

    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)

    print(f'Resampled dataset shape {Counter(y_res)}')

    joblib.dump(X_res, log_folder + '/X_res.pkl')
    joblib.dump(y_res, log_folder + '/y_res.pkl')

    return ([log_folder])
    

In [23]:
def data_spliting(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Split the resampled data into train and test sets (75/25, ``random_state=0``).

    Loads ``X_res.pkl`` and ``y_res.pkl`` from ``log_folder`` and writes
    ``X_train.pkl``, ``X_test.pkl``, ``y_train.pkl`` and ``y_test.pkl``.

    Args:
        log_folder: Directory used for both the input and the output pickles.

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import joblib
    from sklearn.model_selection import train_test_split

    # Passing the path lets joblib open and close the file itself.
    X_res = joblib.load(log_folder + '/X_res.pkl')
    y_res = joblib.load(log_folder + '/y_res.pkl')

    ## Divide the dataset into Train and Test
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25, random_state=0)

    splits = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    for stem, part in splits:
        joblib.dump(part, log_folder + '/' + stem + '.pkl')

    return ([log_folder])
    

In [24]:
def model_selection(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Compare six classifiers with 10-fold cross-validated accuracy.

    Loads ``X_res.pkl`` and ``y_res.pkl`` from ``log_folder``, prints the mean
    cross-validation accuracy of each candidate and pickles every estimator
    object as ``<name>_cv.pkl``.

    Args:
        log_folder: Directory used for both the input and the output pickles.

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import joblib
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_val_score

    # Passing the path lets joblib open and close the file itself.
    X = joblib.load(log_folder + '/X_res.pkl')
    y = joblib.load(log_folder + '/y_res.pkl')

    # One row per candidate keeps the display label, output file stem and
    # estimator together instead of relying on matching list/dict indices.
    candidates = [
        ('Logistic Regression', 'logreg_cv', LogisticRegression(random_state=0)),
        ('Decision Tree', 'dt_cv', DecisionTreeClassifier()),
        ('KNN', 'knn_cv', KNeighborsClassifier()),
        ('SVC', 'svc_cv', SVC()),
        ('Naive Bayes', 'nb_cv', BernoulliNB()),
        ('Random Forest', 'rf_cv', RandomForestClassifier()),
    ]

    for label, _, model in candidates:
        print("{} Test Accuracy: {}".format(label,cross_val_score(model, X, y, cv=10, scoring ='accuracy').mean()))

    # NOTE(review): cross_val_score is documented to fit clones of the estimator,
    # so the objects pickled here are presumably still unfitted - confirm that
    # nothing downstream expects trained models in these files.
    for _, stem, model in candidates:
        joblib.dump(model, log_folder + '/' + stem + '.pkl')

    return ([log_folder])

In [25]:
def hyperparameter_tuning(log_folder:str) -> NamedTuple('Outputs', [('logdir',str)]):
    """Grid-search a logistic regression and save its test-set predictions.

    Loads the train/test pickles from ``log_folder``, searches ``C`` (50 values
    on a log scale from 1e-4 to 1e4) and ``penalty`` with 5-fold CV, then
    predicts the test set with the best estimator found and writes the
    predictions to ``y_pred.pkl``.

    The predictions come from the estimator selected by the search; previously
    the search result was only printed and a separately fitted model with a
    hard-coded ``C=10000.0`` produced ``y_pred``.

    Args:
        log_folder: Directory used for both the input and the output pickles.

    Returns:
        A one-element list holding ``log_folder`` (the ``logdir`` output).
    """
    import joblib
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    # Passing the path lets joblib open and close the file itself.
    X_train = joblib.load(log_folder + '/X_train.pkl')
    X_test = joblib.load(log_folder + '/X_test.pkl')
    y_train = joblib.load(log_folder + '/y_train.pkl')
    y_test = joblib.load(log_folder + '/y_test.pkl')

    # NOTE(review): 'l1' only works with solvers that support it; with the
    # default solver of the installed scikit-learn those candidates may simply
    # fail to fit and be scored NaN - confirm, and drop 'l1' if so.
    param_grid = {'C': np.logspace(-4, 4, 50), 'penalty':['l1', 'l2']}
    clf = GridSearchCV(LogisticRegression(random_state=0), param_grid,cv=5, verbose=0,n_jobs=-1)
    best_model = clf.fit(X_train,y_train)
    print(best_model.best_estimator_)
    print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

    # With the default refit=True the search has already refitted the winning
    # parameter combination on the whole training set.
    logreg = best_model.best_estimator_
    y_pred = logreg.predict(X_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

    joblib.dump(y_pred, log_folder + '/y_pred.pkl')

    return ([log_folder])

In [26]:
def plot_metrics(log_folder:str,HTML_TEMPLATE:str,IMAGE_HTML_TEMPLATE:str,TEXT_HTML_TEMPLATE:str)-> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Publish the classification report and confusion matrix as a web-app visualization.

    Loads ``y_test.pkl`` and ``y_pred.pkl`` from ``log_folder``, renders the
    confusion matrix as a PNG and places it, together with the textual
    classification report, inside ``HTML_TEMPLATE``.

    Args:
        log_folder: Directory that contains ``y_test.pkl`` and ``y_pred.pkl``.
        HTML_TEMPLATE: Page template filled with the ``%`` operator (one ``%s`` slot).
        IMAGE_HTML_TEMPLATE: ``str.format`` template receiving (title, base64 image).
        TEXT_HTML_TEMPLATE: ``str.format`` template receiving the report text.

    Returns:
        A one-field namedtuple; ``mlpipeline_ui_metadata`` is the JSON-encoded metadata.
    """
    # Imports stay inside the body: the pipeline wraps this function with
    # func_to_container_op, so it has to be usable on its own.
    import os
    os.system('pip install matplotlib')
    import base64
    import json
    from collections import namedtuple
    from io import BytesIO

    import joblib
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report

    # Passing the path lets joblib open and close the file itself.
    y_test = joblib.load(log_folder + '/y_test.pkl')
    y_pred = joblib.load(log_folder + '/y_pred.pkl')

    classes=['Negative','Neutral','Positive']
    normalize=False
    title='Confusion matrix'
    cmap=plt.cm.Blues

    cm = metrics.confusion_matrix(y_test, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    fig.colorbar(heatmap, ax=ax)

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Annotate each cell; light text on dark cells, dark text on light ones.
    thresh = cm.max() / 2.
    for i in range (cm.shape[0]):
        for j in range (cm.shape[1]):
            ax.text(j, i, cm[i, j],
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")

    # Layout is computed last so it accounts for the ticks and labels above.
    fig.tight_layout()

    report_text = classification_report(y_test, y_pred)
    print("Classification Report:\n",report_text)
    report  = TEXT_HTML_TEMPLATE.format(report_text)

    tmpfile = BytesIO()
    fig.savefig(tmpfile, format='png')
    plt.close(fig)  # release the figure once it has been rendered
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')

    # Parenthesised on purpose: `%` binds tighter than `+`, so without the
    # parentheses the image markup would be appended after the closing </html>.
    html_content = HTML_TEMPLATE % (report + IMAGE_HTML_TEMPLATE.format(title,encoded))

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])

    return visualization_output(json.dumps(metadata))

In [27]:
# from sklearn import metrics
# import matplotlib.pyplot as plt
# import numpy as np
# cm = metrics.confusion_matrix([1,1,1,0,0,0], [1,0,1,0,1,0])
# classes=['Negative','Neutral','Positive']
# cmap=plt.cm.Blues
# ax = plt.imshow(cm, interpolation='nearest', cmap=cmap)
# ax.

In [28]:
def one_vs_all_and_roc_auc_curve(log_folder:str,HTML_TEMPLATE:str,IMAGE_HTML_TEMPLATE:str)-> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Fit a one-vs-rest linear SVC and publish its ROC curves as a web-app visualization.

    Loads ``X_res.pkl`` and ``y_res.pkl`` from ``log_folder``, binarizes the
    target over classes ``[0, 1, 2]``, trains on an 80/20 split and plots the
    per-class, micro-average and macro-average ROC curves with their AUC.

    Args:
        log_folder: Directory that contains ``X_res.pkl`` and ``y_res.pkl``.
        HTML_TEMPLATE: Page template filled with the ``%`` operator (one ``%s`` slot).
        IMAGE_HTML_TEMPLATE: ``str.format`` template receiving (title, base64 image).

    Returns:
        A one-field namedtuple; ``mlpipeline_ui_metadata`` is the JSON-encoded metadata.
    """
    # Imports stay inside the body: the pipeline wraps this function with
    # func_to_container_op, so it has to be usable on its own.
    import os
    os.system('pip install matplotlib')
    import base64
    import json
    from collections import namedtuple
    from io import BytesIO
    from itertools import cycle

    import joblib
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc
    from sklearn.model_selection import train_test_split
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import label_binarize
    from sklearn.svm import SVC

    # Passing the path lets joblib open and close the file itself.
    X = joblib.load(log_folder + '/X_res.pkl')
    y = joblib.load(log_folder + '/y_res.pkl')

    #Binarizing the target feature
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    #Train-Test split(80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                        random_state=0)

    #OneVsRestClassifier
    classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True,
                                     random_state=10))
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)

    #Computing TPR and FPR
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # interpolate all ROC curves at these points; np.interp replaces the
    # deprecated scipy.interp alias, which newer SciPy releases no longer ship
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    fig,ax = plt.subplots(1, figsize=(8, 6))
    ax.plot(fpr["micro"], tpr["micro"],
            label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
            color='deeppink', linestyle=':', linewidth=4)

    ax.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
            color='navy', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        ax.plot(fpr[i], tpr[i], color=color, lw=4,
                label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'k--', lw=4)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic to multi-class')
    ax.legend(loc="lower right")

    tmpfile = BytesIO()
    fig.savefig(tmpfile, format='png')
    plt.close(fig)  # release the figure once it has been rendered
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')
    html_content = HTML_TEMPLATE % IMAGE_HTML_TEMPLATE.format('Receiver operating characteristic to multi-class',encoded)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content
        }]
    }

    visualization_output = namedtuple('VisualizationOutput', [ 'mlpipeline_ui_metadata'])

    return visualization_output(json.dumps(metadata))

In [29]:
@dsl.pipeline(
    name='Amazon Products Review',
    description='Analysis the sentiments of reviews provided by customer to amazon products.'
)
def amazon_reviews(data_path):
    """Wire the review-analysis components into one Kubeflow pipeline.

    Every step is built from its Python function with ``func_to_container_op``
    on the same base image and mounts the volume created by the ``VolumeOp`` at
    ``log_folder``. Steps are chained through their ``logdir`` output.

    Args:
        data_path: Location of the reviews CSV, forwarded to ``read_data``.
    """
    log_folder = '/kfp-private'
    pvc_name = "amazon-review-4000"
    HTML_TEMPLATE = '''
    <html><head>
        <style>
            table {
                border: none;
                border-collapse: collapse;
                border-spacing: 0;
                font-size: 14px;
            }
            td,
            th {
                text-align: right;
                vertical-align: middle;
                padding: 0.5em 0.5em;
                line-height: 1.0;
                white-space: nowrap;
                max-width: 100px;
                text-overflow: ellipsis;
                overflow: hidden;
                border: none;
            }
            th {
                font-weight: bold;
            }
            tbody tr:nth-child(odd) {
                background: rgb(245, 245, 245);
            }
        </style>
    </head>
    <body><div> </div>
    <div>
    %s
    </div></body>
    </html>
    '''
    TEXT_HTML_TEMPLATE = '''
            <div style="margin:10px 0;">
            <pre>
            {}
            </pre>
            </div>
            '''
    IMAGE_HTML_TEMPLATE = '''
            <div>
              <p>{}</p>
              <img src="data:image/png;base64, {}" />
            </div>
            '''
    image = "m10913018/nltk_env:2.3.0"

    vop = dsl.VolumeOp(
        name=pvc_name,
        resource_name="amazon-review-4000",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO
    )

    def run_step(func, *step_args):
        """Build the component for ``func``, call it and mount the shared volume."""
        component = func_to_container_op(
            func = func,
            base_image = image,
        )
        return component(*step_args).add_pvolumes({ log_folder:vop.volume, })

    # --- Load and prepare the data (strictly sequential) ---
    upstream = run_step(read_data, data_path, HTML_TEMPLATE, TEXT_HTML_TEMPLATE, log_folder)
    preparation_steps = (
        data_preprocessing,
        feature_creation,
        time_related_feature_creation,
        feature_extraction,
        data_cleaning,
    )
    for step in preparation_steps:
        upstream = run_step(step, upstream.outputs['logdir'], HTML_TEMPLATE, TEXT_HTML_TEMPLATE)
    cleaned_dir = upstream.outputs['logdir']

    # --- Visualizations that branch off the cleaned data ---
    cleaned_data_plots = (
        sentiment_categories_visualize_data,
        sentiment_count_visualize_data,
        reviews_count_visualize_data,
    )
    for step in cleaned_data_plots:
        run_step(step, cleaned_dir, IMAGE_HTML_TEMPLATE, HTML_TEMPLATE)

    # --- Derived features, polarity plot and n-gram analysis ---
    features_dir = run_step(new_features, cleaned_dir).outputs['logdir']
    run_step(polarity_visualization, features_dir, HTML_TEMPLATE, IMAGE_HTML_TEMPLATE)
    ngram_dir = run_step(ngram_analysis, features_dir, HTML_TEMPLATE).outputs['logdir']

    # --- Word clouds, one per sentiment class, all fed by the n-gram step ---
    word_cloud_steps = (
        word_cloud_positive_reviews,
        word_cloud_neutral_reviews,
        word_cloud_negative_reviews,
    )
    for step in word_cloud_steps:
        run_step(step, ngram_dir, HTML_TEMPLATE, IMAGE_HTML_TEMPLATE)

    # --- Modelling chain (strictly sequential) ---
    upstream = run_step(target_encoding, ngram_dir)
    modelling_steps = (
        stemming,
        tfidf,
        handling_imbalance_data,
        data_spliting,
        model_selection,
        hyperparameter_tuning,
    )
    for step in modelling_steps:
        upstream = run_step(step, upstream.outputs['logdir'])
    tuned_dir = upstream.outputs['logdir']

    # --- Evaluation visualizations ---
    run_step(plot_metrics, tuned_dir, HTML_TEMPLATE, IMAGE_HTML_TEMPLATE, TEXT_HTML_TEMPLATE)
    run_step(one_vs_all_and_roc_auc_curve, tuned_dir, HTML_TEMPLATE, IMAGE_HTML_TEMPLATE)
    
        

In [30]:
# import seaborn as sns
# ax = sns.violinplot()
# ax.set_xlabel("abcd")
# ax.set_title("dfdf")

In [31]:
#kfp.compiler.Compiler().compile(movie_reviews, 'cornell-1000.zip')
from datetime import datetime
# Launch configuration consumed by the compile/submit cell below.

# Experiment the run is filed under (passed to client.create_experiment).
EXPERIMENT_NAME = "amazon-products-review"
# Address handed to kfp.Client when the run is submitted.
kfp_endpoint =  "http://ml-pipeline.kubeflow.svc.cluster.local:8888" 
# Pipeline parameters; 'data_path' matches the amazon_reviews(data_path) argument.
arguments ={'data_path':'https://raw.githubusercontent.com/katonic-dev/Examples/master/data/Musical_instruments_reviews.csv'}
# data_path = '/kfp-project/Musical_instruments_reviews.csv'
# Pipeline function to compile, and the file name of the compiled package
# (derived from the function name).
pipeline_func = amazon_reviews
pipeline_filename = pipeline_func.__name__ + '.pipeline.yaml'


# pipeline_file_path = '/home/katonic/kfp-project/'

In [32]:
# Compile the pipeline function into a package file on disk.
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

# Connect to the Pipelines API and create the experiment for this run.
client = kfp.Client(kfp_endpoint)
experiment = client.create_experiment(EXPERIMENT_NAME)

# Run name: pipeline function name immediately followed by the launch
# timestamp (day-month-year-hour-minute-second).
run_name = f"{pipeline_func.__name__}{datetime.now().strftime('%d-%m-%Y-%H-%M-%S')}"

# Submit the compiled package with the configured arguments.
run_result = client.run_pipeline(
    experiment.id,
    run_name,
    pipeline_filename,
    arguments,
)

In [33]:
# import os
# import ipykernel
# os.path.basename(ipykernel.get_connection_file())