# TrikAI - The CookAI monster !

In [1]:
# modules for storing and plotting the information
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from wordcloud import WordCloud, STOPWORDS


# modules for web app
from flask import Flask, render_template,flash, redirect
from flask import request
from flask import url_for
from forms import Video_playlistForm, Consult
import requests

# modules for dAIogenes (web-scraping YouTube videos)
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import Playlist
import os 
from time import sleep
from random import randint


# modules for TrikAI (to synthesize texts)

# For clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# For topic extraction
import ktrain

# For level detection and doc_to_vect function
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity



  from IPython.utils import traitlets as _traitlets


In [2]:
# definition of variables
# Filesystem locations used by the helpers and route handlers in this notebook.
# All of them are relative paths.

# Folder where gen_wordcloud() writes one PNG per video.
path_Wordcloud = "./static/Wordcloud/"
# Prefix prepended to the dataset file name (youtube_transcrip_vector.csv).
path = "./"
# Folder where the library page saves its histogram images.
path_figures = "./static/Figures/"
# Folder holding the pickled models and lookup tables (tfidf, cluster, topics, ...).
path_models = "./static/Models/"
# Staging folder: daiogenes.csv and the temporary wordcloud images.
path_temp = "./static/Temp/"



# Functions

#####################################################################################################################################
# WEBAPP

# Provide head for webapp pages
def head(title):
    """Return the HTML banner placed at the top of the web-app pages.

    The banner is a two-cell table: the logo (linking back to the index
    page) and ``title`` rendered as an ``<h1>``.

    Args:
        title: Heading text; converted with ``str()`` and inserted as-is
            (no HTML escaping is applied).

    Returns:
        str: HTML fragment. The outer ``<center>`` is left open, exactly as
        before, so whatever a caller appends still lands inside it.
    """
    home_url = url_for('index')
    logo_url = url_for('static', filename="Arts/CookAI_monster_mini.png")

    # The previous markup emitted href="URL"" (one quote too many), left the
    # img src unquoted and never closed the <a> element.
    return (
        '<center><table class="default">\n'
        '    <td><center><a href="' + home_url + '"><img src="' + logo_url + '"></a></center></td>\n'
        '    <td><center><h1>' + str(title) + '</h1></center></td>\n'
        '    </table>'
    )

#####################################################################################################################################
# DAIOGENES

# Provide transcript from a YouTube link
def get_transcript (link):
    """Download the transcript of a YouTube video as one string.

    Args:
        link: Video URL. The id is read from the ``v`` query parameter
            (``https://www.youtube.com/watch?v=ID``, with or without extra
            parameters such as ``&list=...``); ``youtu.be/ID`` short links
            are accepted as well.

    Returns:
        str: The ``text`` of every transcript segment, each followed by a
        single space (so the result ends with a trailing space, as before).

    Raises:
        ValueError: If no video id can be found in ``link``.
        Exception: Whatever ``YouTubeTranscriptApi.get_transcript`` raises.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(str(link))
    ids = parse_qs(parsed.query).get("v", [])
    if ids:
        video_id = ids[0]
    elif parsed.netloc.endswith("youtu.be") and parsed.path.strip("/"):
        video_id = parsed.path.strip("/")
    else:
        raise ValueError("Cannot find a YouTube video id in: " + str(link))

    segments = YouTubeTranscriptApi.get_transcript(video_id)
    return "".join(str(segment['text']) + " " for segment in segments)

# Provide name of the playlist and list of urls from the videos that belongs to the playlist
def get_playlist (link):
    """Resolve a YouTube playlist link.

    Args:
        link: URL of the playlist.

    Returns:
        tuple: ``(name, playlist)`` where ``name`` is the playlist title and
        ``playlist`` is the ``pytube.Playlist`` object. When the playlist
        cannot be built or its title cannot be read, ``("Error", [])`` is
        returned instead of raising.
    """
    try:
        playlist = Playlist(link)
        name = playlist.title
    except Exception:
        # Best-effort by design, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        playlist = []
        name = "Error"
    return name, playlist

# Gives the length of a document as reported by len()
def document_size(doc):
    """Return ``len(doc)``.

    Note that for a string this is the number of characters, not the number
    of words.
    """
    size = len(doc)
    return size

# pulls one field out of the raw HTML/JSON of a YouTube watch page
def _extract_page_field(page_source, start_marker, end_marker, strip_quotes=False):
    """Return the text between ``start_marker`` and ``end_marker``.

    Only the 200 characters starting at the first ``start_marker`` are
    inspected. Returns the string ``"None"`` when ``start_marker`` is absent.
    """
    start = page_source.find(start_marker)
    if start == -1:
        return "None"
    window = page_source[start:start + 200]
    # window begins with start_marker, so element [1] always exists.
    value = window.split(start_marker)[1].split(end_marker)[0]
    if strip_quotes:
        value = value.replace("\"", "")
    return value


# provides metadata from a YouTube video
def import_video_data(URL):
    """Scrape title, rating, view count and author from a video page.

    Args:
        URL: Address of the YouTube watch page.

    Returns:
        tuple: ``(URL, title, rating, views, author)``. Every scraped value
        is a string; a field that cannot be located is the string ``"None"``.

    Raises:
        requests.RequestException: Network errors propagate to the caller.
            A 30 second timeout is now set so a stalled connection raises
            instead of blocking forever.

    Note:
        Sleeps a random 2-7 seconds before returning, as the original did.
        The rating lookup previously used a 201-character window while the
        other fields used 200; all fields now share the same 200-character
        window.
    """
    page_source = requests.get(URL, timeout=30).text

    title = _extract_page_field(page_source, "<meta name=\"title\" content=\"", "\">")
    rating = _extract_page_field(page_source, "\"averageRating\":", ",\"allowRatings\"")
    views = _extract_page_field(page_source, "\"viewCount\":", ",\"author\"", strip_quotes=True)
    author = _extract_page_field(page_source, "\"author\":", ",\"isPrivate\"", strip_quotes=True)

    sleep(randint(2,7))
    return URL, title, rating, views, author


#####################################################################################################################################
# STORING AND PLOTTING

# creation of png file with wordcloud
def gen_wordcloud(transcription,index):
    """Render ``transcription`` as a word cloud and save it as
    ``<path_Wordcloud><index>.png``.

    Args:
        transcription: Text the cloud is generated from.
        index: Value used (via ``str()``) as the file name.
    """
    target = path_Wordcloud + str(index) + ".png"
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color='white',
        max_words=100,
        width=853,
        height=300,
    )
    cloud = cloud.generate(transcription)
    cloud.to_file(target)
    
# creation of png file with wordcloud for the temp folder
def gen_wordcloud_temp(transcription,index):
    """Render ``transcription`` as a word cloud inside the temp folder.

    Args:
        transcription: Text the cloud is generated from.
        index: Value used (via ``str()``) as the file name.

    Returns:
        str: ``"Temp/<index>.png"``, the name callers pass to
        ``url_for('static', filename=...)``.
    """
    file_name = str(index) + ".png"
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color='white',
        max_words=100,
        width=560,
        height=150,
    )
    cloud = cloud.generate(transcription)
    cloud.to_file(path_temp + file_name)
    return "Temp/" + file_name


# visualize table
def render_table(df, title=""):
    """Render a list of videos as an HTML table.

    Args:
        df: DataFrame with at least the columns ``author``, ``title``,
            ``rating`` and ``level``. The index label of each row is used as
            the ``video_id`` of its link to the params page.
        title: Optional caption shown above the table.

    Returns:
        str: HTML fragment with one row per video; the author links to the
        query page and the title links to the params page.

    Note:
        Cell values are inserted without HTML escaping.
    """
    pieces = [
        "<center><h3>" + str(title) + "</h3></center>",
        "<center><table class= \"default\">\n"
        "    <tr>\n"
        "    <th><center>Author</center></th>\n"
        "    <th><center>Link</center></th>\n"
        "    <th><center>Rating</center></th>\n"
        "    <th><center>Level</center></th>\n"
        # The header row used to be left open.
        "    </tr>",
    ]

    for i, row in df.iterrows():
        author_url = url_for('query', column ="author", value = row["author"])
        video_url = url_for('params', video_id= i)
        # str() keeps a missing (NaN) author/title from raising TypeError, and
        # every row is now closed with </tr>.
        pieces.append(
            "<tr><td><a href=\"" + author_url + "\">" + str(row["author"]) + "</a></td>"
            "<td><a href=\"" + video_url + "\">" + str(row["title"]) + "</a></td>"
            "<td>" + str(row["rating"]) + "</td>"
            "<td>" + str(row["level"]) + "</td></tr>"
        )

    pieces.append("</table></center>")
    return "".join(pieces)

# render in html a generic table  ---> (author's note: probably not used anywhere)
def render_table_generic(df):
    """Render any DataFrame as a plain HTML table.

    Args:
        df: DataFrame to render; the index is not shown.

    Returns:
        str: ``<center><table>`` fragment with one header row (column names)
        and one row per record. Every value goes through ``str()`` and is
        inserted without HTML escaping.
    """
    columns = df.columns
    pieces = ["<center><table class= \"default\"><tr>"]
    for c in columns:
        # str() so that non-string column labels do not raise TypeError.
        pieces.append("<th>" + str(c) + "</th>")
    pieces.append("</tr>")  # the header row used to be left open

    for _, row in df.iterrows():
        pieces.append("<tr>")
        for c in columns:
            pieces.append("<td>" + str(row[c]) + "</td>")
        pieces.append("</tr>")  # was "<tr>", which opened a second row instead

    pieces.append("</table></center>")
    return "".join(pieces)
 
# render in html a generic table with links
def render_table_generic_links(df,title, cols=None):
    """Render a DataFrame as an HTML table, optionally with a link column.

    Args:
        df: DataFrame to render. It is no longer modified in place.
        title: Caption shown above the table.
        cols: Optional pair ``[text_column, value_column]``. When given, a
            column named ``"<text_column> links"`` is appended in which each
            cell shows the ``text_column`` value as a link to
            ``url_for('query', column=text_column, value=int(value_column))``;
            every column listed in ``cols`` is then dropped from the output.
            ``None`` or an empty sequence (the default) renders ``df`` as is.

    Returns:
        str: HTML fragment (caption plus ``<center><table>``); values are
        inserted without HTML escaping.
    """
    if cols:
        cquery = cols[0]
        cvalue = cols[1]
        # Built from a plain list so an empty frame is handled as well; the
        # anchor previously carried one quote too many (href= "URL"").
        links = [
            "<a href=\"" + url_for('query', column=cquery, value=int(x[cvalue])) + "\">"
            + str(x[cquery]) + "</a>"
            for _, x in df.iterrows()
        ]
        # assign/drop return new frames, so the caller's DataFrame is untouched.
        df = df.assign(**{str(cquery) + " links": links}).drop(columns=list(cols))

    columns = df.columns
    pieces = [
        "<center><h3>" + str(title) + "</h3></center>",
        "<center><table class= \"default\"><tr>",
    ]
    for c in columns:
        pieces.append("<th><center>" + str(c) + "</center></th>")
    pieces.append("</tr>")  # the header row used to be left open

    for _, row in df.iterrows():
        pieces.append("<tr>")
        for c in columns:
            pieces.append("<td>" + str(row[c]) + "</td>")
        pieces.append("</tr>")  # was "<tr>"

    pieces.append("</table></center>")
    return "".join(pieces)


#####################################################################################################################################
# TRIKAI-CLUSTERING

# returns the cluster of a set of keywords
def cluster_keywords (keyword):
    """Predict the cluster number of a piece of text.

    Loads the pickled vectorizer (``tfidf.pickle``) and cluster model
    (``cluster.pickle``) from ``path_models`` on every call, so it always
    uses the files currently on disk.

    Args:
        keyword: Text to classify (a query or a whole transcript).

    Returns:
        The first element of ``model.predict(...)`` for the vectorized text.

    Note:
        ``pickle.load`` executes code embedded in the file; the model files
        must come from a trusted source.
    """
    # Context managers close the files deterministically (they were leaked before).
    with open(path_models+"tfidf.pickle", 'rb') as handle:
        vectorizer = pickle.load(handle)
    with open(path_models +"cluster.pickle", 'rb') as handle:
        model = pickle.load(handle)

    features = vectorizer.transform([keyword])
    prediction = model.predict(features)
    return prediction[0]


# returns the centroids of a cluster given the model and the vectorizer
def cluster_centroids (model,vectorizer):
    """Describe each cluster by its highest-weighted terms.

    Args:
        model: Fitted clustering model exposing ``cluster_centers_`` and
            ``n_clusters``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or the legacy ``get_feature_names()``.

    Returns:
        dict: ``{cluster_number: "term1 term2 ..."}`` holding up to ten terms
        per cluster, ordered from the largest centroid weight downwards.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for old installations.
    feature_names = getattr(vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    terms = feature_names()

    # Column indices of every centroid sorted by descending weight.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]

    return {
        cent: " ".join(str(terms[ind]) for ind in order_centroids[cent, :10])
        for cent in range(model.n_clusters)
    }


#####################################################################################################################################
# TRIKAI-TOPIC EXTRACTION

# load topic model
def load_topic_model(fname):
    """Rebuild a ktrain topic model from the three files sharing ``fname``.

    Reads ``<fname>.tm_vect``, ``<fname>.tm_model`` and ``<fname>.tm_params``
    (all pickles), creates a topic model from the stored parameters and
    attaches the stored model and vectorizer to it.

    Args:
        fname: Path prefix of the saved model, without extension.

    Returns:
        The topic model object returned by ``ktrain.text.get_topic_model``.
    """
    stored = {}
    for suffix in ('.tm_vect', '.tm_model', '.tm_params'):
        with open(fname+suffix, 'rb') as handle:
            stored[suffix] = pickle.load(handle)

    settings = stored['.tm_params']
    topic_model = ktrain.text.get_topic_model(
        n_topics=settings['n_topics'],
        n_features=settings['n_features'],
        verbose=settings['verbose'],
    )
    topic_model.model = stored['.tm_model']
    topic_model.vectorizer = stored['.tm_vect']
    return topic_model


# topic identification from pretrained model
def topic_identification(text):
    """Return the topic number the saved topic model assigns to ``text``.

    The model stored under ``<path_models>topic`` is loaded through
    ``load_topic_model`` (this function used to repeat that loader line for
    line) and the position of the highest score in ``tm.predict([text])`` is
    returned.

    Args:
        text: Document or query to classify.

    Returns:
        Result of ``argmax()`` over the prediction for ``text``.
    """
    tm = load_topic_model(path_models +"topic")
    return tm.predict([text]).argmax()


#####################################################################################################################################
# TRIKAI-LEVEL DETECTION AND DOC_TO_VECT

# load en_core_web_lg nlp model 
# Executed once when this cell runs; doc_to_vect() reads this module-level
# `nlp` object on every call.
nlp =spacy.load("en_core_web_lg")


# returns the vector of a transcript based on spacy model en_core_web_lg
def doc_to_vect(transcript):
    """Run ``transcript`` through the module-level ``nlp`` pipeline and
    return the ``vector`` attribute of the resulting document."""
    return nlp(transcript).vector

# returns the level of difficulty (basic/advanced) of a AI transcript
def predict_level(nlp, texts):
    """Classify transcripts as ``"basic"`` or ``"advanced"``.

    Args:
        nlp: Pipeline object exposing ``tokenizer`` and a ``'textcat'`` pipe
            (obtained through ``get_pipe``).
        texts: List of texts to classify.

    Returns:
        str | list[str]: For a single text, its label as a string (unchanged
        behaviour). For any other number of texts, a list with one label per
        text; previously more than one text raised ``ValueError`` because an
        array was used as an ``if`` condition.
    """
    # Use the model's tokenizer to tokenize each input text
    docs = [nlp.tokenizer(text) for text in texts]

    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores = textcat.predict(docs)

    # From the scores, find the class with the highest score/probability;
    # class 0 is reported as "basic", anything else as "advanced".
    predicted_classes = scores.argmax(axis=1)
    labels = ["basic" if int(cls) == 0 else "advanced" for cls in predicted_classes]

    if len(labels) == 1:
        return labels[0]
    return labels


#####################################################################################################################################
# TRIKAI-CORE-SEARCHER

# search function, provides a video_id
def search(consult_text,level):
    """Pick the video that best matches a free-text query.

    Candidates are gathered from three sources, all restricted to the
    requested difficulty levels:

    * title match - only for queries of up to three words: every word must
      occur in the lower-cased title/author/type/playlist name;
    * the cluster predicted for the query by ``cluster_keywords``;
    * the topic predicted for the query by ``topic_identification``.

    The narrowest non-empty intersection wins (title matches take priority),
    and the best-rated video of that set is returned.

    Args:
        consult_text: Query typed by the user.
        level: ``"B"`` for basic, ``"A"`` for intermediate/advanced; any
            other value accepts every level.

    Returns:
        int: Index label of the selected row of youtube_transcrip_vector.csv.

    Raises:
        LookupError: If the dataset contains no rows at all.
    """
    df = pd.read_csv("youtube_transcrip_vector.csv")
    # (A former ``df.astype({...})`` call was dropped: its result was discarded.)

    # fillna/astype keep a missing field from breaking ' '.join.
    text_columns = ['title', 'author', 'type', 'playlist_name']
    df["title_agg"] = df[text_columns].fillna("").astype(str).agg(' '.join, axis=1).str.lower()

    if level =="B":
        levels = ["basic"]
    elif level =="A":
        levels = ["intermediate","advanced"]
    else:
        levels = ["basic","intermediate","advanced"]
    in_level = df["level"].isin(levels)

    # title/author/type/playlist match, only for short queries
    words = consult_text.lower().split()
    video_idtit = set()
    if len(words) <= 3:
        haystack = df.loc[in_level, "title_agg"]
        hits = haystack.apply(lambda x: all(word in x for word in words)).astype(bool)
        video_idtit = set(haystack[hits].index.tolist())

    cluster = cluster_keywords(consult_text)
    video_idc = set(df[(df["Cluster"]==cluster) & in_level].index.tolist())

    topic = topic_identification(consult_text)
    video_idt = set(df[(df["topic"]==topic) & in_level].index.tolist())

    # Narrowest non-empty combination first (an empty set is falsy).
    if video_idtit:
        candidates = (video_idc & video_idt & video_idtit) or (video_idt & video_idtit) or video_idtit
    else:
        candidates = (video_idc & video_idt) or video_idt or video_idc

    # Ratings may be stored as text (including the literal "None"); compare
    # them numerically and ignore the ones that are not numbers.
    ratings = pd.to_numeric(df["rating"], errors="coerce")
    rated = ratings.loc[sorted(candidates)].dropna()
    if not rated.empty:
        return int(rated.idxmax())

    # No rated candidate: the old code indexed an empty frame here and
    # raised IndexError. Fall back progressively instead.
    if candidates:
        return int(min(candidates))
    fallback = df.index[in_level]
    if len(fallback) == 0:
        fallback = df.index
    if len(fallback) == 0:
        raise LookupError("youtube_transcrip_vector.csv contains no videos")
    return int(fallback[0])




In [None]:
#####################################################################################################################################
# FLASK WEBAPP


# Flask application object; every @app.route handler below registers on it.
app = Flask(__name__ )

# Secret key of the application, taken from the SECRET_KEY environment variable.
# NOTE(review): when the variable is unset this falls back to the literal
# 'secret string', which is a publicly known value - set SECRET_KEY in any
# real deployment.
app.secret_key = os.getenv('SECRET_KEY', 'secret string')


@app.route('/',methods=['GET', 'POST'])

# Index page : shows a form and links inside the webapp
def index():
    """Landing page.

    GET renders ``index.html`` with the ``Consult`` form. POST reads the
    query text and level from the form, runs ``search`` and redirects to the
    params page of the selected video.
    """
    form = Consult()

    if request.method != "POST":
        return render_template('index.html', form=form)

    selected = search(form.consult.data, form.level.data)
    return redirect(url_for('params', video_id= selected))

 
@app.route('/params')

# params page : shows the video selected, wordcloud and related videos
def params():
    """Detail page of one video.

    Reads ``video_id`` (row position in youtube_transcrip_vector.csv) from
    the query string and shows the video metadata, the embedded player with
    previous/next arrows, the wordcloud image (linking to the transcript)
    and three tables of related videos: same topic, same cluster and most
    similar content.

    Returns:
        str: The HTML page, or ``(html, 404)`` when ``video_id`` is missing,
        not an integer or outside the dataset (these cases used to raise, and
        a negative id silently wrapped around to the end of the table).
    """
    dfy = pd.read_csv(path+"youtube_transcrip_vector.csv")
    total_videos = dfy.shape[0]

    try:
        i = int(request.args.get("video_id", "None"))
    except ValueError:
        i = -1
    if not 0 <= i < total_videos:
        return head("TrikAI") + "<center><h3>Video not found</h3></center>", 404

    row = dfy.iloc[i]
    file = 'Wordcloud/'+str(i)+'.png'

    # Get table of topics and centroids of the clusters
    with open(path_models + "topics.pickle", 'rb') as handle:
        topics = pickle.load(handle)
    with open(path_models + "centroids.pickle", 'rb') as handle:
        centroids = pickle.load(handle)

    # Calculate related videos
    related_columns = ["title", "link", "author","playlist_name","rating","level"]

    def related(column):
        # Rated videos sharing this row's value in `column`, best rating
        # first, keeping one video per playlist.
        same = dfy[(dfy[column]==row[column])&(dfy["rating"]!="None")]
        return same.sort_values('rating',ascending=False)[related_columns].drop_duplicates(subset ="playlist_name")

    df_topic = related("topic")
    df_cluster = related("Cluster")

    with open(path_models + "doc_similars.pickle", 'rb') as handle:
        similars = pickle.load(handle)
    # Entries [-7:-1] of the row, reversed; the final entry is skipped
    # (presumably the video itself - verify against update()).
    df_similars = dfy.iloc[list(similars[i][-7:-1][::-1])]

    table1=render_table(df_topic.head(6),"Selection in the same TOPIC")
    table2=render_table(df_cluster.head(6),"Selection in the same CLUSTER")
    table3=render_table(df_similars, "Selection similar CONTENT")

    # The link used to be read positionally (df[2]); use the column name.
    youtube_id = str(row["link"]).split("watch?v=")[1]
    # Keep the arrows inside the dataset.
    previous_url = url_for('params', video_id = max(i-1, 0))
    next_url = url_for('params', video_id = min(i+1, total_videos-1))

    # Markup fixes compared with the old template: no doubled quote after
    # href values, <h3>/<h4> are closed, img src values are quoted.
    page = [
        head("TrikAI recommends you this special content..."),
        '\n    <br>\n',
        '    <h2>Title: ' + str(row["title"]) + '</h2>\n',
        '    <center><h3><b>Playlist:</b><a href="' + url_for('playlist', playlist_name= row["playlist_name"]) + '">'
        + str(row["playlist_name"]) + '</a></h3> </center>\n',
        '    <table class="default">\n',
        '    <tr>\n',
        '    <th><center><h3><b>Author</b>:<a href="' + url_for('query',column='author',value =row["author"]) + '">'
        + str(row["author"]) + '</a></h3></center></th>\n',
        '    <th><center><h3><b>Views</b>: ' + str(row["views"]) + '</h3></center></th>\n',
        '    <th><center><h3><b>Rating:</b> ' + str(row["rating"]) + '</h3></center></th>\n',
        '    </tr>\n',
        '    <tr>\n',
        '    <td><center><h3>Level: ' + str(row["level"]) + '</h3></center></td>\n',
        '    <td><center><h3>Length: ' + str(row["len"]) + '</h3></center></td>\n',
        '    <td><center><h3>Type: ' + str(row["type"]) + '</h3></center></td>\n',
        '    </tr>\n',
        '    <tr>\n',
        '    <td colspan="3"><center><h4>Topics: ' + str(topics[int(row["topic"])]) + '</h4></center></td>\n',
        '    </tr>\n',
        '    <tr>\n',
        '    <td colspan="3"><center><h4>Cluster: ' + str(centroids[int(row["Cluster"])]) + '</h4></center></td>\n',
        '    </tr>\n',
        '    </table>\n',
        '    <br>\n',
        '    <center><table class="default"><td>\n',
        '    <a href="' + previous_url + '"><img src="' + url_for('static', filename="Arts/arrow_decrease.png") + '"></a>\n',
        '    <iframe src="https://www.youtube.com/embed/' + youtube_id
        + '" width="853" height="480" frameborder="0" allowfullscreen></iframe>\n',
        '    <a href="' + next_url + '"><img src="' + url_for('static', filename="Arts/arrow_increase.png") + '"></a></td>\n',
        '    </table></center>\n',
        '    <br>\n',
        '    <a href="' + url_for('show', video_id = i) + '"><img src="' + url_for('static', filename=file) + '"></a>\n',
        '    <br>\n',
        '    <br>\n',
        '    <br>\n    ',
        table1, '<br><br>', table2, '<br><br>', table3,
    ]
    return "".join(page)


@app.route('/load_url')

# load url page : get the information of a YouTube video list
def load_url():
    """Digest a YouTube playlist into the staging dataset.

    Query-string arguments: ``link`` (playlist URL) and ``dtype`` (stored in
    the ``type`` column). For every video whose transcript can be fetched
    the level is predicted and the page metadata scraped; length, cluster
    and topic are then computed and the rows are appended to
    ``<path_temp>daiogenes.csv`` (created when missing). A temporary
    wordcloud is generated per video for the summary page.

    Returns:
        str: HTML summary with one entry per digested video.

    Note:
        Deliberately slow: random pauses are inserted between requests.
    """
    p = request.args.get("link","None")
    dtype = str(request.args.get("dtype","None"))
    p_name, p_list = get_playlist(p)

    # model for the category
    nlp_cat = spacy.load(path_models+"nlp")

    # Rows are collected in plain lists and turned into DataFrames once:
    # DataFrame.append was removed in pandas 2.0.
    video_columns = ["link", "transcript", "playlist_name", "playlist_link", "level", "type"]
    video_records = []

    # all the urls of the playlist
    for url in p_list:
        try:
            transcripcion = get_transcript(url)
            level = predict_level(nlp_cat, [transcripcion,])
            video_records.append({"link": url,"transcript": transcripcion, "playlist_name":p_name, "playlist_link":p, "level":level, "type":dtype})
        except Exception as error:
            # Best effort: a video that cannot be processed is skipped, but
            # the reason is now reported.
            print ("Problem!!!", url, error)

        sleep(randint(0,5))

    videos = pd.DataFrame(video_records, columns=video_columns)

    # manage the metadata: scraping all the videos
    metadata_columns = ["link", "title", "rating", "views", "author"]
    metadata_records = []
    for i in range(0,videos.shape[0]):
        meta = import_video_data(videos["link"].iloc[i])
        metadata_records.append({"link": meta[0],"title":meta[1], "rating":meta[2], "views":meta[3], "author":meta[4] })
        if i % 100 == 0 and i != 0:
            # longer pause every 100 videos
            sleep(randint(10,15))

    metadata = pd.DataFrame(metadata_records, columns=metadata_columns)

    # merge videos and metadata dataframes
    total = pd.merge(videos,metadata,left_on='link', right_on='link', how ='left')

    # Len column
    total["len"]= total["transcript"].apply(document_size)

    # Cluster column
    total["Cluster"]= total["transcript"].apply(cluster_keywords)
    with open(path_models + "centroids.pickle", 'rb') as handle:
        centroids = pickle.load(handle)

    # Topic column
    with open(path_models + "topics.pickle", 'rb') as handle:
        topics = pickle.load(handle)
    total["topic"]= total["transcript"].apply(topic_identification)

    # reindex dataframe ('vector' does not exist yet, so it is created empty)
    total = total.reindex(columns=['title','author','link','len','transcript','playlist_name','playlist_link','rating','views', 'type','level','vector','Cluster', 'topic'])

    # stores in temp directory
    if os.path.exists(path_temp+"daiogenes.csv"):
        daiogenes = pd.read_csv(path_temp+ "daiogenes.csv")
        pd.concat([daiogenes, total], axis=0,ignore_index=True).to_csv(path_temp+"daiogenes.csv",index=False)
    else:
        total.to_csv(path_temp+"daiogenes.csv", index=False)

    # render the output
    entries = []
    for i in range(0, total.shape[0]):
        video = total.iloc[i]
        file = gen_wordcloud_temp(video['transcript'],i)
        entries.append(
            "<br><center>"+"<b>Title</b>:"+str(video['title'])+" |-----| "+"<b>Level</b>:"+str(video['level'])
            +"</center><br><center>"+"<b>Topic</b>:"+str(topics[video['topic']])+" <br>"
            +"<b>Cluster</b>:"+str(centroids[video['Cluster']])
            +"</center><br><center><iframe src=\"https://www.youtube.com/embed/"+video["link"].split("watch?v=")[1]
            # a space was missing between the src and width attributes
            +"\" width=\"560\" height=\"315\" frameborder=\"0\" allowfullscreen></iframe></center><br>"
            +"<img src=\""+url_for('static', filename=file)+"\"><br><br>"
        )
    code = "".join(entries)

    # The closing </center> used to sit in a separate string literal after
    # the return statement, so it was never sent.
    return (
        head("Playlist Digested !")
        + "\n    <br>\n    <br><h2>" + p_name + "</h2>" + code
        + "\n    </center>\n    "
    )

    
# draws one histogram and stores it in the figures folder
def _save_histogram(series, filename):
    """Save a histogram of ``series`` as ``<path_figures><filename>`` and
    return the name to pass to ``url_for('static', filename=...)``."""
    fig, ax = plt.subplots(figsize=(5, 5), dpi=70)
    try:
        # Draw on this figure's own axes rather than on pyplot's current axes.
        sns.histplot(data=series, ax=ax)
        fig.savefig(path_figures + filename, dpi =70)
    finally:
        # Figures stay registered in pyplot until closed; without this every
        # request to the page leaked three of them.
        plt.close(fig)
    return 'Figures/' + filename


@app.route('/library')

# library page : shows relevant information of the dataframe
def library():
    """Overview page of the dataset.

    Shows the number of videos, the sum of the ``len`` column, histograms of
    ``len``, ``level`` and ``type`` (regenerated on every request) and two
    tables with the number of videos per cluster and per topic.

    Returns:
        str: The HTML page.
    """
    df = pd.read_csv("youtube_transcrip_vector.csv")

    # total num of videos and total words
    num_videos = df.shape[0]
    total_length = df['len'].sum()

    # histograms of lengths, levels and types
    file_len = _save_histogram(df['len'], 'len_fig.png')
    file_level = _save_histogram(df['level'], 'level_fig.png')
    file_type = _save_histogram(df['type'], 'type_fig.png')

    # topic table calculation
    counts =df["topic"].value_counts()
    data = pd.DataFrame({"index":counts.index, "counts":counts.values})
    with open(path_models + "topics.pickle", 'rb') as handle:
        topics = pickle.load(handle)
    data["topic"]=data["index"].astype(int).apply(lambda x: topics[x])

    # cluster table calculations
    cluster_counts =df["Cluster"].value_counts()
    data_c = pd.DataFrame({"index":cluster_counts.index, "counts":cluster_counts.values})
    with open(path_models + "centroids.pickle", 'rb') as handle:
        centroids = pickle.load(handle)
    data_c["Cluster"]=data_c["index"].astype(int).apply(lambda x: centroids[x])


    return head("TrikAI library")+"""    
    <center><h2>Total Videos: """+str(num_videos)+"""</h2></center>
    <center><h2>Total Length: """+str(int(total_length))+"""</h2></center>
    <br>
    <br>
    <br>
    <table class="default">
    <tr>
    <th><center><h2>Length</h2></center></th>
    <th><center><h2>Level</h2></center></th>
    <th><center><h2>Type</h2></center></th>
    </tr>
    <tr>
    <td><center><img src="""+url_for('static', filename=file_len)+"""></center></td> 
    <td><center><img src="""+url_for('static', filename=file_level)+"""></center></td>      
    <td><center><img src="""+url_for('static', filename=file_type)+"""></center></td>  
    </table>  
    <br>
    """ + render_table_generic_links(data_c,"Clusters and number of videos", cols=['Cluster','index'])+"""<br><br>"""+ render_table_generic_links(data,"Topics and number of videos", cols=['topic','index'])


@app.route('/dAIogenes', methods=['GET', 'POST'])

# dAIogenes page : form to get valid url playlist
def dAIogenes():
    """Playlist submission page.

    GET renders ``url.html`` with the ``Video_playlistForm``. POST redirects
    to ``load_url`` passing the submitted URL as ``link`` and the playlist
    type as ``dtype``.
    """
    form = Video_playlistForm()

    if request.method != "POST":
        return render_template('url.html', form=form)

    target = url_for('load_url', link = form.url.data, dtype = form.playlist_type.data)
    return redirect(target)


@app.route('/cluster')

# cluster page : to calculate the clusters from the transcripts
def cluster():
    """Recompute the clusters of every transcript in the dataset.

    Fits a TF-IDF vectorizer and a 30-cluster KMeans model on the
    ``transcript`` column, writes the resulting ``Cluster`` column back to
    youtube_transcrip_vector.csv and saves ``centroids.pickle``,
    ``cluster.pickle`` and ``tfidf.pickle`` in ``path_models``.

    Returns:
        str: HTML page with the number of videos per cluster.
    """
    df = pd.read_csv("youtube_transcrip_vector.csv")
    documents = df['transcript'].tolist()

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Manual number of clusters !!!!
    # --------------------------------------------
    true_k = 30
    # --------------------------------------------

    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=1)
    model.fit(X)

    # One batched prediction on the matrix that is already built, instead of
    # re-vectorizing and predicting every transcript separately.
    df['Cluster'] = model.predict(X)

    df.to_csv("youtube_transcrip_vector.csv",index=False)

    centroids = cluster_centroids(model, vectorizer)

    # save the model (files are closed deterministically now)
    with open(path_models+"centroids.pickle", 'wb') as handle:
        pickle.dump(centroids, handle)
    with open(path_models+"cluster.pickle", 'wb') as handle:
        pickle.dump(model, handle)
    with open(path_models+"tfidf.pickle", "wb") as handle:
        pickle.dump(vectorizer, handle)

    # cluster summary; the in-memory centroids are used directly rather than
    # re-read from the file written a moment ago
    cluster_counts =df["Cluster"].value_counts()
    data_c = pd.DataFrame({"index":cluster_counts.index, "counts":cluster_counts.values})
    data_c["Cluster"]=data_c["index"].apply(lambda x: centroids[x])

    return head("Cluster Generation done! ") + render_table_generic_links(data_c,"Clusters", cols=['Cluster','index'])


@app.route('/playlist')

# playlist page: to show a playlist given as argument
def playlist():
    """List the videos whose ``playlist_name`` equals the ``playlist_name``
    query-string argument (``"None"`` when absent), in index order."""
    wanted_columns = ["title", "link", "author","rating","level"]

    playlist_name = request.args.get("playlist_name", "None")
    df = pd.read_csv(path+"youtube_transcrip_vector.csv")

    matching = df[df["playlist_name"]==playlist_name]
    df_playlist = matching.sort_index(ascending=True)[wanted_columns]

    table = render_table(df_playlist)
    return head("Playlist details:") + table


@app.route('/query')

# query page: to show a table for the query given as argument
def query():
    """List the videos whose ``column`` equals ``value``.

    Both come from the query string. A value made only of digits is compared
    as an integer, anything else as text.

    Returns:
        str: HTML page with the matching videos, or ``(html, 400)`` when an
        argument is missing or ``column`` is not a column of the dataset
        (these cases used to raise ``KeyError``).
    """
    # Named `args` because a local called `query` shadowed this view function.
    args = request.args.to_dict()
    column = args.get("column")
    value = args.get("value")

    df = pd.read_csv(path+"youtube_transcrip_vector.csv")

    if column is None or value is None or column not in df.columns:
        return head("TrikAI query") + "<center><h3>Unknown query</h3></center>", 400

    if value.isnumeric():
        value = int(value)

    df_query = df[df[column]==value].sort_index(ascending=True)[["title", "link", "author","rating","level"]]

    table=render_table(df_query, str(column)+" with value: "+str(value))

    return head("TrikAI query") + table


@app.route('/topic')

# topic page: to calculate the topics from the transcripts
def topic():
    """Rebuild the topic model from every transcript in the dataset.

    Trains a ktrain topic model on the ``transcript`` column, saves it under
    ``<path_models>topic``, writes the ``topic`` column back to
    youtube_transcrip_vector.csv and stores the topic list in
    ``topics.pickle``.

    Returns:
        str: HTML page with the number of videos per topic.
    """
    df = pd.read_csv("youtube_transcrip_vector.csv")

    text = df["transcript"].to_list()

    tm = ktrain.text.get_topic_model(text, n_features =1000)

    tm.build (text, threshold = 0.25)

    tm.print_topics(show_counts = True)

    tm.save(path_models +"topic")

    # One prediction per transcript; argmax() gives the highest-scoring topic.
    df['topic']=df['transcript'].apply(lambda doc: tm.predict([doc]).argmax())

    df.to_csv("youtube_transcrip_vector.csv",index=False)

    # Save the topic list (file closed deterministically) and keep using the
    # in-memory copy instead of re-reading the file just written.
    topics = tm.topics
    with open(path_models + "topics.pickle",'wb') as handle:
        pickle.dump(topics, handle)

    counts =df["topic"].value_counts()
    data = pd.DataFrame({"index":counts.index, "counts":counts.values})
    data["topic"]=data["index"].apply(lambda x: topics[x])

    return head("Topic Generation done!") + render_table_generic_links(data,"Topics", cols=['topic','index'])


@app.route('/update')

# update page: to include the temporal dataset (daiogenes.csv) in the general dataset (youtube_transcrip_vector.csv), including cluster, topic and vector calculations
def update():
    """Merge the staging dataset into the main dataset.

    When ``<path_temp>daiogenes.csv`` exists, its rows are appended to
    youtube_transcrip_vector.csv (rows with a repeated ``link`` are dropped,
    keeping the first), a wordcloud is generated for every added row, the
    document vectors in ``docvectors.pickle`` are extended, the similarity
    table ``doc_similars.pickle`` is recomputed and the staging file is
    deleted.

    Returns:
        A redirect to the library page, or an error fragment when there is
        no staging file.
    """
    if not os.path.exists(path_temp+"daiogenes.csv"):
        return """
        <center><h1>ERROR!</h1> """

    df = pd.read_csv(path_temp+"daiogenes.csv")
    df['vector'] = df['transcript'].apply(doc_to_vect)
    df1 = pd.read_csv(path+"youtube_transcrip_vector.csv")
    last = df1.shape[0]
    df3 = pd.concat([df1, df], axis=0,ignore_index=True)
    df3 = df3.drop_duplicates(subset=['link'], keep='first', ignore_index=True)
    # NOTE(review): the "backup" is written with the merged data, so it is
    # identical to the main file - confirm whether the previous state (df1)
    # was meant to be preserved here.
    df3.to_csv(path+"youtube_transcrip_vector_backup.csv", index=False)
    df3.to_csv(path+"youtube_transcrip_vector.csv", index=False)

    # Rows from position `last` onwards are the newly added ones.
    tempdocvectors = []
    for i in range(last, df3.shape[0]):
        new_row = df3.iloc[i]
        gen_wordcloud(new_row['transcript'], i)
        # Reuse the vector computed above rather than running the spaCy
        # pipeline a second time on the same transcript.
        tempdocvectors.append(new_row['vector'])

    with open(path_models + "docvectors.pickle", 'rb') as handle:
        docvectors = pickle.load(handle)
    docvectors = docvectors+tempdocvectors
    with open(path_models + "docvectors.pickle",'wb') as handle:
        pickle.dump(docvectors, handle)

    # For every document keep the positions of the 10 highest similarities.
    similars = cosine_similarity(docvectors,docvectors).argsort()[:,-10:]
    with open(path_models + "doc_similars.pickle",'wb') as handle:
        pickle.dump(similars, handle)

    os.remove(path_temp+"daiogenes.csv")

    return redirect(url_for('library'))


@app.route('/show')

# show page: to show a transcription
def show():
    """Show the transcript of one video.

    Reads ``video_id`` (row position in youtube_transcrip_vector.csv) from
    the query string.

    Returns:
        str: HTML page with the transcript, line breaks doubled as ``<br>``,
        or ``(html, 404)`` when ``video_id`` is missing, not an integer or
        outside the dataset (these cases used to raise).
    """
    dfs= pd.read_csv(path+"youtube_transcrip_vector.csv")

    try:
        i = int(request.args.get("video_id", "None"))
    except ValueError:
        i = -1
    if not 0 <= i < dfs.shape[0]:
        return head("Transcription") + "<center><h3>Video not found</h3></center>", 404

    row = dfs.iloc[i]
    # str() guards against a missing (NaN) transcript.
    text = str(row['transcript']).replace("\n","<br><br>")

    # The text is now wrapped in a <td>; it used to sit directly inside <tr>.
    return head("Transcription of: "+str(row['title'])) +"<br><br><br><table Width=\"50%\"><tr><td>"+ text+"</td></tr></table>"



@app.route('/admin')

# admin page: to access admin menu (Cluster generation, topic generation and Update Database)
def admin():
    """Administration menu with links to the three maintenance pages:
    cluster generation, topic generation and database update.

    Returns:
        str: The HTML page.
    """
    def menu_row(number, endpoint, label):
        # One table row per action. The anchors used to carry a doubled
        # quote after the href value and were never closed.
        return (
            '    <tr>\n'
            '    <th><center><h3><b>' + number + '.-</b><a href="' + url_for(endpoint) + '">'
            + label + '</a></h3></center></th>\n'
            '    </tr>\n'
        )

    # The banner comes from head() rather than a pasted copy of its markup.
    return (
        head("TrikAI Administration Panel")
        + '\n    <br>\n\n    <table class="default">\n'
        + menu_row("1", 'cluster', "Cluster Generation")
        + menu_row("2", 'topic', "Topic Generation")
        + menu_row("3", 'update', "Update Database")
        + '    </table>'
    )
   


# Entry point: start the web app only when this file is run as a script.
if __name__=="__main__":
#debug = True
    # The app is served with waitress, imported here so it is only needed
    # when the server is actually started.
    from waitress import serve
    # host "0.0.0.0" listens on every network interface, port 8081.
    serve(app, host="0.0.0.0", port=8081)
  
   

In [5]:
%who

Consult	 Flask	 KMeans	 Playlist	 STOPWORDS	 TfidfVectorizer	 Video_playlistForm	 WordCloud	 YouTubeTranscriptApi	 
adjusted_rand_score	 centroids	 cluster_centroids	 cluster_keywords	 code	 counter	 daiogenes	 doc_to_vect	 document_size	 
dtype	 file	 flash	 gen_wordcloud	 gen_wordcloud_temp	 get_playlist	 get_transcript	 head	 i	 
import_video_data	 ktrain	 level	 load_topic_model	 meta	 metadata	 nlp	 nlp_cat	 os	 
p	 p_list	 p_name	 path	 path_Wordcloud	 path_figures	 path_models	 path_temp	 pd	 
pickle	 plt	 predict_level	 randint	 redirect	 render_table	 render_table_generic	 render_table_generic_links	 render_template	 
request	 requests	 search	 sleep	 sns	 spacy	 topic_identification	 topics	 total	 
transcripcion	 url	 url_for	 videos	 visualize_documents	 
