### Imports

In [65]:
from utils_art import *

from sklearn.datasets import make_classification
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_regression

import pandas as pd
import numpy as np
from textblob import TextBlob
import tempfile
# np.random.seed(0)

from IPython.core.display import Image as image
from PIL import Image
from IPython.display import Image
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Default plot dimensions and autosize flag (not referenced by the functions visible in this file).
PLOT_WIDTH=1800
PLOT_HEIGHT=1000
PLOT_AUTOSIZE=True
# Candidate solver names; presumably meant for the svd_solver argument of dimReduction_PCA — TODO confirm.
PCA_TYPE_LIST = ["auto", "full", "arpack", "randomized"]
# Candidate renderer names (presumably plotly renderers); index 1 selects "browser".
PLOT_RENDER_LIST = ["png","browser","svg"]
PLOT_RENDER = PLOT_RENDER_LIST[1]
# Part-of-speech tags that decomposeTitle keeps when filtering TextBlob tags.
TAG_SELECTION_LIST = ["NNP","NN","JJ"]

### Data transformation 

In [66]:
def dfNormalize(df) :
    """Standardize ``df``: subtract its mean, then divide by its standard deviation.

    ``mean()`` and ``std()`` are called on ``df`` itself, so for a pandas
    DataFrame every column is centred and scaled independently.
    """
    centered = df - df.mean()
    spread = df.std()
    return centered / spread

def generateTestSet() :
    """Generate a synthetic classification dataset with ``make_classification``.

    Returns the ``(X, y)`` pair for 1500 samples with 24 features (3 of them
    informative), 5 classes, one cluster per class and ``random_state=5``.
    """
    settings = dict(
        n_features=24,
        n_classes=5,
        n_samples=1500,
        n_informative=3,
        random_state=5,
        n_clusters_per_class=1,
    )
    features, labels = make_classification(**settings)
    return features, labels

## Dimension reduction

def dimReduction_TSNE(df, n_components=2, perplexity=30, early_exaggeration =4.0,learning_rate=1000,n_iter=1000,verbose=0,random_state=0,norm_output=False):
    """Embed ``df`` with t-SNE and return the embedding as a DataFrame.

    All keyword arguments except ``norm_output`` are forwarded unchanged to
    ``TSNE``. When ``norm_output`` is truthy the embedding is passed through
    ``dfNormalize`` before being returned.
    """
    model = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        early_exaggeration=early_exaggeration,
        learning_rate=learning_rate,
        n_iter=n_iter,
        verbose=verbose,
        random_state=random_state,
    )
    embedded = pd.DataFrame(model.fit_transform(df))
    return dfNormalize(embedded) if norm_output else embedded

def dimReduction_PCA(df, n_components=2, svd_solver="auto",tol=0.0,whiten=False,random_state=0,norm_output=False):
    """Project ``df`` onto its first ``n_components`` principal components.

    Parameters
    ----------
    df : array-like or DataFrame accepted by ``PCA.fit_transform``.
    n_components, svd_solver, tol, whiten, random_state :
        Forwarded to ``sklearn.decomposition.PCA``. ``whiten`` used to be
        accepted but silently dropped; it is now honoured.
    norm_output : bool
        When truthy, the projected frame is standardized with ``dfNormalize``.

    Returns
    -------
    pandas.DataFrame holding the projected coordinates.
    """
    pca = PCA(n_components=n_components,svd_solver=svd_solver,tol=tol,whiten=whiten,random_state=random_state)
    out_df = pd.DataFrame(pca.fit_transform(df))
    if norm_output :
        out_df = dfNormalize(out_df)
    return out_df

def dimReduction_IPCA(df, n_components=2,whiten=False, batch_size=100,norm_output=False):
    """Project ``df`` with incremental PCA and return the result as a DataFrame.

    Parameters
    ----------
    df : array-like or DataFrame accepted by ``IncrementalPCA.fit_transform``.
    n_components, whiten, batch_size :
        Forwarded to ``sklearn.decomposition.IncrementalPCA``. ``whiten`` used
        to be accepted but silently dropped; it is now honoured.
    norm_output : bool
        When truthy, the projected frame is standardized with ``dfNormalize``.

    Returns
    -------
    pandas.DataFrame holding the projected coordinates.
    """
    ipca = IncrementalPCA(n_components=n_components,whiten=whiten,batch_size=batch_size)
    out_df = pd.DataFrame(ipca.fit_transform(df))
    if norm_output :
        out_df = dfNormalize(out_df)
    return out_df

def dimReduction_NNT(df,n_components=2,n_neighbors=5,mode='distance',algorithm='auto',leaf_size=30,p=2,eigen_solver='auto',tol=0.0,metric='minkowski',n_jobs=None,norm_output=False):
    """Embed ``df`` with a KNeighborsTransformer -> Isomap pipeline.

    The transformer builds the neighbour graph using ``metric``/``p``; Isomap
    then embeds that graph into ``n_components`` dimensions. The fitted
    transformer step is cached in the system temporary directory through the
    pipeline's ``memory`` argument.

    Parameters
    ----------
    df : array-like or DataFrame of input features.
    n_components : number of output dimensions (Isomap).
    n_neighbors : neighbour count used by both pipeline steps.
    mode, algorithm, leaf_size : forwarded to ``KNeighborsTransformer``.
    p, metric, n_jobs : forwarded to ``KNeighborsTransformer`` (``p`` and
        ``n_jobs`` also to Isomap).
    eigen_solver, tol : forwarded to ``Isomap``.
    norm_output : when truthy, standardize the embedding with ``dfNormalize``.

    Returns
    -------
    pandas.DataFrame holding the embedding.
    """
    #KTN mode : 'distance' 'connectivity'
    #KTN algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
    #BOTH n_jobs : None -1
    #KTN metric = ['minkowski',  'manhattan','cityblock','l1',  'euclidean','l2',  'cosine',  'haversine',  'nan_euclidean']
    #ISO eigen_solver : ['auto', 'arpack', 'dense']
    cache_path = tempfile.gettempdir()
    knt = KNeighborsTransformer(mode=mode,n_neighbors=n_neighbors,algorithm=algorithm,leaf_size=leaf_size,metric=metric,p=p,n_jobs=n_jobs)
    # Isomap receives the transformer's sparse neighbour graph, not raw features,
    # so it must be told the input is precomputed. Passing the raw metric here
    # made Isomap treat the (n_samples, n_samples) graph as a feature matrix.
    # NOTE(review): with mode='connectivity' the graph holds 0/1 flags rather
    # than distances — confirm that mode is ever intended for this pipeline.
    iso = Isomap(n_components=n_components,n_neighbors=n_neighbors,eigen_solver=eigen_solver,metric='precomputed',tol=tol,p=p,n_jobs=n_jobs)
    nnt = make_pipeline(knt,iso,memory=cache_path)
    out_df = pd.DataFrame(nnt.fit_transform(df))
    if norm_output :
        out_df = dfNormalize(out_df)
    return out_df

##
def testDimReduct(df,i) :
    """Run every dimension-reduction helper on ``df`` and save one scatter plot each.

    Each figure is written by ``plot2Dpandas`` under ``folder_path_graph`` with
    a method-specific prefix and ``i`` as the save counter. The NNT figure used
    to be saved with the "IPCA_" prefix, overwriting the IPCA figure; it now
    has its own "NNT_" prefix.
    """
    # df = dfNormalize(df)
    runs = (
        (dimReduction_TSNE, "TSNE (T-distributed Stochastic Neighbor Embedding)", "TSNE_"),
        (dimReduction_PCA, "PCA (Principal Component Analysis)", "PCA_"),
        (dimReduction_IPCA, "IPCA (Incremental Principal Component Analysis)", "IPCA_"),
        (dimReduction_NNT, "NNT (Nearest Neighbors Transformer)", "NNT_"),
    )
    for reducer, title, prefix in runs :
        reduced = reducer(df)
        plot2Dpandas(reduced,title=title,save=True,path=folder_path_graph+prefix,savecount=i)

### Visualization Functions

In [67]:
## Visualization using "plotly"

def renderAllOptions(df,BD=True,confList=[],label="") :
    """Call the 2D or 3D plot helper once per configuration in ``confList``.

    Each configuration is merged over the all-``None`` defaults from
    ``createdefconfdict``. ``BD`` truthy selects ``plot2D``, otherwise
    ``plot3D``. ``label`` is currently unused and the helpers' return values
    are discarded.
    """
    for overrides in confList :
        # if label != "" :
        #     overrides['title'] = overrides['title'] + "  -  " + label
        plotter = plot2D if BD else plot3D
        plotter(df, createdefconfdict() | overrides)
            


def plot2D(df,conf_dict={}) :
    """Return a plotly express scatter figure of ``df``.

    ``conf_dict`` must provide the keys "x", "y", "c" (colour) and "size";
    a missing key raises ``KeyError``.
    """
    return px.scatter(
        df,
        x=conf_dict["x"],
        y=conf_dict["y"],
        color=conf_dict["c"],
        size=conf_dict["size"],
    )
                        
def plot3D(df,conf_dict={}) :
    """Return a plotly express 3D scatter figure of ``df``.

    ``conf_dict`` must provide the keys "x", "y", "z", "c" (colour) and
    "size"; a missing key raises ``KeyError``. The figure used to be built and
    then dropped; it is now returned, matching ``plot2D``.
    """
    fig = px.scatter_3d(df, x=conf_dict["x"], y=conf_dict["y"], z=conf_dict["z"], color=conf_dict["c"],size=conf_dict["size"])
    return fig


# ["x","y","z","c","size","symbol","text","h_name","h_data","c_data","title","xtitle","ytitle"]
def createdefconfdict() :
    """Return a fresh plot-configuration dict with every known key set to ``None``."""
    option_names = (
        "x","y","z","c","size","symbol","h_name","h_data","c_data","text",
        "facet_r","facet_c","facet_cw","facet_rs","facet_cs",
        "title","xtitle","ytitle","browser",
        "animation_frame","animation_group","marginal_x","marginal_y",
        "trendline","log_x","log_y","log_z","render_mode","size_max","opacity",
    )
    return dict.fromkeys(option_names)

## Visualization using "pandas"

def plot2Dpandas(df,size=3,color="#a98d19",width=15,height=15,title="default",xl="x",yl="y",save=False,path="",savecount=0) :
    """Scatter-plot columns ``0`` and ``1`` of ``df`` with pandas' plotting API.

    Parameters
    ----------
    df : DataFrame with columns ``0`` and ``1``.
    size : marker size passed as ``s``.
    color : marker colour used when ``df`` has no "o_token_input" column.
    width, height : figure size passed as ``figsize``.
    title, xl, yl : plot title and axis labels.
    save : when truthy, write the figure to ``path + "fig_<savecount>.png"``.

    The colour used to be hard-coded to the "o_token_input" column, so the
    ``color`` argument was ignored and frames without that column failed.
    That column is still used when present; otherwise ``color`` applies.
    """
    point_color = "o_token_input" if "o_token_input" in df.columns else color
    res = df.plot.scatter(x = 0, y = 1, s=size, c=point_color,figsize=(width,height),title=title,xlabel=xl,ylabel=yl).get_figure()
    if save :
        res.savefig(path+"fig_"+str(savecount)+".png")
        
## Visualization using "matplotlib"

def plot2Dmatplotlib(df) :
    """Show a 10x10 matplotlib scatter of ``df[0]`` against ``df[1]``."""
    _, axes = plt.subplots(figsize=(10, 10))
    xs = df[0]
    ys = df[1]
    axes.scatter(xs, ys, s=0.5, c="#a98d19")#, c=df['color']
    plt.show()
   

### Variables

In [68]:
# folder_path="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/"
# # folder_path="C:/Users/User/OneDrive/Desktop/article/file_2/test_llm_output/run2/"
# folder_path_embd="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/embd/only_embd/"
# folder_path_keyword="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/keyword/"
# folder_path_llm="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/llm/"
# folder_path_graph="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/graphs/"
# folder_path_viz="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/viz/"

# filename="article_stats_embedding"
# filename_embd="embedding_matrix_1000"#"embedding_matrix_main"#"article_stats_embedding_main" #""
# filename_keyw="article_keyword_main"
# open_path_keyw="C:/Users/User/OneDrive/Desktop/article/file_2/join_1_df/"

# Input locations and base file names passed to openDFcsv below
# (absolute, machine-specific paths — adjust before running elsewhere).
folder_path_input_df="C:/Users/User/OneDrive/Desktop/article/files_3/3_1_join_main/arc/"
folder_path_input_embd="C:/Users/User/OneDrive/Desktop/article/files_3/2_1_embdedding_main/embd_raw/"
filename_input_df="keyword_with_nlp"
filename_input_embd="embedding_matrix_1000"

# Output folder; testDimReduct prefixes its saved figure paths with it.
folder_path_graph="C:/Users/User/OneDrive/Desktop/article/files_3/3_2_visu_main/"


In [69]:
# Load the article table and the embedding matrix, then show both.
# openDFcsv is not defined in this file (presumably provided by utils_art — TODO confirm).
df_main = openDFcsv(folder_path_input_df,filename_input_df)
mat_emb = openDFcsv(folder_path_input_embd,filename_input_embd)
display(df_main)
display(mat_emb)

Unnamed: 0.1,Unnamed: 0,hash_key,index,title_quer,title_par,published,year,year_month,source_url,url_list,...,tb.polaj,tb.pos,tb.neg,vs.pos,vs.neu,vs.neg,vs.comp,ts.neg,ts.pos,published_date_type
0,0,ff0c32ec62d68ed0fc1b783efd0c7d5839e48b88,0,Top enlisted Airman addresses key Cannon issue...,Top enlisted Airman addresses key Cannon issue...,1/26/2010,2010,2010-01,https://www.af.mil,['af'],...,0.500000,0.998927,0.001073,0.000,1.000,0.000,0.0000,0.153918,0.918740,2010-01-26
1,1,cec0db5c203d47d86e7ab53214cc4e00e275f7fe,1,'Top Chef' secrets revealed: From the judges' ...,'Top Chef' secrets revealed: From the judges' ...,1/12/2010,2010,2010-01,https://ew.com,['ew'],...,0.500000,0.996438,0.003562,0.141,0.859,0.000,0.4215,0.153775,0.903087,2010-01-12
2,2,ed439993352c671f6cfe0df72613ea80daa45909,2,Top of the Pops - The New Yorker,Top of the Pops,1/3/2010,2010,2010-01,https://www.newyorker.com,['newyorker'],...,0.500000,0.842701,0.157299,0.000,1.000,0.000,0.0000,0.128399,0.864715,2010-01-03
3,3,af722aad3f92dcb13b1cce4e42bd53df2f06838a,3,Why top predators matter: an in-depth look at ...,Why top predators matter: an in-depth look at ...,2/2/2010,2010,2010-02,https://news.mongabay.com,"['news', 'mongabay']",...,0.500000,0.062267,0.937733,0.115,0.885,0.000,0.0516,0.317805,0.859738,2010-02-02
4,4,ec4cbc1723258e2e53368c3956b6fa56a9f4350b,4,White Wizzard - Over the Top Review - Angry Me...,White Wizzard – Over the Top Review,2/16/2010,2010,2010-02,https://www.angrymetalguy.com,['angrymetalguy'],...,0.500000,0.863703,0.136297,0.151,0.677,0.172,-0.1027,0.224570,0.750368,2010-02-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51365,51365,af17b14fd7de3de91675213713954bc71f9bd740,51365,"Gagnidze ’20, Stickel ’19 Receive Top Student ...","Gagnidze ’20, Stickel ’19 Receive Top Student ...",5/13/2019,2019,2019-05,https://www.hamilton.edu,['hamilton'],...,0.500000,0.767071,0.232929,0.444,0.556,0.000,0.9001,0.075608,0.873769,2019-05-13
51366,51366,72376ddec98bbef66d9af591c7c2df8d15b12ea2,51366,"WVU Today | WVU robotics team on top again, wi...","WVU robotics team on top again, wins second NA...",6/7/2019,2019,2019-06,https://wvutoday.wvu.edu,"['wvutoday', 'wvu']",...,0.550000,0.246898,0.753102,0.179,0.821,0.000,0.5719,0.038150,0.838866,2019-06-07
51367,51367,0c2c54803070e32a1948b980d618f1859c736ad8,51367,The top 10 institutions for Earth and environm...,The top 10 institutions for Earth and environm...,6/20/2019,2019,2019-06,https://www.nature.com,['nature'],...,0.500000,0.675161,0.324839,0.000,1.000,0.000,0.0000,0.184901,0.883798,2019-06-20
51368,51368,ed95ca885145f52b6a3947fc49a222a3ce51817a,51368,Top Dealer in Han Gil Case Pleads Guilty - DEA,Top Dealer in Han Gil Case Pleads Guilty,6/20/2019,2019,2019-06,https://www.dea.gov,['dea'],...,0.266667,0.836229,0.163771,0.124,0.461,0.415,-0.7430,0.643589,0.712420,2019-06-20


Unnamed: 0,hash_key,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,ff0c32ec62d68ed0fc1b783efd0c7d5839e48b88,0.030684,0.034969,0.271537,0.110484,0.063989,0.143984,-0.012120,0.058677,0.022026,...,0.068486,0.166010,-0.164735,-0.105527,-0.267146,-0.211762,-0.147596,-0.078543,0.005476,0.240233
1,cec0db5c203d47d86e7ab53214cc4e00e275f7fe,0.010978,0.032475,-0.235852,-0.036911,0.266174,-0.069472,0.014558,0.217074,-0.081101,...,-0.035382,0.246189,0.061246,-0.135326,0.126368,-0.053235,-0.056077,-0.107503,-0.022504,-0.045741
2,ed439993352c671f6cfe0df72613ea80daa45909,-0.065070,0.042591,-0.382171,0.132250,0.179446,-0.073735,0.045661,0.014125,-0.103537,...,0.031256,-0.031384,0.131483,-0.032119,0.246595,-0.100467,0.057780,-0.082561,0.137494,0.086845
3,af722aad3f92dcb13b1cce4e42bd53df2f06838a,0.178800,0.073617,0.200197,0.341076,0.020205,0.259938,-0.142680,0.326670,0.014962,...,0.198502,-0.108996,0.043217,0.107301,0.063237,0.014445,-0.083892,-0.174775,0.056881,-0.044859
4,ec4cbc1723258e2e53368c3956b6fa56a9f4350b,0.106947,0.266411,-0.219826,-0.067035,0.173365,-0.075808,0.045874,0.037286,-0.032189,...,0.208828,0.031293,0.371443,-0.063884,-0.092799,0.231688,0.039727,0.035742,-0.014620,-0.155942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,b76095ab03244b1ab7ec8299dab5ccf62200353e,0.099314,0.065523,-0.154862,0.056795,0.204612,-0.053585,-0.098316,-0.057575,0.113216,...,-0.001252,-0.174563,0.250622,0.007707,-0.106421,-0.152493,-0.068266,-0.150872,-0.005443,-0.034819
996,799cba3b3038a5d0c0f9c8ec4ad6285b4bbd13ee,0.113369,-0.172311,-0.254562,0.123803,0.113247,-0.048996,-0.115077,0.199647,0.203674,...,0.038776,0.023491,0.114284,-0.064129,0.151321,0.002702,-0.232107,-0.083044,0.108000,0.051254
997,2e38ef0d35b7208ca154e6471fca6844b89dff13,0.282681,-0.145481,-0.013016,0.156001,0.286486,-0.232770,-0.009428,-0.027711,-0.163946,...,-0.153651,-0.362137,0.091989,0.091989,-0.049072,0.028173,-0.068880,0.017570,-0.032454,0.130486
998,2f8686bfb6fc22e3444efd30cf21b0114132c31f,0.124003,-0.126939,-0.211398,0.316348,0.047290,0.008551,-0.029064,0.190283,-0.013704,...,-0.222018,-0.004689,-0.046728,0.035639,0.069029,-0.123441,-0.221394,0.290110,-0.040918,0.063969


In [70]:
# Module-level plot options; the config dicts defined after this block read these values.
x = None
y = None
z = None
# Option catalogues; the selections further down pick entries by index.
marginal_list = [None,'rug', 'box', 'violin','histogram']
trendline_list = [None,'ols', 'lowess', 'rolling', 'expanding','ewm']
render_mode_list =['auto', 'svg','webgl']
# Column names of the joined frame used for colour, marker size, symbol and hover text.
category = "category"
size = "words"#"text_len"  #"text_len"#"words"  #"text_len"#
symbol = None#  "" #None #  "url_TLD"
h_name = "source_title"  #"title_quer" #None #
h_data = []#"title_quer","subjectivity","polarity","pos1","neg1","0_s_k","0_s_t"] # None# "title_quer"#None #"source_title"
c_data = None  #"word_combined_s_k"
browser = True 
# Faceting and animation options (all disabled here).
facet_r = None #"year_month" #"category"
facet_c = None
facet_rs =0.03
facet_cs =0.03
animation_frame = None#"year"#"year_month"
animation_group = None#"hash_key"
# Index 0 of each catalogue is None, i.e. no marginal plots and no trendline.
marginal_x = marginal_list[0]
marginal_y = marginal_list[0]
trendline = trendline_list[0]
log_x =False
log_y = False
log_z = False
# Index 1 selects 'svg'.
render_mode = render_mode_list[1]
size_max = 30
opacity=0.9

# Plot configurations consumed by renderAllOptions. Integer x/y/z values refer to
# the embedding columns 0, 1, 2 that generateDirRed joins onto df_main.
# NOTE: plot2D / plot3D as defined in this file only read "x", "y", "z", "c" and "size";
# the remaining keys are currently not used by them.
_2d_embd = {"x":0,"y":1,'c':category,'size':size,'title':"2d_embd"} #,
# NOTE(review): 'title' appears twice in this literal (same value both times), so the duplicate is harmless.
_2d_embdX = {"x":0,"y":1,'c':category,'size':size,'symbol':symbol, "h_name":h_name,"h_data":h_data,"c_data":c_data,"text":None, 'title':"2d_embd",
        "facet_r":facet_r,"facet_c":facet_c,"facet_cw":None,'facet_rs':facet_rs,'facet_cs':facet_cs,"browser":browser,
           "animation_frame":animation_frame,"animation_group":animation_group,"marginal_x":marginal_x,"marginal_y":marginal_y,
           "trendline":trendline,"log_x":log_x,"log_y":log_y,"render_mode":render_mode,"size_max":size_max,"opacity":opacity,
           'title':"2d_embd","xtitle":None,"ytitle":None}
          
_3d_embd = {"x":0,"y":1,"z":2,'c':category,'size':size,'title':"3d_embd"}
_3d_embdX = {"x":0,"y":1,"z":2,'c':category,'size':size,'symbol':symbol,
            "h_name":h_name,"h_data":h_data,"c_data":c_data,"text":None,"browser":browser,
            "animation_frame":animation_frame,"animation_group":animation_group,"size_max":size_max,"opacity":opacity,
            "log_x":log_x,"log_y":log_y,"log_z":log_z,
            'title':"3d_embd","xtitle":None,"ytitle":None}

# Alternative configurations kept for reference (disabled).
# _3d_pos = {"x":"tb.polaj","y":"tb.sub","z":"ts.pos",'c':category,'size':size,'symbol':symbol,
#             "h_name":h_name,"h_data":h_data,"c_data":c_data,"text":None,"browser":browser,
#             "animation_frame":animation_frame,"animation_group":animation_group,"size_max":size_max,"opacity":opacity,
#             "log_x":log_x,"log_y":log_y,"log_z":log_z,
#             'title':"3d_embd","xtitle":None,"ytitle":None}

#{"x":"tb.polaj","y":"tb.sub",'c':category,'size':size,'symbol':symbol,
#{"x":"tb.polaj","y":"tb.sub","z":"ts.pos",'c':category,'size':size,'symbol':symbol,

# _3d_embd_nlp = {"x":"tb.sub","y":"tb.polaj",'z':"ts.pos",'c':"category",'size':size,'title':"3d_embd","h_data":h_data,"size_max":size_max,"opacity":opacity,"browser":browser}
# _3d_nlp = {"x":"polarity","y":"subjectivity",'z':"compound",'c':"category",'size':size,'symbol':symbol,'title':"3d_nlp"}
# _3d_nlp_sent = {"x":"pos1","y":"neu1",'z':"neg1",'c':"category",'size':size,'symbol':symbol,'title':"3d_nlp_sent"}
# _3d_len = {"x":"words","y":"noun_phrases",'z':"sentences",'c':"category",'size':size,'symbol':symbol,'title':"3d_len"}

# Configuration lists actually rendered by the workflow cell.
_2dList = [_2d_embdX]
_3dList = [_3d_embdX]

##### 

In [71]:
def generateDirRed(mat_emb,df_main,n_components=2,norm_output=False,active_sel=[True,False,False,False]) :
    """Run the selected dimension reductions and join each result onto ``df_main``.

    ``active_sel`` holds four flags, in order: TSNE, PCA, IPCA, NNT. For every
    truthy flag the matching ``dimReduction_*`` helper is applied to
    ``mat_emb`` and the embedding is inner-joined to ``df_main`` on the index.

    Returns ``(frames, labels)``: the joined DataFrames and their method names,
    both in the order above.
    """
    # Lambdas defer each helper call until its flag has been checked.
    reducers = (
        ("TSNE", lambda: dimReduction_TSNE(mat_emb,n_components,norm_output=norm_output)),
        ("PCA", lambda: dimReduction_PCA(mat_emb,n_components,norm_output=norm_output)),
        ("IPCA", lambda: dimReduction_IPCA(mat_emb,n_components,norm_output=norm_output)),
        ("NNT", lambda: dimReduction_NNT(mat_emb,n_components,norm_output=norm_output)),
    )
    out_list = []
    out_list_label = []
    for position, (label, run_reduction) in enumerate(reducers) :
        if not active_sel[position] :
            continue
        embedding = run_reduction()
        out_list.append(df_main.join(embedding, how="inner"))
        out_list_label.append(label)

    return out_list, out_list_label
    


# Main workflow: reload the inputs, reduce the embedding to 3 components and
# hand each joined frame to the 2D and 3D renderers.
df_main = openDFcsv(folder_path_input_df,filename_input_df)
mat_emb = openDFcsv(folder_path_input_embd,filename_input_embd) #3000/5000/30000/main
# df_main.drop(["Unnamed: 0"], axis=1,inplace=True)
# Only hash_key is dropped here.
# NOTE(review): the displayed embedding table also shows an "Unnamed: 0" column; if openDFcsv
# keeps it as a data column it is fed to the reducer as a feature — confirm.
mat_emb.drop(["hash_key"], axis=1,inplace=True)
# df_main = df_main.loc[(df_main['category'] == "WORLD") | (df_main['category'] == "NATION") | (df_main['category'] == "TECHNOLOGY")]
# generateDirRed joins on the index, so this assumes row i of mat_emb belongs to row i of df_main — TODO confirm.
models_list, label_list = generateDirRed(mat_emb,df_main,3,False)
for i in range(len(models_list)) :
    renderAllOptions(models_list[i],True,_2dList,label_list[i])  #out_list_label[i])
    renderAllOptions(models_list[i],False,_3dList,label_list[i])

### Keyword functions

### Keyword Workflow

## Test

### Display Test

In [None]:
# Scratch cell: t-SNE hyper-parameter sweep over perplexity and iteration count.
# NOTE(review): folder_path_embd and filename_embd are only assigned in commented-out lines of
# the Variables cell, and plot2Dminimal is not defined in this file; as written this cell
# depends on earlier notebook state.
df_open = openDFcsv(folder_path_embd, filename_embd)
df_open.set_index("hash_key", inplace=True) # index

# np_data = modelDf(df_open,True)
for perplexity in [5,20,50] :
    for early_exaggeration in [4.0] :#[2.0,4.0,6.0] :
        for learning_rate in [700] : # [200,600,1000] :
            for n_iter in [400,1000,2500] :
                # Print the parameter combination before fitting it.
                print("perplexity",perplexity)
                print("early_exaggeration",early_exaggeration)
                print("learning_rate",learning_rate)
                print("n_iter",n_iter)
                np_data = TSNE(n_components=2,perplexity = perplexity,early_exaggeration=early_exaggeration,learning_rate=learning_rate,n_iter=n_iter, random_state=0).fit_transform(df_open)
                plot2Dminimal(np_data)

In [None]:

hash_key
index
title_quer
title_par
published
year
year_month
source_url
url_list
url_TLD
source_title
category
authors
keywords_list
text_len
sentences
noun_phrases
words
polarity
subjectivity
pos2
neg2
neg1
neu1
pos1
compound
valid
link
pk
word_count_f_t
word_combined_f_t
word_count_f_k
word_combined_f_k
word_count_s_t
word_combined_s_t
word_count_s_k
word_combined_s_k
0_s_k
0_s_t
word_combined_all
word_count_all
tb.pol
tb.sub
tb.polaj
tb.pos
tb.neg
vs.pos
vs.neu
vs.neg
vs.comp
ts.neg
ts.pos
published_date_type


In [None]:
# Scratch cell: assorted plotly experiments. Several statements rely on names
# that are not defined earlier in this cell, so it depends on notebook state.
import plotly.express as px
df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
              color='petal_length', size='petal_length', size_max=18,
              symbol='species', opacity=0.7)

# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()
# NOTE(review): the three bare names below are evaluated as expressions; they look like
# leftover notes of px keyword names and raise NameError unless defined elsewhere.
hover_name 
hover_data 
custom_data 

# X and y are first assigned further down in this cell (make_classification).
print(X)
print(type(X))
print(X.shape)
print(y)
print(type(y))
print(y.shape)
fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=y, opacity=0.8)
fig.show()
fig = px.scatter_3d(x=X[:, 4], y=X[:, 5], z=X[:, 6], color=y, opacity=0.8)
fig.show()

# Round-trip a small array through a text file and compare element-wise.
import numpy as np
a = np.array([1.3, 2.3, 3.3, 4.3])
np.savetxt('test1.txt', a, fmt='%f')
b = np.loadtxt('test1.txt', dtype=float)
a == b


# X_pca is not defined anywhere in this file.
fig = px.scatter(x=X_pca[:, 0], y=X_pca[:, 1], color=y)
fig.update_layout(
    title="PCA visualization of Custom Classification dataset",
    xaxis_title="First PCA",
    yaxis_title="Second PCA",
)
fig.show()


folder_path="C:/Users/User/OneDrive/Desktop/article/file_2/.bin/"
filename="viz_test3"
df=openDFcsv(folder_path,filename)
# display(df)
# df = px.data.iris()

# The first dict is immediately overwritten by the second one.
d = {"x":'Polarity_per_count',"y":"Subjectivity_per_count",'z':"Positivity_per_count",'c':"category",'size':"count",'symbol':"category"}
d = {"x":'polarity',"y":"subjectivity",'z':"pos2",'c':"category",'size':"words",'symbol':"url_TLD",'text':"source_title"}
fig = px.scatter_3d(df, x=d["x"], y=d["y"], z=d["z"], color=d["c"],size=d["size"],symbol=d["symbol"],text=d["text"], size_max=50, opacity=0.9)
# fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
# fig.update_layout(autosize=False, width=2000)
fig.update_layout(autosize=True) # remove height=800
# The figure is shown twice: once with the "browser" renderer, once with the default renderer.
fig.show(renderer="browser")  # remove display(fig)
fig.show()


def plotTSNE(data,n_components=2,perplexity=3,random_state=10):
    """Fit t-SNE on ``data``, print its KL divergence and show a 2D scatter.

    Only the first two embedding columns are plotted, whatever
    ``n_components`` is.
    """
    model = TSNE(n_components=n_components,perplexity=perplexity,random_state=random_state)
    embedding = model.fit_transform(data)
    print(model.kl_divergence_)
    layout = {
        "title": "t-SNE visualization of Custom Classification dataset",
        "xaxis_title": "First t-SNE",
        "yaxis_title": "Second t-SNE",
    }
    fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1]) #, color=np.array(range(69))
    fig.update_layout(**layout)
    fig.show()
    
    
# Scratch: synthetic classification data, plotly 3D examples and PCA experiments.
import plotly.express as px
from sklearn.datasets import make_classification

X, y = make_classification(
    n_features=6,
    n_classes=3,
    n_samples=1500,
    n_informative=2,
    random_state=5,
    n_clusters_per_class=1,
)


fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=y, opacity=0.8)
fig.show()





import plotly.express as px
df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species')
fig.show()

import plotly.express as px
df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
              color='petal_length', size='petal_length', size_max=18,
              symbol='species', opacity=0.7)

# tight layout
fig.update_layout(margin=dict(l=15, r=15, b=15, t=15))
fig.show()



import plotly.graph_objects as go
import numpy as np

# Helix equation
# NOTE: this rebinds x, y and z, which were set to None as plot options in an earlier cell.
t = np.linspace(0, 10, 50)
x, y, z = np.cos(t), np.sin(t), t

fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,
                                   mode='markers')])
fig.update_layout(autosize=True) # remove height=800
fig.show(renderer="browser")  # remove display(fig)
fig.show()

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# NOTE(review): at this point df is the iris frame assigned above, which includes the
# 'species' column used as a plot symbol; standardizing and PCA presumably need numeric
# columns only — confirm before running.
display(df)
df=(df - df.mean()) / df.std()
# display(df-df2)
pca = PCA(n_components=2)
pca_fit_transform = pca.fit_transform(df)
# Fits the same estimator a second time; the estimator object itself is what gets printed.
pca_fit = pca.fit(df)
print(pca_fit_transform)
print(pca_fit)
#df2=
#print(df2)

# NOTE(review): pca has n_components=2, so components_.T has two columns, while the column
# labels below are generated for len(df.columns) entries — these only match for a
# two-column df.
loadings = pd.DataFrame(pca.components_.T,
columns=['PC%s' % _ for _ in range(len(df.columns))],
index=df.columns)
print(loadings)

# df = pd.DataFrame(data=np.random.normal(0, 1, (20, 10)))
display(df)

# df_normalized is not defined anywhere in this file.
display(df_normalized)
pca = PCA(n_components=df.shape[1])
pca.fit(df_normalized)

def modelDf(df,tsne=True) :
    """Reduce ``df`` and return the transformed array.

    ``tsne`` truthy  -> 3-component t-SNE (perplexity 20, 500 iterations,
    ``random_state=10``); otherwise a 2-component PCA.
    """
    if tsne :
        reducer = TSNE(n_components=3, random_state=10,perplexity = 20,n_iter =500)
    else:
        reducer = PCA(n_components=2)
    return reducer.fit_transform(df)

# folder_path="C:/Users/User/OneDrive/Desktop/article/file_2/join_2_df/bin/"
# filename="join_2_test_old" # join_2_test2
# df=openDFcsv(folder_path,filename)
# num_sel = 1000
# df = df.head(num_sel)
# conf2 = createdefconfdict()
# conf = {"x":'words',"y":"sentences","z":"noun_phrases",'c':"category"}#,'title':"title",'xtitle':"xtitle",'ytitle':"ytitle"
# final_con = conf2 | conf
# plot3D(df,final_con)
# plot2D(df,final_con)

# Scratch: inline prototype of the threshold filter that getMostCommunKeywords implements.
# NOTE(review): `out` is not assigned earlier in this cell, and df is expected to carry a
# "count" column here — depends on notebook state.
m = np.isin(out,["digest"]).sum() ==1
print(m)
print(df["count"].sum())
source_limit = 100#int(df.shape[0]*0.001)
print(source_limit)
# Count found source_limit rows from the end; rows with a count in [that value, 1000000] are kept.
source_limiy_count = int(df.iloc[[int(-source_limit)]]["count"].tolist()[0])
df = df[df['count'].between(source_limiy_count, 1000000)]
display(df)
print(df["count"].sum())

def plot2Dmin1(data, df) :#,conf_dict={}
    """Show a 2D scatter of the first two columns of ``data`` coloured by ``df["category"]``."""
    xs = data[:, 0]
    ys = data[:, 1]
    px.scatter(x=xs, y=ys, color=df["category"]).show()
    
def plot2Dmin2(data) :
    """Show a plain 2D scatter of the first two columns of ``data``."""
    xs = data[:, 0]
    ys = data[:, 1]
    px.scatter(x=xs, y=ys).show()
    
def plot2Dmin3(df, conf_dict=None) :
    """Show a fixed-size 2000x2000 scatter of columns ``0``/``1`` of ``df``.

    Points are coloured by "category" and sized by "text_len".

    Parameters
    ----------
    df : DataFrame with columns ``0``, ``1``, "category" and "text_len".
    conf_dict : optional dict with "title", "xtitle" and "ytitle" entries.
        The function used to read an undefined ``conf_dict`` name and raise
        ``NameError``; the dict is now an optional argument and missing
        entries leave the corresponding title unset.
    """
    conf_dict = conf_dict or {}
    fig = px.scatter(df, x=0, y=1, color="category", size="text_len", size_max=100, opacity=0.5)
    fig.update_layout(title=conf_dict.get("title"), xaxis_title=conf_dict.get("xtitle"), yaxis_title=conf_dict.get("ytitle"))
    fig.update_layout(height=2000,width=2000,autosize=False) # remove height=800
    # fig.show(renderer="browser")
    fig.show()
    
def plot2Dmini2(df) :
    """Build a bare scatter of columns ``0``/``1`` of ``df`` and pass it to ``savePlot``.

    ``savePlot`` is not defined in this file (presumably provided by
    ``utils_art`` — TODO confirm).
    """
    figure = px.scatter(df, x=0, y=1)
    savePlot(figure)
    
# Scratch: Hugging Face pipeline smoke test, followed by the KNeighborsTransformer -> Isomap
# pipeline example that dimReduction_NNT is modelled on.
from transformers import pipeline

# NOTE(review): the variable is named sentiment_pipeline, but the task is whatever the
# referenced model defines — confirm it is the intended model.
sentiment_pipeline = pipeline(model="FacebookAI/roberta-large-mnli")
sp = sentiment_pipeline("test")
print(sp)

d = {"x":'Polarity_per_count',"y":"Subjectivity_per_count",'z':"Positivity_per_count",'c':"category",'size':"count",'symbol':"category"}

cache_path = tempfile.gettempdir()  # we use a temporary folder here
X, _ = make_regression(n_samples=50, n_features=25, random_state=0)
print(X.shape)
# Isomap is given metric='precomputed' because it receives the transformer's neighbour graph.
estimator = make_pipeline(
     KNeighborsTransformer(mode='distance'),
     Isomap(n_components=3, metric='precomputed'),
     memory=cache_path)
X_embedded = estimator.fit_transform(X)
# NOTE(review): this prints the input X, not X_embedded.
print(X)

In [None]:
def parse_keywords_list(df, column_name="keywords_list",titleFlag=False,entry_limit=1000,output_df=False):#51400
    """Parse the first ``entry_limit`` values of ``df[column_name]`` into keyword lists.

    Parameters
    ----------
    df : DataFrame holding the column to parse.
    column_name : column with either stringified keyword lists or titles.
    titleFlag : when truthy each value is tokenised with ``decomposeTitle``;
        otherwise it is cleaned with ``cleanString``, split on ", " and every
        piece is cleaned again (pieces that clean to "" are dropped).
    entry_limit : number of rows to parse; the result always has exactly this
        many lists (rows beyond the column's length stay empty).
    output_df : when truthy also return the lists as a DataFrame.

    Returns
    -------
    ``union_list`` (list of keyword lists), or ``(DataFrame, union_list)``
    when ``output_df`` is truthy.

    Fixes over the previous version: an entry that cleans to "" now yields an
    empty list (it used to reuse the previous row's keywords, or raise
    ``UnboundLocalError`` on the first row), and the ``output_df`` branch no
    longer references the undefined name ``par_list``.
    """
    entries = df[column_name].to_numpy()[0:entry_limit]
    union_list = [[] for _ in range(entry_limit)]
    for position, entry in enumerate(entries) :
        if titleFlag :
            union_list[position] = decomposeTitle(entry)
            continue
        cleaned_entry = cleanString(entry)
        if cleaned_entry == "" :
            # Nothing usable in this row: keep its pre-allocated empty list.
            continue
        pieces = (cleanString(piece) for piece in cleaned_entry.split(', '))
        union_list[position] = [piece for piece in pieces if piece != ""]
    if output_df :
        return pd.DataFrame(union_list), union_list
    return union_list

def flattenMatrix(mat):
    """Concatenate the inner sequences of ``mat`` into one flat list.

    Builds the result in a single pass; the previous ``out = out + li`` loop
    copied the accumulated list on every iteration (quadratic time).
    """
    return [item for row in mat for item in row]
def getMostCommunKeywords(union_list_np, source_limit=100,return_word_list=True,display=True) :
    """Keep the most frequent keywords of a flat keyword list.

    Keywords are counted and sorted by ascending count. The count found
    ``source_limit`` rows from the end becomes the threshold, and every
    keyword with at least that count is kept (so ties at the threshold can
    yield more than ``source_limit`` keywords).

    Parameters
    ----------
    union_list_np : flat sequence of keyword strings.
    source_limit : number of top keywords that defines the threshold. Values
        larger than the number of unique keywords are clamped (this used to
        raise ``IndexError``).
    return_word_list : truthy -> return the kept keywords as a numpy array,
        otherwise return the count DataFrame (index = keyword, column "count").
    display : truthy -> print before/after summary lines.

    An empty input now returns an empty result instead of failing.
    """
    df = pd.DataFrame(union_list_np, columns=['keyword'])
    df = df['keyword'].value_counts().to_frame("count").sort_values(by=['count'],ascending=True)
    count_sum_before = df["count"].sum()
    entry_sum_before = df.shape[0]
    if entry_sum_before > 0 :
        # Clamp so that asking for more keywords than exist keeps all of them.
        limit = min(int(source_limit), entry_sum_before)
        source_limiy_count = int(df["count"].iloc[-limit])
        df = df[df['count'] >= source_limiy_count]
    count_sum_after = df["count"].sum()
    entry_sum_after = df.shape[0]
    if display :
        # Guard the averages against an empty table.
        avg_before = round(count_sum_before/entry_sum_before,2) if entry_sum_before else 0
        avg_after = round(count_sum_after/entry_sum_after,2) if entry_sum_after else 0
        print("Unique keywords :",entry_sum_before,"  Sum of occurences :",count_sum_before,"  ("+str(avg_before)+" avg)")
        print("Unique keywords :",entry_sum_after,"  Sum of occurences :",count_sum_after,"  ("+str(avg_after)+" avg)")
    if return_word_list :
        return df.index.to_numpy()
    return df

def cleanString(string) :
    """Strip punctuation from both ends of ``string`` and drop short results.

    Leading/trailing bracket, quote, pipe, angle, percent, dash and slash
    characters are stripped, every apostrophe is removed, and the literal
    two-character sequence backslash + "d" is removed. Results of three
    characters or fewer are replaced by "".

    NOTE(review): ``"\\d"`` is matched literally by ``str.replace``; if digit
    removal was intended this needs a regex — confirm.
    """
    trimmed = string.strip("][”“|’><%—–//")
    without_quotes = trimmed.replace("'", "")
    cleaned = without_quotes.replace("\\d", "")
    return cleaned if len(cleaned) > 3 else ""

def decomposeTitle(string) :
    """Return the cleaned, lower-cased words of ``string`` whose tag is selected.

    ``string`` is converted with ``str`` and tagged by TextBlob; a word is kept
    when its tag is in ``TAG_SELECTION_LIST`` and ``cleanString`` does not
    reduce it to "".
    """
    kept_words = []
    for token, tag in TextBlob(str(string)).tags :
        if tag not in TAG_SELECTION_LIST :
            continue
        cleaned = str(cleanString(token.lower()))
        if cleaned != "" :
            kept_words.append(cleaned)
    return kept_words

def df_setup(par_list) :
    """Turn a list of keyword lists into a DataFrame with summary columns.

    Each inner list becomes a row whose columns are named "0", "1", ....
    Two columns are added: ``word_count`` (non-missing cells per row, computed
    before padding) and ``word_combined`` (the padded keyword cells joined
    with spaces). The frame and a ``describe()`` of ``word_count`` are shown
    before the frame is returned.
    """
    frame = pd.DataFrame(par_list)
    n_keyword_cols = frame.shape[1]
    frame.rename(columns=str, inplace=True)
    # Count before filling, otherwise the "" padding would be counted as words.
    frame['word_count'] = frame.apply(lambda row: row.count(), axis=1)
    frame = frame.fillna(value="")
    frame["0"] = frame["0"].astype(str)
    keyword_cols = list(map(str, range(n_keyword_cols)))
    frame['word_combined'] = frame[keyword_cols].agg(' '.join, axis=1)
    display(frame)
    print(frame[["word_count"]].describe())
    return frame



def genrateKeywordExtract(df_main, entry_limit=100, common_word_max = 10, column_name="keywords_list", isTitle = False):
    """Build a per-row keyword table plus its restriction to the common keywords.

    The first ``entry_limit`` rows of ``df_main[column_name]`` are parsed with
    ``parse_keywords_list`` and laid out by ``df_setup``. The most common
    keywords (threshold set by ``common_word_max``) are then intersected with
    each row's cell values, laid out by ``df_setup`` again, and joined to the
    full table with the suffixes "_f" (full) and "_s" (selected).
    """
    parsed_rows = parse_keywords_list(df_main,column_name,isTitle,entry_limit)
    full_table = df_setup(parsed_rows)

    common = getMostCommunKeywords(flattenMatrix(parsed_rows),common_word_max)
    print(common)
    common_set = set(common)
    selected_rows = [
        list(common_set & set(full_table.loc[row, :].values.flatten().tolist()))
        for row in range(entry_limit)
    ]
    selected_table = df_setup(selected_rows)
    return full_table.join(selected_table, how="inner", lsuffix='_f', rsuffix='_s')
#     plt.scatter(df.0, df.1)
    
# def plot2Dmatplotlib2(df) :
#     plt.scatter(df.0, df.1 df.text_len)


