In [1]:
# NOTE(review): `secrets` is expected to expose DB_USER / DB_PASSWORD / DB_HOST,
# so it is presumably a project-local module shadowing the stdlib one — confirm.
import secrets

import webbrowser
from urllib.parse import quote

import pandas as pd
import plotly.graph_objs as go
import sqlalchemy
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Name of the PostgreSQL database the notebook connects to.
DB_NAME = "chainsync_preprod"

# Width, in characters, of each token cut from a contract's hex string.
num_chars_per_tokens = 2

# Explorer URL prefixes (mainnet variants are active).
# NOTE(review): DB_NAME targets preprod while these point at mainnet — confirm
# which network is intended before generating links.
open_url = "https://mainnet.marlowescan.com/contractView?contractId="
cardanoscan_url = "https://cardanoscan.io/transaction/"

# Preprod equivalents, kept for quick switching:
# cardanoscan_url = 'https://preprod.cardanoscan.io/transaction/'
# open_url = 'https://preprod.marlowescan.com/contractView?contractId='


In [3]:
# Build the PostgreSQL connection URL from the credentials module.
# The user name and password are percent-encoded (safe='' also encodes '/')
# so characters such as '@', ':', '/' or '%' inside the credentials cannot be
# mistaken for URL delimiters when SQLAlchemy parses the string.
engine = sqlalchemy.create_engine(
    'postgresql://'
    + quote(secrets.DB_USER, safe='') + ':' + quote(secrets.DB_PASSWORD, safe='')
    + '@' + secrets.DB_HOST + '/' + DB_NAME
)

In [4]:
# Pull every column of analysis.contracts into memory through the engine.
df = pd.read_sql(sql='SELECT * FROM analysis.contracts', con=engine)

KeyboardInterrupt: 

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts);
# memory_usage="deep" requests the deep memory measurement rather than the estimate.
df.info(memory_usage="deep")

In [None]:
def stringToSent(val, chunk_size=None):
    """Split a string into fixed-width chunks and join them with spaces.

    Parameters
    ----------
    val : str
        The string to cut up (the notebook passes a contract's hex string).
    chunk_size : int, optional
        Number of characters per chunk. When omitted, the module-level
        ``num_chars_per_tokens`` setting is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    str
        The chunks separated by single spaces. The final chunk is shorter
        when ``len(val)`` is not a multiple of ``chunk_size``; an empty input
        yields an empty string.

    Raises
    ------
    ValueError
        If the effective chunk size is smaller than 1.
    """
    if chunk_size is None:
        chunk_size = num_chars_per_tokens
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    return ' '.join(
        val[start:start + chunk_size]
        for start in range(0, len(val), chunk_size)
    )

# Turn each hex string into a space-separated "sentence" of tokens.
# Series.map works column-wise (no per-row Series construction as with
# DataFrame.apply(axis=1)) and still returns a Series when the frame is empty.
df['tokens'] = df['hex'].map(stringToSent)

# Number of whitespace-separated tokens in each sentence.
df['length'] = df['tokens'].str.split().str.len()

In [None]:
# Learn a TF-IDF vocabulary over the token sentences and encode every
# contract as one row of the resulting matrix.
# NOTE(review): the vectorizer runs with default settings; confirm the default
# tokenisation is appropriate if num_chars_per_tokens is ever changed.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['tokens'])
print(tfidf_matrix.shape)

In [None]:
# Reduce the TF-IDF matrix to two components so each contract gets an (x, y)
# position. random_state is fixed because the default solver is randomized;
# without it the coordinates written to the database differ between runs.
# NOTE: despite the variable name `pca`, this is a truncated SVD.
clf = TruncatedSVD(n_components=2, random_state=0)
pca = clf.fit_transform(tfidf_matrix)
print(pca.shape)

# First component -> x, second component -> y.
df['x'] = pca[:, 0]
df['y'] = pca[:, 1]

In [11]:
# Keep only the identifier, block number and 2-D coordinates for export.
# No rename is needed: the selection has no "txid" column and the key is
# already called "id". The explicit copy makes df_export independent of df.
df_export = df[['id', 'blockno', 'x', 'y']].copy()
df_export

Unnamed: 0,id,blockno,x,y
0,006fb9454cc3077a249f47715d2661f3c3d1a97ce33a64...,9075008,0.841851,0.065047
1,00b2339dd126174be2bf71718749dd4794c44577b33543...,8421097,0.875163,-0.066107
2,01700747616e2a757eb56ec9ea9a825701ce89b23c4840...,8589185,0.875095,-0.035465
3,01cce8dfcfd42b4f7e07d74d350b75ba076569e0bf6f56...,8589187,0.870097,-0.097074
4,0268ada098bbec63ac00404e666a1807ecdec7df318422...,8589187,0.861854,-0.072613
...,...,...,...,...
445,fd60a7dfe7b10eab3fa308aa06c0ddd2bd6ce7f6e8ae4f...,8589188,0.886360,-0.032520
446,fd7a3bccfa5ceba311e0d1797cc7dd0ba363c93f6391b7...,8589187,0.874439,-0.068447
447,fdd3488f7f62be35ec689ab9516f2a8e7f108b961b69a6...,8421097,0.834131,-0.030260
448,fe3caea06e27c1ce3cccbda97fde4ff563f3c436eb32d6...,8589186,0.874287,-0.052896


In [16]:
# Write the coordinates to analysis.contracts_clusters, replacing any
# existing table of that name; the DataFrame index is not stored.
df_export.to_sql(
    'contracts_clusters',
    engine,
    schema='analysis',
    if_exists='replace',
    index=False,
)

450

In [8]:
# # Disabled: sample random rows and export them to CSV for manual labeling
# random_df = df.sample(n = 600)
# random_df = random_df[['txid_hex','length']]

# def appendUrl(val):
#     return open_url + val + '%231'
# random_df['url'] = df.apply(lambda row: appendUrl(row['txid_hex']),axis=1)

# random_df.to_csv('for_labeling.csv')
# # random_df

In [17]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Load the iris dataset, unpacked into the feature matrix X and label vector y.
X, y = load_iris(return_X_y=True)





In [None]:
# Fit a logistic-regression classifier on the full iris data.
# NOTE: this rebinds `clf`, which an earlier cell used for the TruncatedSVD model.
clf = LogisticRegression(random_state=0).fit(X, y)

# A cell only displays its last expression, so the intermediate results are
# printed explicitly instead of being computed and thrown away.
print(clf.predict(X[:2, :]))        # predicted classes for the first two samples
print(clf.predict_proba(X[:2, :]))  # class probabilities for the same samples

# Score on the training data (remains the cell's displayed value).
clf.score(X, y)

In [19]:
# Print the dimensions (rows, columns) of the iris feature matrix.
print(X.shape)

(150, 4)


In [4]:
# Display the iris label vector.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])