In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm, colors
import glob

from numba import njit, vectorize, float32

from typing import Callable, Optional, Union

import hoomd
import gsd.hoomd

import sys
import time
import pickle
import gc
import pathlib
import os

import signac

from scipy import optimize

os.environ["POLARS_MAX_THREADS"] = "8"
import polars as pl

In [2]:
from monk import nb, prep, pair, render, utils, grid, workflow
import freud

parent = pathlib.Path(os.getcwd()).parent / "config.yaml"
config = workflow.get_config(parent.as_posix())

In [3]:
project: signac.Project = signac.get_project(root=config['root'])
project.doc

{'avail_seed': 11, 'dt': 0.005, 'step_unit': 1000, 'temp_steps': 20, 'equil_time': 40, 'run_time': 10, 'max_alpha_time': 1000, 'alpha_iters': 10, '_status': {}}

In [10]:

soft = []
hard = []

for i, job in enumerate(project):

    if "WCA" in job.sp["pot"]:
        continue

    print(i, job.sp, job.sp["pot"])
    files = glob.glob(job.fn("fixed-analysis/*.parquet"))

    for file in sorted(files):
        temp = float(file.split("/")[-1].split("-")[-1].split(".parquet")[0])
        if temp != 0.337:
            continue
        print(temp)
        df = pl.read_parquet(file, parallel=True, use_pyarrow=True)
        soft.append(df.filter(
            pl.col("phop").gt(0.2),
            pl.col("type").eq(0)
        ))
        hard.append(df.filter(
            pl.col("phop").lt(0.001),
            pl.col("type").eq(0)
        ))
        # ys.append((df["phop"] > 0.2).mean())
        break
    # break



0 {'N': 32768, 'replica': 0, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
1 {'N': 32768, 'replica': 1, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
3 {'N': 32768, 'replica': 8, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
4 {'N': 32768, 'replica': 9, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
5 {'N': 32768, 'replica': 3, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
6 {'N': 32768, 'replica': 2, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
7 {'N': 32768, 'replica': 7, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
8 {'N': 32768, 'replica': 5, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
9 {'N': 32768, 'replica': 6, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
10 {'N': 32768, 'replica': 4, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337


In [20]:
N = 3800
soft_df = pl.concat(soft)
print(soft_df.shape)
soft_df = soft_df.sample(N)
hard_df = pl.concat(hard)
print(hard_df.shape)
hard_df = hard_df.sample(N)
soft_df = soft_df.with_columns(
    y = 1
)
hard_df = hard_df.with_columns(
    y = 0
)

analysis_df = pl.concat([soft_df, hard_df])
# permute
# analysis_df = analysis_df.shuffle()
soft_df.shape, hard_df.shape

(3857, 5)
(26439, 5)


((3800, 6), (3800, 6))

In [28]:
analysis_df.head()

frame,tag,type,phop,sf,y
i64,i64,i64,f32,list[f32],i32
250,8890,0,0.233799,"[0.0, 0.0, … 1.439387]",1
700,24458,0,0.279271,"[0.0, 0.0, … 2.736721]",1
450,28270,0,0.225438,"[0.0, 0.0, … 3.106009]",1
500,25317,0,0.323837,"[0.0, 0.0, … 2.147696]",1
250,14245,0,0.223815,"[0.0, 0.0, … 2.609222]",1


In [21]:
analysis_df["y"].mean()

0.5

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
X = analysis_df["sf"].to_numpy()
y = analysis_df["y"].to_numpy()
# X, y
X, y = shuffle(X, y, random_state=0)
X = list(X)
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svc = LinearSVC(max_iter=10000)
svc.fit(X_train, y_train)

In [27]:
# get accuracy
from sklearn.metrics import accuracy_score
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7368421052631579

In [29]:
y_train.mean()

0.500328947368421

In [30]:
# get confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[580, 182],
       [218, 540]])

In [31]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', scaler), ('svc', svc)])
y_pred_pipe = pipe.predict(scaler.inverse_transform(X_test))

In [32]:
(y_pred_pipe == y_pred).all()

True

In [33]:
with open("svc_type0.pkl", "wb") as fp:
    pickle.dump(pipe, fp)

In [34]:

soft = []
hard = []

for i, job in enumerate(project):

    if "WCA" in job.sp["pot"]:
        continue

    print(i, job.sp, job.sp["pot"])
    files = glob.glob(job.fn("fixed-analysis/*.parquet"))

    for file in sorted(files):
        temp = float(file.split("/")[-1].split("-")[-1].split(".parquet")[0])
        if temp != 0.337:
            continue
        print(temp)
        df = pl.read_parquet(file, parallel=True, use_pyarrow=True)
        soft.append(df.filter(
            pl.col("phop").gt(0.2),
            pl.col("type").eq(1)
        ))
        hard.append(df.filter(
            pl.col("phop").lt(0.001),
            pl.col("type").eq(1)
        ))
        # ys.append((df["phop"] > 0.2).mean())
        break
    # break



0 {'N': 32768, 'replica': 0, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
1 {'N': 32768, 'replica': 1, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
3 {'N': 32768, 'replica': 8, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
4 {'N': 32768, 'replica': 9, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
5 {'N': 32768, 'replica': 3, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
6 {'N': 32768, 'replica': 2, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
7 {'N': 32768, 'replica': 7, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
8 {'N': 32768, 'replica': 5, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
9 {'N': 32768, 'replica': 6, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337
10 {'N': 32768, 'replica': 4, 'rho': 1.2, 'pot': 'KA_LJ'} KA_LJ
0.337


In [35]:
N = 3800
soft_df = pl.concat(soft)
print(soft_df.shape)
soft_df = soft_df.sample(N)
hard_df = pl.concat(hard)
print(hard_df.shape)
hard_df = hard_df.sample(N)
soft_df = soft_df.with_columns(
    y = 1
)
hard_df = hard_df.with_columns(
    y = 0
)

analysis_df = pl.concat([soft_df, hard_df])
# permute
# analysis_df = analysis_df.shuffle()
soft_df.shape, hard_df.shape

(5812, 5)
(17243, 5)


((3800, 6), (3800, 6))

In [36]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
X = analysis_df["sf"].to_numpy()
y = analysis_df["y"].to_numpy()
# X, y
X, y = shuffle(X, y, random_state=0)
X = list(X)
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svc = LinearSVC(max_iter=10000)
svc.fit(X_train, y_train)



In [37]:
# get accuracy
from sklearn.metrics import accuracy_score
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7625

In [38]:
y_train.mean()

0.500328947368421

In [39]:
# get confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[579, 183],
       [178, 580]])

In [40]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', scaler), ('svc', svc)])
y_pred_pipe = pipe.predict(scaler.inverse_transform(X_test))

In [41]:
(y_pred_pipe == y_pred).all()

True

In [42]:
with open("svc_type1.pkl", "wb") as fp:
    pickle.dump(pipe, fp)