# Verify Model with MinMax

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
schemas_path = "../data/schemas.txt"
# Create the dataframe
df = pd.read_csv(schemas_path, sep=";")
df.head()

Unnamed: 0,Type,Modulus,Size,Skip Values,Skip Type,Result
0,Circles,3,52,51015,original,0
1,Circles,6,6,2345678,v2,1
2,Circles,16,4,1020304050607080,original,0
3,Circles,3,40,510152025,original,0
4,Circles,15,55,2345678,original,0


In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [4]:
def get_mean(data):
    """
    Take a string of numbers seperated by commas.
    Split the string by the commas.
    Find the mean of the numbers.
    """
    numbers = list(map(int, data.split(",")))
    return np.mean(numbers)

In [5]:
def get_average(data):
    """
    Take a string of numbers seperated by commas.
    Splt the string by the commas.
    Find the average of the numbers
    """
    numbers = list(map(int, data.split(",")))
    return np.average(numbers)

In [6]:
def get_type(data):
    """
    Return the Schema type as a number.
    Circles => 0
    Curves => 1
    Squares => 2
    Stripes => 3
    """
    if data == "Circles":
        return 0
    elif data == "Curves":
        return 1
    elif data == "Squares":
        return 2
    elif data == "Stripes":
        return 3
    elif data == 'Space':
        return 4
    elif data == 'GSquares':
        return 5

In [7]:
def get_skip_type(data):
    """
    Return the skip type as a number.
    original => 0
    v2 => 1
    """
    if data == "original":
        return 0
    elif data == "v2":
        return 1

In [8]:
# Format data

# Find the average of the skip values
df['Skip Values'] = df['Skip Values'].map(lambda x: get_mean(x))
df['Type'] = df['Type'].map(lambda x: get_type(x))
# df.Type = df.Type.map({'Circles': 0, 'Curves': 1, 'Squares': 2, 'Stripes': 3})
df.Result = df.Result.map(lambda x: 1 if x == 0 else 0)
df['Skip Type'] = df['Skip Type'].map(lambda x: get_skip_type(x))

In [9]:
scaler = MinMaxScaler()
knn = KNeighborsClassifier(n_neighbors=8)
kmeans = KMeans(n_clusters=8)

In [10]:
# Cluster the data
features = ["Modulus", "Size", "Skip Values", "Type", "Skip Type"]
kmeans.fit(df[features])
df["Cluster"] = kmeans.predict(df[features])
df.head()

Unnamed: 0,Type,Modulus,Size,Skip Values,Skip Type,Result,Cluster
0,0,3,52,10.0,0,1,6
1,0,6,6,5.0,1,0,2
2,0,16,4,45.0,0,1,5
3,0,3,40,15.0,0,1,6
4,0,15,55,5.0,0,1,3


In [11]:
# df = df.drop(columns=['Modulus', 'Size', 'Skip Values'], axis=1)
features = [["Modulus", "Size", "Skip Values", "Type", "Cluster"]]
for feature in features:
    df[feature] = scaler.fit_transform(df[feature])
df.head()
# df = df.drop(columns=["Modulus", "Size", "Skip Values"], axis=1)
# df.head()

Unnamed: 0,Type,Modulus,Size,Skip Values,Skip Type,Result,Cluster
0,0.0,0.037037,0.520408,0.204545,0,1,0.857143
1,0.0,0.148148,0.05102,0.090909,1,0,0.285714
2,0.0,0.518519,0.030612,1.0,0,1,0.714286
3,0.0,0.037037,0.397959,0.318182,0,1,0.857143
4,0.0,0.481481,0.55102,0.090909,0,1,0.428571


In [12]:
# Seperate data
X = df.drop(columns=["Result"], axis=1)
y = df.Result
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [13]:
knn.fit(X_train.values, y_train.values)
y_pred = knn.predict(X_test.values)

In [14]:
# Check accuracy, score, and roc_auc_score
print(accuracy_score(y_test.values, y_pred))
print(knn.score(X_test.values, y_test.values))
print(roc_auc_score(y_test.values, y_pred))

0.8058035714285714
0.8058035714285714
0.7868259563911738
