# Verify Model with MinMax

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [56]:
schemas_path = "../data/schemas.txt"
# Create the dataframe
df = pd.read_csv(schemas_path, sep=";")
df.head()

Unnamed: 0,Type,Modulus,Size,Skip Values,Skip Type,Result
0,Stripes,3,97,10,original,1
1,Stripes,19,41,2345678,original,1
2,Circles,14,16,51015,original,1
3,Circles,4,55,12,original,0
4,Circles,11,11,369121518,v2,1


In [57]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [58]:
def get_mean(data):
    """
    Take a string of numbers seperated by commas.
    Split the string by the commas.
    Find the mean of the numbers.
    """
    numbers = list(map(int, data.split(",")))
    return np.mean(numbers)

In [59]:
def get_average(data):
    """
    Take a string of numbers seperated by commas.
    Splt the string by the commas.
    Find the average of the numbers
    """
    numbers = list(map(int, data.split(",")))
    return np.average(numbers)

In [60]:
def get_type(data):
    """
    Return the Schema type as a number.
    Circles => 0
    Curves => 1
    Squares => 2
    Stripes => 3
    """
    if data == "Circles":
        return 0
    elif data == "Curves":
        return 1
    elif data == "Squares":
        return 2
    elif data == "Stripes":
        return 3
    elif data == 'Space':
        return 4

In [61]:
def get_skip_type(data):
    """
    Return the skip type as a number.
    original => 0
    v2 => 1
    """
    if data == "original":
        return 0
    elif data == "v2":
        return 1

In [62]:
# Format data

# Find the average of the skip values
df['Skip Values'] = df['Skip Values'].map(lambda x: get_mean(x))
df['Type'] = df['Type'].map(lambda x: get_type(x))
# df.Type = df.Type.map({'Circles': 0, 'Curves': 1, 'Squares': 2, 'Stripes': 3})
df.Result = df.Result.map(lambda x: 1 if x == 0 else 0)
df['Skip Type'] = df['Skip Type'].map(lambda x: get_skip_type(x))

In [63]:
scaler = MinMaxScaler()
knn = KNeighborsClassifier(n_neighbors=8)
kmeans = KMeans(n_clusters=8)

In [64]:
# Cluster the data
features = ["Modulus", "Size", "Skip Values", "Type", "Skip Type"]
kmeans.fit(df[features])
df["Cluster"] = kmeans.predict(df[features])
df.head()

Unnamed: 0,Type,Modulus,Size,Skip Values,Skip Type,Result,Cluster
0,3,3,97,10.0,0,0,1
1,3,19,41,5.0,0,0,4
2,0,14,16,10.0,0,0,2
3,0,4,55,1.5,0,1,4
4,0,11,11,10.5,1,0,0


In [65]:
# df = df.drop(columns=['Modulus', 'Size', 'Skip Values'], axis=1)
features = [["Modulus", "Size", "Skip Values", "Type", "Cluster"]]
for feature in features:
    df[feature] = scaler.fit_transform(df[feature])
df.head()
# df = df.drop(columns=["Modulus", "Size", "Skip Values"], axis=1)
# df.head()

Unnamed: 0,Type,Modulus,Size,Skip Values,Skip Type,Result,Cluster
0,0.75,0.037037,0.979592,0.204545,0,0,0.142857
1,0.75,0.62963,0.408163,0.090909,0,0,0.571429
2,0.0,0.444444,0.153061,0.204545,0,0,0.285714
3,0.0,0.074074,0.55102,0.011364,0,1,0.571429
4,0.0,0.333333,0.102041,0.215909,1,0,0.0


In [66]:
# Seperate data
X = df.drop(columns=["Result"], axis=1)
y = df.Result
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [67]:
knn.fit(X_train.values, y_train.values)
y_pred = knn.predict(X_test.values)

In [68]:
# Check accuracy, score, and roc_auc_score
print(accuracy_score(y_test.values, y_pred))
print(knn.score(X_test.values, y_test.values))
print(roc_auc_score(y_test.values, y_pred))

0.7754172989377845
0.7754172989377845
0.6856771277502984
