In [None]:
import json
import pprint
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation  
import matplotlib.pyplot as plt  
from sklearn.metrics import classification_report, confusion_matrix

# Shared pretty-printer (4-space indent) used by later cells to dump the per-id count dicts.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Load the freckle annotations. `names=` supplies the four column names explicitly.
# NOTE(review): both paths are absolute locations under /root/data; consider a configurable data dir.
# Alternative (smaller?) dataset, kept disabled -- TODO confirm whether it is still needed:
#df = pd.read_csv('/root/data/small_pen_data_collection/freckles.csv', names=['id', 'file', 'eye_coordinate', 'freckle_coordinates'])
df = pd.read_csv('/root/data/reidentification/freckles.csv', names=['id', 'file', 'eye_coordinate', 'freckle_coordinates'])

In [None]:
# Quick sanity check of the loaded table.
# A notebook cell only renders its final expression, so the first row has to
# be printed explicitly; the row count is left as the cell's displayed value.
print(df.loc[0])

len(df)

In [None]:
def get_id_counts(frame=None):
    """Summarise, per id, how many rows exist and how many freckles they carry.

    One line per id is printed as a side effect.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with an 'id' column and a 'freckle_coordinates' column holding
        JSON-encoded lists. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        Maps each id to ``(row_count, mean, p10, p90)`` where the three
        statistics describe the number of freckles per row, computed only
        over rows that have at least one freckle. They are NaN for an id
        that has no such row.
    """
    if frame is None:
        frame = df

    id_count = {}

    for fish_id in np.unique(frame['id']):
        rows = frame[frame['id'] == fish_id]

        # Parse every row once and keep only the freckle counts.
        freckle_counts = [len(json.loads(coordinate)) for coordinate in rows['freckle_coordinates']]
        non_zero_counts = [count for count in freckle_counts if count > 0]

        if non_zero_counts:
            avg_coordinate_length = np.mean(non_zero_counts)
            coordinate_10 = np.percentile(non_zero_counts, 10)
            coordinate_90 = np.percentile(non_zero_counts, 90)
            print('%i, %i, %i, %i' % (fish_id, avg_coordinate_length, coordinate_10, coordinate_90))
        else:
            # No usable rows: mean/percentile of an empty list are undefined and
            # '%i' cannot format NaN, so report the id without statistics.
            avg_coordinate_length = coordinate_10 = coordinate_90 = float('nan')
            print('%i, n/a, n/a, n/a' % (fish_id, ))

        id_count[fish_id] = (len(rows), avg_coordinate_length, coordinate_10, coordinate_90)

    return id_count

# Prints one summary line per id; the returned dict is the cell's displayed value.
get_id_counts()

In [None]:
# Single-sample walkthrough: pick every row belonging to one hard-coded id.
same_fish = df[df['id'] == 181016010007]

In [None]:
# Loop over all rows of this id is disabled; only the first row is examined below.
#for datum in same_fish:

datum = same_fish.iloc[0]

In [None]:
# Both columns hold JSON text; decode them into Python lists.
# presumably eye_coordinate is one [x, y] pair and freckle_coordinates a list of such pairs -- TODO confirm
eye_coordinate = json.loads(datum['eye_coordinate'])
freckle_coordinates = json.loads(datum['freckle_coordinates'])

In [None]:
# Fit a 2-component PCA on this sample's freckle positions only.
pca = PCA(n_components=2)

pca.fit(freckle_coordinates)

components = pca.components_

# Project eye and freckles onto the principal axes with a plain dot product
# (np.dot rather than pca.transform), so no centring is applied at this step.
newEyeCoordinates = np.dot(eye_coordinate, components.T)
newFreckleCoordinates = np.dot(freckle_coordinates, components.T)

In [None]:
# Display the projected eye position.
newEyeCoordinates

In [None]:
# Express every freckle relative to the eye, in the PCA-aligned frame.
relativeCoordinates = newFreckleCoordinates - newEyeCoordinates

mean = np.mean(relativeCoordinates, axis=0)
stdev = np.std(relativeCoordinates, axis=0)
# An axis with no spread would make the division below produce NaN/inf;
# divide by 1 there so the (already zero-centred) values simply stay at 0.
stdev = np.where(stdev == 0, 1.0, stdev)

# Per-axis standardisation (zero mean, unit variance).
normalizedCoordinates = (relativeCoordinates - mean) / stdev

In [None]:
# Cluster the standardised freckle positions into 5 groups (seeded for repeatability).
kmeans = KMeans(n_clusters=5, random_state=0).fit(normalizedCoordinates)

In [None]:
# Flatten the cluster centres into one vector -- the same representation generate_data stores per row.
kmeans.cluster_centers_.flatten()

In [None]:
# NOTE(review): `classifier` is not referenced by any later cell visible in this notebook.
classifier = KNeighborsClassifier(n_neighbors=5)  

In [None]:
import pdb;

def _freckle_features(eye_coordinate, freckle_coordinates, n_clusters, random_state=0):
    """Turn one row's eye/freckle annotations into a flat vector of cluster centres."""
    pca = PCA(n_components=2)
    pca.fit(freckle_coordinates)
    components = pca.components_

    # Project onto the freckle cloud's principal axes, then express each
    # freckle relative to the projected eye position.
    newEyeCoordinates = np.dot(eye_coordinate, components.T)
    newFreckleCoordinates = np.dot(freckle_coordinates, components.T)
    relativeCoordinates = newFreckleCoordinates - newEyeCoordinates

    mean = np.mean(relativeCoordinates, axis=0)
    stdev = np.std(relativeCoordinates, axis=0)
    # An axis with no spread would turn the division into NaN/inf, which
    # KMeans rejects; dividing by 1 leaves those values at 0 instead.
    stdev = np.where(stdev == 0, 1.0, stdev)
    normalizedCoordinates = (relativeCoordinates - mean) / stdev

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(normalizedCoordinates)
    centers = kmeans.cluster_centers_

    # KMeans numbers its clusters arbitrarily. Sorting the centres (first
    # coordinate, then second) puts them in a canonical order so that the
    # same column of the feature matrix means the same thing for every row.
    order = np.lexsort((centers[:, 1], centers[:, 0]))
    return centers[order].flatten()


def generate_data(n_clusters, lower_thresh, higher_thresh, frame=None, random_state=0):
    """Build the feature matrix and labels used by the classifiers.

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters fitted per row.
    lower_thresh, higher_thresh : int
        Inclusive bounds on the number of freckles a row must have to be kept.
    frame : pandas.DataFrame, optional
        Table with 'id', 'eye_coordinate' and 'freckle_coordinates' columns,
        the latter two JSON-encoded. Defaults to the notebook-level ``df``.
    random_state : int or None, optional
        Seed forwarded to KMeans so repeated calls give identical features.

    Returns
    -------
    (data, ids)
        ``data`` is a 2-D array with one row of ``2 * n_clusters`` values per
        kept input row (zero rows when nothing qualifies). ``ids`` is a list
        of ``(id, freckle_count)`` tuples aligned with the rows of ``data``.
    """
    if frame is None:
        frame = df

    # PCA needs at least two points and KMeans at least n_clusters points.
    min_points = max(lower_thresh, n_clusters, 2)

    rows = []
    ids = []

    for index, datum in frame.iterrows():
        if index % 500 == 0:
            print('Processing %i out of %i' % (index, len(frame)))

        eye_coordinate = json.loads(datum['eye_coordinate'])
        freckle_coordinates = json.loads(datum['freckle_coordinates'])

        if len(freckle_coordinates) < min_points or len(freckle_coordinates) > higher_thresh:
            continue

        rows.append(_freckle_features(eye_coordinate, freckle_coordinates, n_clusters, random_state))
        ids.append((datum['id'], len(freckle_coordinates)))

    # Stack once at the end: cheaper than growing an array inside the loop and
    # always 2-D, even when only one (or no) row qualified.
    # NOTE: an earlier draft oversampled under-represented ids at this point;
    # that logic was disabled and is not applied.
    if rows:
        data = np.vstack(rows)
    else:
        data = np.empty((0, 2 * n_clusters))

    return (data, ids)

In [None]:
# Build the feature matrix with 4 clusters per row, keeping rows that have
# between 20 and 100 freckles (both bounds inclusive).
X, y = generate_data(4, 20, 100)

In [None]:
# Display the feature matrix (one flattened set of cluster centres per kept row).
X

In [None]:
def run_freckle_detection(n_clusters, max_n_neighbors, n_trials=49,
                          lower_thresh=20, higher_thresh=100, test_size=0.2):
    """Plot the mean KNN misclassification rate as a function of K.

    The dataset is regenerated with ``generate_data`` and, for every K, the
    error is averaged over ``n_trials`` independent random train/test splits.

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters per row, forwarded to ``generate_data``.
    max_n_neighbors : int
        Exclusive upper bound for K; values 1 .. max_n_neighbors - 1 are tried.
    n_trials : int, optional
        Random splits evaluated per K.
    lower_thresh, higher_thresh : int, optional
        Freckle-count bounds forwarded to ``generate_data``.
    test_size : float, optional
        Fraction of samples held out in each split.

    NOTE(review): ``sklearn.cross_validation`` was removed in scikit-learn
    0.20; ``train_test_split`` now lives in ``sklearn.model_selection``.
    Migrating requires changing the notebook's import cell as well.
    """
    print('Generating dataset...')

    X, y = generate_data(n_clusters, lower_thresh, higher_thresh)

    # Only the id is a class label; the freckle count stored next to it is
    # not needed here. Extract the labels once, outside both loops.
    labels = np.array([label[0] for label in y])

    k_values = range(1, max_n_neighbors)
    error = []

    for k in k_values:
        print('Running for %i neighbors' % (k, ))

        trial_errors = []

        for _ in range(n_trials):
            X_train, X_test, y_train_id, y_test_id = cross_validation.train_test_split(
                X, labels, test_size=test_size)

            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train_id)
            pred = knn.predict(X_test)
            trial_errors.append(np.mean(pred != y_test_id))

        error.append(np.mean(trial_errors))

    plt.figure(figsize=(12, 6))
    plt.plot(k_values, error, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('Error Rate K Value')
    plt.xlabel('K Value')
    plt.ylabel('Mean Error')

In [None]:
# Sweep K = 1..19 on features built from 4 cluster centres per row.
run_freckle_detection(4, 20)

In [None]:
# Rebuild the dataset with the same arguments as the earlier generate_data(4, 20, 100) cell;
# the X and y assigned here are the ones the evaluation cells below consume.
X, y = generate_data(4, 20, 100)

In [None]:
# Single hold-out evaluation with a 1-nearest-neighbour classifier.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

# Every label is an (id, freckle_count) pair; split it into its two parts.
# The comprehension variable is deliberately not called `y`, so the dataset
# labels cannot be shadowed or clobbered.
y_train_id = np.array([ label[0] for label in y_train ])
y_test_id = np.array([ label[0] for label in y_test ])
y_test_len = np.array([ label[1] for label in y_test ])

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train_id)
y_pred = knn.predict(X_test)

# Later cells read these two names. Assign them directly: concatenating onto
# an empty list would upcast the integer ids to float.
all_y_test_id = y_test_id
all_y_pred = y_pred

# Freckle counts of the test samples, split by whether the prediction was right.
success_failure = y_pred == y_test_id
success_lens = y_test_len[success_failure]
failure_lens = y_test_len[~success_failure]

print(classification_report(all_y_test_id, all_y_pred))

In [None]:

# One row per test sample: predicted id, true id, hit (1) / miss (0), freckle count.
print(np.vstack((y_pred, y_test_id, success_failure, y_test_len)).T)

# Print both numbers: a bare expression that is not the last line of a cell
# is evaluated and then silently dropped.
n_correct = np.sum(all_y_test_id == all_y_pred)
print('%i correct out of %i' % (n_correct, len(all_y_pred)))

In [None]:
# Count how many samples of each id survived the freckle-count filter.
id_count = {}

for label in y:
    # label is an (id, freckle_count) pair. The loop variable is not named
    # `id`, which would replace the builtin for the rest of the session.
    fish_id = label[0]
    id_count[fish_id] = id_count.get(fish_id, 0) + 1

pp.pprint(id_count)

# For comparison: per-id statistics over the unfiltered table.
pp.pprint(get_id_counts())

In [None]:
# Does the number of freckles on a test sample relate to a correct prediction?
plt.scatter(y_test_len, y_pred == y_test_id)
plt.xlabel('Freckle count')
plt.ylabel('Correct prediction (1) / miss (0)')
plt.show()

percentiles = []
success_percentiles = []
failure_percentiles = []

# Compare the freckle-count distributions of hits and misses at the
# 0th, 5th, ..., 95th percentiles.
for i in range(0, 100, 5):
    p1 = np.percentile(success_lens, i)  # i-th percentile among correct predictions
    p2 = np.percentile(failure_lens, i)  # i-th percentile among wrong predictions

    percentiles.append(i)
    success_percentiles.append(p1)
    failure_percentiles.append(p2)

    print('%0.2f: %0.2f, %0.2f' % (i, p1, p2))

plt.plot(percentiles, success_percentiles, label='correct')
plt.plot(percentiles, failure_percentiles, label='incorrect')
plt.xlabel('Percentile')
plt.ylabel('Freckle count')
plt.legend()
plt.show()