# Clustering

### K-means

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import random

In [None]:
def generateColors(n):
    """Return a list of ``n`` random colours as ``'#RRGGBB'`` hex strings.

    Each colour draws three integers in ``[0, 255]`` (red, green, blue, in
    that order) from the module-level ``random`` generator.
    """
    return [
        '#%02X%02X%02X' % (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255),
        )
        for _ in range(n)
    ]

In [None]:
# make_blobs is imported from sklearn.datasets: the old
# sklearn.datasets.samples_generator module path was removed from scikit-learn.
from sklearn.datasets import make_blobs

# 100 two-dimensional points drawn around 2 centers; y holds each point's blob index.
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=6)

In [None]:
# Raw points, drawn without any cluster colouring
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Same points, coloured by the labels y that came with the generated data
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    cmap=ListedColormap(('red', 'green')),
)

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with 2 clusters, then colour every point by its predicted cluster.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
y_hat = kmeans.predict(X)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_hat,
    cmap=ListedColormap(('red', 'green')),
)

### Let's try with 3 clusters

In [None]:
### Make clusters
# make_blobs is imported from sklearn.datasets: the old
# sklearn.datasets.samples_generator module path was removed from scikit-learn.
from sklearn.datasets import make_blobs

# 100 two-dimensional points drawn around 3 centers; y holds each point's blob index.
X, y = make_blobs(n_samples=100, centers=3, n_features=2, random_state=1)

In [None]:
# Raw points, drawn without any cluster colouring
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Same points, coloured by the labels y that came with the generated data
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

### Let's try k-means

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with 3 clusters, then colour every point by its predicted cluster.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)
y_hat = kmeans.predict(X)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_hat,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

### Harder version of 3 clusters

In [None]:
# 3 clusters Harder
### Make clusters
# make_blobs is imported from sklearn.datasets: the old
# sklearn.datasets.samples_generator module path was removed from scikit-learn.
from sklearn.datasets import make_blobs

# Same settings as the previous 3-center example, but a different random_state.
X, y = make_blobs(n_samples=100, centers=3, n_features=2, random_state=2)

In [None]:
# Raw points, drawn without any cluster colouring
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Same points, coloured by the labels y that came with the generated data
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with 3 clusters, then colour every point by its predicted cluster.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)
y_hat = kmeans.predict(X)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_hat,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

### Try an even harder example

In [None]:
# make_blobs is imported from sklearn.datasets: the old
# sklearn.datasets.samples_generator module path was removed from scikit-learn.
from sklearn.datasets import make_blobs

# 100 two-dimensional points drawn around 3 centers; y holds each point's blob index.
X, y = make_blobs(n_samples=100, centers=3, n_features=2, random_state=55)

In [None]:
# Raw points, drawn without any cluster colouring
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Same points, coloured by the labels y that came with the generated data
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

### Let's look at what k-means thinks

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with 3 clusters, then colour every point by its predicted cluster.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)
y_hat = kmeans.predict(X)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_hat,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

### Let's try with circles

In [None]:
from sklearn.datasets import make_circles

# 1000 noisy points generated by make_circles, coloured by the labels y it returns.
X, y = make_circles(n_samples=1000, noise=0.03, random_state=0)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    cmap=ListedColormap(('red', 'blue')),
)

In [None]:
# Fit k-means with 2 clusters on the circle data and colour points by predicted cluster.
# KMeans is expected to be in scope already from an earlier cell.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
y_hat = kmeans.predict(X)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_hat,
    cmap=ListedColormap(('red', 'blue')),
)

In [None]:
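### We can see that k-means is not the best way to cluster this type of data:
### it assigns each point to its nearest centroid, so it cannot separate one ring from another ring that surrounds it.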

# Assignment: Try to guess how many clusters are in this graph

In [None]:
import pickle

# Snippet that produced clusters.pickle, kept for reference only.
# NOTE(review): the `centers` value below is presumably a placeholder so the
# answer to the assignment is not given away - confirm before re-running it.
#from sklearn.datasets.samples_generator import make_blobs
#X, y = make_blobs(n_samples=100, centers=5678, n_features=2, random_state=55)
#with open('clusters.pickle', 'wb') as f:
#    pickle.dump([X, y], f)

# NOTE: pickle.load can execute arbitrary code stored in the file; only open a
# clusters.pickle that comes from a trusted source.
with open('clusters.pickle', 'rb') as f:
    X, y = pickle.load(f)

# Discard the stored labels so the number of clusters has to be guessed.
y = None

In [None]:
# Unlabelled points loaded from clusters.pickle (no cluster colouring applied)
plt.scatter(X[:, 0], X[:, 1])

### Try to see what kmeans thinks

In [None]:
# Assignment cell: change n_clusters to your guess and re-run.
# The colour map is built from 1000 random colours produced by generateColors.
kmeans = KMeans(n_clusters=1, random_state=0)
kmeans.fit(X)
y_hat = kmeans.predict(X)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_hat,
    cmap=ListedColormap(generateColors(1000)),
)

# Genetic Algorithm

In [None]:
#genetic algorithm
import numpy as np

# Target string that the population is evolved towards.
#Try to change this:
ref = "THIS IS A TEST!!"

# Prefer the optional `randomstate` package when it is installed. When it is
# missing, fall back to NumPy's own generators instead of exiting, so the
# notebook keeps running.
try:
    from randomstate.prng.pcg64 import RandomState
except ImportError:
    RandomState = None
    print("randomstate is not installed; falling back to numpy.random.default_rng")

# Alphabet that mutations draw replacement characters from.
gene_bases = list(' ABCDEFGHIJKLMNOPQRSTUVWXYZ!?@#$%^&*()')

random_seed = 3

# One pseudo-random generator per individual in a generation.
size_of_generation = 1000
if RandomState is not None:
    prngs = [RandomState(random_seed, i) for i in range(size_of_generation)]
else:
    # Seeding with [i, random_seed] gives every individual its own
    # reproducible, independent stream derived from the same root seed.
    prngs = [np.random.default_rng([i, random_seed])
             for i in range(size_of_generation)]

def mutate(gene, prng, mutation_rate=0.05):
    """Return a copy of ``gene`` in which some characters are randomly replaced.

    For every character, one ``prng.uniform()`` draw is compared with
    ``mutation_rate``; when the draw is smaller, the character is replaced by
    ``prng.choice(gene_bases)``, otherwise it is kept unchanged.
    """
    return ''.join(
        prng.choice(gene_bases) if prng.uniform() < mutation_rate else base
        for base in gene
    )


def fitness(gene, reference=ref):
    """Count the positions at which ``gene`` and ``reference`` hold the same character.

    The two strings are compared pairwise with ``zip``, so only positions
    present in both are considered.
    """
    matches = (1 for got, wanted in zip(gene, reference) if got == wanted)
    return sum(matches)

def new_population(parent, mutation_rate=0.05):
    """Return one mutated copy of ``parent`` per generator in the module-level ``prngs``."""
    offspring = []
    for prng in prngs:
        offspring.append(mutate(parent, prng, mutation_rate=mutation_rate))
    return offspring

def best_in_population(population):
    """Return the individual of ``population`` with the highest ``fitness`` score."""
    scores = [fitness(candidate) for candidate in population]
    fittest_index = np.argmax(scores)
    return population[fittest_index]

def get_next_parent(parent, mutation_rate=0.05):
    """Breed a new population from ``parent`` and return its fittest member."""
    offspring = new_population(parent, mutation_rate=mutation_rate)
    return best_in_population(offspring)

def weasel_program(mutation_rate=0.05,initial=' '*len(ref)):#must be same size as text we want
    """Evolve ``initial`` generation by generation until every position scores a match.

    Each round prints the generation number, the current parent and its
    fitness, then replaces the parent with the result of ``get_next_parent``.
    The final parent is printed once the loop ends.

    Args:
        mutation_rate: per-character mutation probability, forwarded to
            ``get_next_parent`` for every generation.
        initial: starting string. The loop runs while the score is below
            ``len(initial)``, so ``initial`` should have the same length as
            the target text.
    """
    parent = initial
    score = fitness(parent)
    generation = 0
    while score < len(parent):
        print ('%3d  %s  (%d)' % (generation, parent, score))
        # Forward the caller's mutation_rate; previously it was accepted but
        # never passed on, so the default rate was always used.
        parent = get_next_parent(parent, mutation_rate=mutation_rate)
        generation += 1
        score = fitness(parent)
    print ('%3d  %s  (%d)' % (generation, parent, score))

if __name__ == '__main__':
    # Run the evolution once and report how long it took in wall-clock seconds.
    import time
    start = time.time()
    weasel_program()
    elapsed = time.time() - start
    print ('evolution time:', elapsed)
