# Data, Machines and the 🐍 
<img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/lessons/dmap/mlu/clustering/html/section00.png" align="left"/>

<a id="install"></a>
## Notebook Preparation for Lesson 1•2•3
Each lesson will start with a similar template (given in the course schedule):  
1. **save** to your Google Drive (copy to drive)<br/><img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/copy-to-drive.png"/>
2. **update** the NET_ID to be your netID (no need to include @illinois.edu)
3. **run** the next cell to install the IDE. <img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/play-button.png"/>

In [0]:
LESSON_ID = 'dmap:mlu:clustering'   # keep this as is
NET_ID    = 'CHANGE_ME' # CHANGE_ME to your netID (keep the quotes)

def install_ide(net_id, lesson_id):
  """Install the codestories package if needed and return a CodeStory.

  net_id and lesson_id are passed unchanged to the CodeStory constructor.
  """
  import sys
  # sys.modules only lists modules already imported in this session, so the
  # pip step is skipped when the cell is re-run after a successful import.
  if 'codestories' not in sys.modules:
      print('installing modules')
      # IPython shell escape (works only inside a notebook); pip's output is
      # redirected to install.log
      !pip install git+https://mehaberman@bitbucket.org/mehaberman/codestories.git --upgrade &> install.log
  
  # imported here rather than at the top because the package may have been
  # installed by the step just above
  from codestories.cs.CodeStories import CodeStory
  return CodeStory(net_id, lesson_id)

ide = install_ide(NET_ID, LESSON_ID)
print(ide.welcome())

# Lesson Clustering
(hit ▶ to read the first part of the lesson)

In [0]:
# run to read the next section
ide.reader.view_section(1)

# Finding Patterns in a Sea of Data

In [0]:
# run to read the next section
ide.reader.view_section(2)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import LessonUtil as Util

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

def build_dataset():
  """Load canon.csv and return only its columns 'a' through 'f'."""
  csv_path = Util.path_for_data('canon.csv')
  frame = pd.read_csv(csv_path)
  wanted = list('abcdef')  # ['a', 'b', 'c', 'd', 'e', 'f']
  return frame[wanted]
  
data_df = build_dataset()
print(data_df.head(5))

In [0]:
# run to read the next section
ide.reader.view_section(4)

In [0]:
def pair_wise_plots(df):
  """Draw a 12x12 inch pandas scatter matrix for df."""
  pd.plotting.scatter_matrix(df, figsize=(12, 12))

pair_wise_plots(data_df)

In [0]:
# run to read the next section
ide.reader.view_section(6)

# An Algorithmic Overview

In [0]:
# run to read the next section
ide.reader.view_section(7)

# K-Means Clustering

In [0]:
# run to read the next section
ide.reader.view_section(8)

In [0]:
def build_dataset(as_is=False):
  """Load clean_cars.csv as a DataFrame.

  With as_is=True the table is returned exactly as read. Otherwise the
  'brand' and 'year' columns are left out. The remaining columns are chosen
  with Index.difference, which by default returns the names sorted, so the
  column order may differ from the order in the file.
  """
  cars = pd.read_csv(Util.path_for_data('clean_cars.csv'))
  if not as_is:
    excluded = ['brand', 'year']
    cars = cars[cars.columns.difference(excluded)]
  return cars
  
data_df = build_dataset()
print(data_df.head(5))

In [0]:
# run to read the next section
ide.reader.view_section(10)

In [0]:
def convert_to_points(df):
    """Return the columns of df stacked side by side as a 2-D NumPy array.

    Row i of the result holds the values of row i of df, one entry per
    column, in the order of df.columns.
    """
    column_values = [df[name] for name in df.columns]
    return np.stack(column_values, axis=1)

points = convert_to_points(data_df)
print(data_df.head(5))
print(points[0:5,:])

In [0]:
# run to read the next section
ide.reader.view_section(12)

In [0]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt

def cluster_points(points, K=6):
    """Fit K-means on points and return (fitted model, label per point).

    points should be one row per sample: [x1, y1], [x2, y2], etc.
    The fitted cluster centers are printed as a side effect.
    """
    settings = dict(
        n_clusters=K,     # how many clusters to find
        max_iter=300,     # upper bound on iterations within a single run
        n_init=100,       # number of independent runs; the best one is kept
        init='random',    # random starting centers (or provide your own, 'k-means++')
        random_state=42,  # fixed seed; use None to get a different result each time
        tol=0.0001,       # convergence threshold between two consecutive iterations
    )
    model = KMeans(**settings)

    # build the model
    model.fit(points)
    print(model.cluster_centers_)

    # feed the same data back through the fitted model;
    # predict assigns each point its cluster number
    assignments = model.predict(points)
    return model, assignments
    
km, labels = cluster_points(points)
print(len(km.cluster_centers_), set(labels))

In [0]:
# run to read the next section
ide.reader.view_section(14)

In [0]:
def unclean_data_demo():
  """Print the first rows of columns a, b, c of anon.csv and cluster them as loaded."""
  raw = pd.read_csv(Util.path_for_data('anon.csv'))
  subset = raw[['a', 'b', 'c']]
  print(subset.head(5))
  # the returned model and labels are not needed for this demo
  cluster_points(subset)
  
unclean_data_demo()

In [0]:
# run to read the next section
ide.reader.view_section(16)

In [0]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def plot_kmeans(points, centers, labels, x0=0, x1=1, columns=None):
    """Scatter-plot two dimensions of clustered points, one color per cluster.

    points   -- indexed as points[mask][:, col], so a 2-D NumPy array is expected
    centers  -- one row per cluster, indexed as centers[c_id, col]
    labels   -- cluster id per point; must support elementwise `labels == c_id`
                (e.g. a NumPy array)
    x0, x1   -- column indices plotted on the x and y axes
    columns  -- optional sequence of column names used for the axis labels
    """
    K = len(centers)
    colors = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
              '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6']
    # The hand-picked palette is enough whenever there is one color per
    # cluster (K <= len(colors)); only fall back when there are more clusters.
    if K <= len(colors):
        cmap = matplotlib.colors.ListedColormap(colors)
    else:
        cmap = cm.Dark2

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
    for c_id in range(K):
        members = points[labels == c_id]
        # Integer lookups at or beyond cmap.N are clipped to a single color,
        # so wrap the index to cycle through the palette instead.
        c = cmap(c_id % cmap.N)
        axes.scatter(members[:, x0], members[:, x1], color=c, s=25)
        # mark the center
        axes.scatter(centers[c_id, x0], centers[c_id, x1], color=c, s=200,
                     alpha=0.5, edgecolor='black')

    if columns is not None:
        # same font size on both axes
        axes.set_xlabel(columns[x0], fontsize=18)
        axes.set_ylabel(columns[x1], fontsize=18)
    axes.grid()

In [0]:
# run to read the next section
ide.reader.view_section(18)

# Finding K

In [0]:
# run to read the next section
ide.reader.view_section(19)

In [0]:
def plot_scores(scores, y="inertia score"):
    """Plot column 1 of scores against column 0 (K) as a blue line with x markers.

    y is the label shown on the vertical axis.
    """
    _, ax = plt.subplots(nrows=1, ncols=1)
    k_values = scores[:, 0]
    score_values = scores[:, 1]
    ax.plot(k_values, score_values, color='blue', marker='x')
    ax.set_ylabel(y, fontsize=18)
    ax.set_xlabel('K', fontsize=18)

def get_inertias(points):
  """Return an array of (K, inertia) rows, one for each K from 2 through 15."""
  def inertia_for(k):
    model, _ = cluster_points(points, K=k)
    return model.inertia_

  return np.array([(k, inertia_for(k)) for k in range(2, 16)])
  
# Be sure to comment this call out before testing;
# it will cause a timeout during submission.
inertia_scores = get_inertias(points)
plot_scores(inertia_scores)

In [0]:
# run to read the next section
ide.reader.view_section(21)

In [0]:
from sklearn.metrics import silhouette_score

def get_silhouette_scores(points):
  """Return an array of (K, silhouette score) rows, one for each K from 2 through 15."""
  def silhouette_for(k):
    model, _ = cluster_points(points, K=k)
    return silhouette_score(points, model.labels_)

  return np.array([(k, silhouette_for(k)) for k in range(2, 16)])

# Be sure to comment this call out before testing;
# it will cause a timeout during submission.
sscores = get_silhouette_scores(points)
plot_scores(sscores, y='silhouette')

In [0]:
# run to read the next section
ide.reader.view_section(23)

In [0]:
def special_k():
    """Cluster the module-level points with K=3, print a summary, and plot the result."""
    model, assigned = cluster_points(points, K=3)
    found_centers = model.cluster_centers_
    print(len(found_centers), set(assigned))
    plot_kmeans(points, found_centers, assigned)

special_k()

In [0]:
# run to read the next section
ide.reader.view_section(25)

# External Measures for Cluster Evaluation

In [0]:
# run to read the next section
ide.reader.view_section(26)

In [0]:
def get_distributions(df, points, K=3):
    """Count, per cluster, how many points carry each 'brand' value.

    points is clustered into K groups with cluster_points; df must have a
    'brand' column. Row i of points is assumed to correspond to row i of df
    (by position) -- confirm against how points was built.

    Returns a list with one dict per cluster; entry c maps a brand value to
    the number of points assigned to cluster c that have that brand.
    """
    km, labels = cluster_points(points, K=K)

    # Read the brands positionally. df['brand'][idx] would look idx up as an
    # index *label*, which picks the wrong row (or raises KeyError) whenever
    # df does not have the default 0..n-1 index, e.g. after filtering.
    brands = df['brand'].to_numpy()

    score = [{} for _ in range(len(km.cluster_centers_))]
    for idx, cluster_num in enumerate(labels):
        actual = brands[idx]
        tally = score[cluster_num]
        tally[actual] = tally.get(actual, 0) + 1

    return score

# NOTE(review): this K=3 result is not passed to get_distributions below, which
# runs its own clustering (K=5 in this call); the line only rebinds the global
# km and labels.
km, labels = cluster_points(points, K=3)
# as_is=True keeps every column, including the 'brand' column used below
cars_df = build_dataset(as_is=True)
# number of rows per brand, for comparison with the per-cluster counts
print(cars_df.groupby(['brand']).size())
print(get_distributions(cars_df, points, K=5))

In [0]:
# run to read the next section
ide.reader.view_section(28)

In [0]:
import sklearn.metrics as metrics
labels_true = ['a','a','a',  'b','b','b']
labels_pred = [0,0,0, 1,1,1]
print(metrics.adjusted_mutual_info_score(labels_true, labels_pred))

In [0]:
# run to read the next section
ide.reader.view_section(30)

# More Than One Way to Cluster

In [0]:
# run to read the next section
ide.reader.view_section(31)

# Lesson Assignment

In [0]:
# run to read the next section
ide.reader.view_section(32)

In [0]:
import LessonUtil as Util
import pandas as pd

def get_robo_dataset():
  """Read robo.csv (located via Util.path_for_data) into a DataFrame."""
  return pd.read_csv(Util.path_for_data('robo.csv'))

pitcher_df = get_robo_dataset()
pitcher_df.head()

In [0]:
# run to read the next section
ide.reader.view_section(34)

In [0]:
# type&run the above example/exercise in this cell

In [0]:
# run to read the next section
ide.reader.view_section(36)

In [0]:
class RoboBatter:
    """Empty placeholder class; it defines no attributes or methods yet."""

In [0]:
# run to read the next section
ide.reader.view_section(38)

# Test and Submit

In [0]:
# run to read the next section
ide.reader.view_section(39)

In [0]:
# print(ide.tester.test_notebook()) 
# print(ide.tester.test_notebook(verbose=True)) 

# once you are ready -- run this 
# ide.tester.download_solution()