# Data, Machines and the 🐍 
<img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/lessons/dmap/mls/knn/html/section00.png" align="left"/>

<a id="install"></a>
## Notebook Preparation for Lesson 1•2•3
Each lesson will start with a similar template (given in the course schedule):  
1. **save** a copy to your Google Drive (File → Save a copy in Drive)<br/><img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/copy-to-drive.png"/>
2. **update** the NET_ID to be your netID (no need to include @illinois.edu)
3. **run** the next cell to install the IDE. <img src="https://raw.githubusercontent.com/habermanUIUC/CodeStoryLessons/main/assets/images/colab/play-button.png"/>

In [0]:
LESSON_ID = 'dmap:mls:knn'   # keep this as is
NET_ID    = 'CHANGE_ME' # CHANGE_ME to your netID (keep the quotes)

def install_ide(net_id, lesson_id):
  """Install the codestories package if needed and build the lesson IDE object.

  Args:
      net_id: the student's netID, passed through to CodeStory.
      lesson_id: the lesson identifier, passed through to CodeStory.

  Returns:
      The CodeStory instance created from (net_id, lesson_id).
  """
  import sys
  # only install when the package has not already been imported in this kernel
  if 'codestories' not in sys.modules:
      print('installing modules')
      # IPython shell escape (requires a notebook kernel); pip's output is
      # redirected to install.log instead of being shown in the cell
      !pip install git+https://mehaberman@bitbucket.org/mehaberman/codestories.git --upgrade &> install.log
  
  # imported here (not at the top of the cell) because the package may have
  # only just been installed by the step above
  from codestories.cs.CodeStories import CodeStory
  return CodeStory(net_id, lesson_id)

ide = install_ide(NET_ID, LESSON_ID)
print(ide.welcome())

# Lesson KNN
(hit ▶ to read the first part of the lesson)

In [0]:
# run to read the next section
ide.reader.view_section(1)

In [0]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
 
# show arrays with 2 decimal places and never in scientific notation
np.set_printoptions(precision=2, suppress=True)

def car_distances():
    """Print the Euclidean distance between one test car and five known cars.

    Distances are computed on the raw (unscaled) 'miles' and 'mpg' columns;
    the 'brand' column is carried in the frames but not used in the distance.
    """
    feature_cols = ['miles', 'mpg']

    known_rows = [
        [120000, 11,   '🚗'],
        [250000, 11.5, '🚓'],
        [175000, 15.8, '🚘'],
        [350000, 17,   '🏎'],
        [400000, 10,   '🚔'],
    ]
    known_cars = pd.DataFrame(known_rows, columns=['miles', 'mpg', 'brand'])

    # the query car shares the known cars' column layout
    query_car = pd.DataFrame([[175000, 11, '🚙']], columns=known_cars.columns)

    distances = euclidean_distances(known_cars[feature_cols], query_car[feature_cols])
    print(distances)
car_distances()

In [0]:
# run to read the next section
ide.reader.view_section(3)

# Where's the Code?

In [0]:
# run to read the next section
ide.reader.view_section(4)

In [0]:
import LessonUtil as Util
import pandas as pd
import numpy as np


def read_data():
    """Load 'diabetes.csv' into a DataFrame, print its summary, and return it.

    The file location is resolved through Util.path_for_data.

    Returns:
        The DataFrame read from the CSV file.
    """
    diabetes = pd.read_csv(Util.path_for_data('diabetes.csv'))

    # transpose the describe() table so each attribute is a row (easier to read)
    summary = diabetes.describe().T
    print(summary)

    return diabetes
df = read_data()

In [0]:
# run to read the next section
ide.reader.view_section(6)

In [0]:
def prep_data(df):
    """Replace zero placeholders in the measurement columns with the column mean.

    In the columns listed below a value of 0 stands for a missing measurement.
    Those zeros are first turned into NaN, the per-column NaN counts are
    printed, and the NaNs are then filled with each column's mean (computed
    over the non-missing values).

    Args:
        df: DataFrame containing the columns 'Glucose', 'BloodPressure',
            'SkinThickness', 'BMI' and 'Insulin'. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, after imputation.
    """
    # zero is used as a missing measurement
    cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

    # first change those values to NaN (a more meaningful value);
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x
    df[cols] = df[cols].replace(0, np.nan)

    # show the count of rows that have columns with null values
    print(df.isnull().sum())

    # replace with the mean: mean() skips NaN by default and fillna() matches
    # each mean to its column by label. Assigning the result back (instead of
    # calling replace(..., inplace=True) on df[c]) guarantees df is updated
    # even when pandas Copy-on-Write is active.
    df[cols] = df[cols].fillna(df[cols].mean())

    return df

df = prep_data(read_data())

In [0]:
# run to read the next section
ide.reader.view_section(8)

In [0]:
def car_distances_part2():
    """Print the Euclidean distances between the test car and the known cars
    after standardizing the 'miles' and 'mpg' columns.

    The scaler is fit on the five known cars only; the same fitted scaler is
    then applied to the test car.
    """
    from sklearn.preprocessing import StandardScaler

    feature_cols = ['miles', 'mpg']

    known_rows = [
        [120000, 11,   '🚗'],
        [250000, 11.5, '🚓'],
        [175000, 15.8, '🚘'],
        [350000, 17,   '🏎'],
        [400000, 10,   '🚔'],
    ]
    known_cars = pd.DataFrame(known_rows, columns=['miles', 'mpg', 'brand'])
    query_car = pd.DataFrame([[175000, 11, '🚙']], columns=known_cars.columns)

    scaler = StandardScaler()
    known_scaled = scaler.fit_transform(known_cars[feature_cols])
    # reuse the statistics learned from the known cars for the query car
    query_scaled = scaler.transform(query_car[feature_cols])

    distances = euclidean_distances(known_scaled, query_scaled)
    print(distances)
    
    
car_distances_part2()

In [0]:
# run to read the next section
ide.reader.view_section(10)

In [0]:
X = df.iloc[:, 0:8]  # all rows, cols 0 through 7
y = df.iloc[:, 8]    # all rows, col 8 is the outcome, the label

In [0]:
# run to read the next section
ide.reader.view_section(12)

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column: subtract its mean and divide by its
# standard deviation. The result has zero mean and unit variance per column;
# values are NOT confined to a fixed range such as [-1, 1].
# NOTE(review): the scaler is fit on all rows of X. If the data is split into
# train/test sets afterwards, test-row statistics influence the scaling;
# consider fitting on the training rows only.
sc = StandardScaler()
Xt = sc.fit_transform(X)

In [0]:
# run to read the next section
ide.reader.view_section(14)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(Xt, y, random_state=0, test_size=0.2)
print(len(X_train), len(X_test))

In [0]:
# run to read the next section
ide.reader.view_section(16)

In [0]:
from sklearn.neighbors import KNeighborsClassifier

def knn_demo(n_neighbors=11, p=2):
    """Train a k-nearest-neighbors classifier and predict the test labels.

    Uses the notebook-level X_train, y_train and X_test variables.

    Args:
        n_neighbors: how many neighbors vote on each prediction (default 11).
        p: power parameter of the Minkowski distance; 2 is Euclidean
            (the default), 1 is Manhattan.

    Returns:
        The labels predicted by the fitted model for X_test.
    """
    # build the model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)

    # train the model (fit returns the estimator itself, so nothing to keep)
    knn.fit(X_train, y_train)

    # predict using the model
    return knn.predict(X_test)
    
y_pred = knn_demo()

In [0]:
# run to read the next section
ide.reader.view_section(18)

In [0]:
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [0]:
# run to read the next section
ide.reader.view_section(20)

In [0]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

In [0]:
# run to read the next section
ide.reader.view_section(22)

In [0]:
from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred))

In [0]:
# run to read the next section
ide.reader.view_section(24)

# Exercises

In [0]:
# run to read the next section
ide.reader.view_section(25)

In [0]:
# type&run the above example/exercise in this cell

In [0]:
# run to read the next section
ide.reader.view_section(27)

In [0]:
# type&run the above example/exercise in this cell

# Managing Error

In [0]:
# run to read the next section
ide.reader.view_section(29)

# The ML Lexicon 📓

In [0]:
# run to read the next section
ide.reader.view_section(30)

# Lesson Assignment

In [0]:
# run to read the next section
ide.reader.view_section(31)

# Test and Submit

In [0]:
# run to read the next section
ide.reader.view_section(32)

In [0]:
# print(ide.tester.test_notebook()) 
# print(ide.tester.test_notebook(verbose=True)) 

# once you are ready -- run this 
# ide.tester.download_solution()