In [None]:
import matplotlib
import matplotlib.pyplot as plt
# The bare "seaborn" style sheet was deprecated in matplotlib 3.6 and removed in
# 3.8 (it was renamed to "seaborn-v0_8"). Prefer the new name and fall back to
# the legacy one only on older matplotlib releases that do not ship it.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Default figure size (width, height) in inches for figures created afterwards.
plt.rcParams["figure.figsize"] = [8, 7]

## 1. Distance metrics

### 1.1 Manhattan distance

In [None]:
import random

def manhatten_distance(point1, point2):
    """Return the Manhattan (L1) distance between two points.

    The points are walked in parallel with ``zip`` and the absolute
    per-coordinate differences are added up.

    Args:
        point1: Iterable of numeric coordinates.
        point2: Iterable of numeric coordinates.

    Returns:
        The sum of ``abs(a - b)`` over the paired coordinates; ``0`` when
        there are no pairs. Because ``zip`` is used, coordinates beyond the
        shorter input are ignored.
    """
    total = 0
    for left, right in zip(point1, point2):
        total += abs(left - right)
    return total

# Two random 3-D integer points, every coordinate drawn from 1..10 inclusive.
points = [[random.randint(1, 10) for _ in range(3)] for _ in range(2)]

# L1 distance between the first and the second point.
dis = manhatten_distance(*points)

print(dis)

### 1.2 Euclidean distance

In [None]:
import random
from scipy.spatial import distance

# Two random 3-D integer points, every coordinate drawn from 1..10 inclusive.
points = [[random.randint(1, 10) for _ in range(3)] for _ in range(2)]

# Euclidean (L2) distance between the two points, computed by SciPy.
dis = distance.euclidean(*points)

print(dis)

In [None]:
import random
import numpy as np

# Two random 3-D integer points (coordinates in 1..10), stored as the rows of a
# NumPy array so they can be subtracted element-wise.
points = np.array([[random.randint(1, 10) for _ in range(3)] for _ in range(2)])

# Euclidean distance = L2 norm of the difference vector between the two rows.
dis = np.linalg.norm(points[0] - points[1])

print(dis)

In [None]:
import random
import numpy as np

def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        point1: Iterable of numeric coordinates.
        point2: Iterable of numeric coordinates.

    Returns:
        The square root of the summed squared per-coordinate differences, as
        produced by ``np.sqrt(np.sum(...))``. Because ``zip`` is used,
        coordinates beyond the shorter input are ignored.
    """
    squared_diffs = []
    for a, b in zip(point1, point2):
        delta = a - b
        squared_diffs.append(delta * delta)
    return np.sqrt(np.sum(squared_diffs))

# Two random 3-D integer points, every coordinate drawn from 1..10 inclusive.
points = [[random.randint(1, 10) for _ in range(3)] for _ in range(2)]

# Distance between the two points using the hand-written implementation.
dis = euclidean_distance(*points)
print(dis)

## 2. Clustering

### 2.1 k nearest neighbors

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Load the election dataset CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the kernel.
df = pd.read_csv('../../datasets/usa_election_dataset.csv')

In [None]:
# Binary indicator: 1 where the 'winner' text contains 'Trump', otherwise 0.
df['winnerc'] = 0
df.loc[df['winner'].str.contains('Trump'), 'winnerc'] = 1
# Binary indicator: 1 where 'tot_pop' is above the column mean, otherwise 0.
df['largecity'] = 0
df.loc[df['tot_pop'] > df['tot_pop'].mean(), 'largecity'] = 1

In [None]:
# Preview the first five rows, including the indicator columns.
df.head(5)

In [None]:
# Feature columns fed to the classifier.
# NOTE(review): 'yougn' looks like a misspelling of 'young' — confirm it matches
# the actual CSV header before changing it.
indepedents = ['tot_pop', 'yougn', 'female', 'black']
# Target column: the binary 'winnerc' indicator of df.
dependent = 'winnerc'

X = df[indepedents].values
y = df[dependent].values
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=3
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a 3-nearest-neighbours classifier on the scaled training split.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
# Predict the held-out rows and report the accuracy rounded to three decimals.
y_predict = neigh.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

### 2.2 Hierarchical clustering

In [None]:
import numpy as np
import random
from sklearn.cluster import AgglomerativeClustering
import matplotlib
import matplotlib.pyplot as plt
# The bare "seaborn" style sheet was deprecated in matplotlib 3.6 and removed in
# 3.8 (it was renamed to "seaborn-v0_8"). Prefer the new name and fall back to
# the legacy one only on older matplotlib releases that do not ship it.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
plt.rcParams["figure.figsize"] = [8, 7]

# Synthetic data: three groups of 20 random integer points each, drawn from
# non-overlapping x ranges (1..8, 10..20, 22..28).
X = np.array(
    [[random.randint(1, 8), random.randint(1, 10)] for _ in range(20)]
    + [[random.randint(10, 20), random.randint(12, 20)] for _ in range(20)]
    + [[random.randint(22, 28), random.randint(10, 15)] for _ in range(20)]
)

# Agglomerative (bottom-up) clustering, cut at three clusters.
clustering = AgglomerativeClustering(n_clusters=3).fit(X)

# Colour every point by its assigned cluster label.
plt.scatter(X[:, 0], X[:, 1], c=clustering.labels_, cmap='rainbow')
# Render the figure explicitly so the cell does not end on the artist repr.
plt.show()

#### 2.2.1 Dendrogram

In [None]:
import numpy as np
import random
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import matplotlib
import matplotlib.pyplot as plt
# The bare "seaborn" style sheet was deprecated in matplotlib 3.6 and removed in
# 3.8 (it was renamed to "seaborn-v0_8"). Prefer the new name and fall back to
# the legacy one only on older matplotlib releases that do not ship it.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Default figure size (width, height) in inches for figures created afterwards.
plt.rcParams["figure.figsize"] = [8, 7]

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a four-column linkage matrix (the two merged children, the merge
    distance, and the number of samples under the new node) from the model's
    ``children_``, ``distances_`` and ``labels_`` attributes and passes it to
    ``dendrogram``.

    Args:
        model: Fitted model exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.

    Returns:
        None.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # counts[row] = number of original samples below the node created by merge `row`.
    counts = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        # Indices below leaf_total are original samples (size 1); larger ones
        # refer to the node created by an earlier merge, whose size is known.
        counts[row] = sum(
            1 if node < leaf_total else counts[node - leaf_total]
            for node in pair
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)
    
# Synthetic data: three groups of 20 random integer points each, drawn from
# non-overlapping x ranges (1..8, 10..20, 22..28).
X = np.array(
    [[random.randint(1, 8), random.randint(1, 10)] for _ in range(20)]
    + [[random.randint(10, 20), random.randint(12, 20)] for _ in range(20)]
    + [[random.randint(22, 28), random.randint(10, 15)] for _ in range(20)]
)

# distance_threshold=0 together with n_clusters=None asks for the full merge
# tree; per the scikit-learn docs this also populates `distances_`, which
# plot_dendrogram reads.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(X)

plt.title('Hierarchical Clustering Dendrogram')
# Only the top three levels of the tree are drawn.
plot_dendrogram(clustering, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

### 2.3 K-means clustering

In [None]:
import random
import numpy as np
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt
# The bare "seaborn" style sheet was deprecated in matplotlib 3.6 and removed in
# 3.8 (it was renamed to "seaborn-v0_8"). Prefer the new name and fall back to
# the legacy one only on older matplotlib releases that do not ship it.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
plt.rcParams["figure.figsize"] = [8, 7]

# Synthetic data: three groups of 20 random integer points each, drawn from
# non-overlapping x ranges (1..8, 10..20, 22..28).
X = np.array(
    [[random.randint(1, 8), random.randint(1, 10)] for _ in range(20)]
    + [[random.randint(10, 20), random.randint(12, 20)] for _ in range(20)]
    + [[random.randint(22, 28), random.randint(3, 10)] for _ in range(20)]
)

# n_init is pinned to 10 because the library default changed between
# scikit-learn releases (10 -> 'auto' in 1.4); leaving it implicit triggers a
# FutureWarning on 1.2/1.3 and gives version-dependent results.
kmeans = KMeans(n_clusters=3, n_init=10).fit(X)

# Colour every point by its assigned cluster label.
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
# Render the figure explicitly so the cell does not end on the artist repr.
plt.show()

#### 2.3.1 Elbow method

In [None]:
import random
import numpy as np
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt
# The bare "seaborn" style sheet was deprecated in matplotlib 3.6 and removed in
# 3.8 (it was renamed to "seaborn-v0_8"). Prefer the new name and fall back to
# the legacy one only on older matplotlib releases that do not ship it.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
plt.rcParams["figure.figsize"] = [8, 7]

# Synthetic data: three groups of 20 random integer points each, drawn from
# non-overlapping x ranges (1..8, 10..20, 22..28).
X = np.array(
    [[random.randint(1, 8), random.randint(1, 10)] for _ in range(20)]
    + [[random.randint(10, 20), random.randint(12, 20)] for _ in range(20)]
    + [[random.randint(22, 28), random.randint(3, 10)] for _ in range(20)]
)

# Fit K-means for k = 1..10 and record the inertia (within-cluster sum of
# squared distances) of each fit.
distances = []
n_cluster_range = range(1, 11)
for k in n_cluster_range:
    # n_init is pinned to 10 because the library default changed between
    # scikit-learn releases (10 -> 'auto' in 1.4); leaving it implicit triggers
    # a FutureWarning on 1.2/1.3 and gives version-dependent results.
    kmeans = KMeans(n_clusters=k, n_init=10).fit(X)
    distances.append(kmeans.inertia_)

# The "elbow" is the k after which the curve stops dropping sharply.
plt.plot(n_cluster_range, distances, 'bx-')
plt.xlabel('n clusters (k)')
plt.ylabel('Sum of squared distances')
plt.title('Elbow graph for optimal k')
plt.show()