# Clustering

In [1]:
import pandas as pd
import matplotlib 
from matplotlib import pyplot as plt
import numpy as np

%matplotlib inline
from sklearn.cluster import KMeans, MiniBatchKMeans

In [None]:
# Load the raw HTTP GET multi-threaded throughput measurements.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows, emit a warning).
df = pd.read_csv('../data/curr_httpgetmt.csv.gz', compression='gzip', on_bad_lines='warn')

# Keep only the columns this analysis uses.
df = df.loc[:,['unit_id','dtime','target','address','bytes_sec']]
df.head(10)

In [None]:
# Show the number of distinct measurement units together with the frame's
# (rows, columns) shape. `shape` is an attribute, not a method, so it is not
# called; both values go in one tuple because a cell only displays its last expression.
df['unit_id'].nunique(), df.shape

In [None]:
# Per-unit maximum of every column, ordered by increasing peak throughput.
max_df = (
    df.groupby(['unit_id']).max()
    .sort_values(by=['bytes_sec'])
    # Zeros are converted to NaN so that dropna() discards those units too.
    .replace(0, np.nan)
    .dropna()
    # reset_index() turns unit_id back into a column and renumbers rows 0..n-1
    # in the sorted order established above.
    .reset_index()
    .drop(columns=['dtime', 'address'])
)

# The freshly reset positional index is the unit's rank by peak throughput.
max_df['rank'] = max_df.index
max_df

In [None]:
# Scale bytes/second to megabits/second (x8 bits, /1024^2) and plot one marker
# per unit in index (rank) order.
peak_mbps = (max_df['bytes_sec'] * 8) / (1024 * 1024)
plt.plot(max_df.index, peak_mbps, '*')

# K-means Clustering

In [None]:
# Attach each unit's per-unit maximum row (including its rank) to every raw
# measurement. Column names present in both frames receive the pandas default
# suffixes: `_x` for the measurement frame, `_y` for the per-unit frame.
s = (
    df.merge(max_df, on=['unit_id'])
    .sort_values(by='rank')
    .reset_index()
)

In [None]:
# Cluster the merged rows into three groups using the per-unit maximum
# throughput (`bytes_sec_y`) as the single feature; reshape(-1, 1) gives the
# (n_samples, 1) column layout that fit() is handed here.
# `random_state` is fixed so the centroid initialisation, and therefore the
# labels, are reproducible across runs, as in the other seeded K-means cells.
cluster = KMeans(n_clusters=3, random_state=0)
cluster.fit(s['bytes_sec_y'].values.reshape(-1,1))

In [None]:
import matplotlib.cm as cm

# One RGBA colour per fitted cluster centre, evenly spaced along the rainbow map.
n_colors = len(cluster.cluster_centers_)
colormap = cm.rainbow(np.linspace(0, 1, n_colors))

# x: rank divided by the number of ranked units; y: each measurement in Mbps.
# Indexing the palette by label colours every point by its assigned cluster.
rank_fraction = s['rank'] / len(max_df)
measured_mbps = (s['bytes_sec_x'] * 8) / (1024 * 1024)
plt.scatter(rank_fraction, measured_mbps, c=colormap[cluster.labels_])

# Density and Distribution-Based Clustering

In [None]:
import pandas as pd
import numpy as np
from numpy import array, linspace

from matplotlib import pyplot as plt
from matplotlib.pyplot import plot
import seaborn as sns; sns.set()
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.mixture import GaussianMixture as GMM

## Data: FCC Measuring Broadband America 

Load the throughput and latency data collected in March 2020 by the [FCC Measuring Broadband America program](https://www.fcc.gov/oet/mba/raw-data-releases).

In [None]:
# Load the raw throughput measurements and keep an untouched copy.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows, emit a warning).
df = pd.read_csv('../data/curr_httpgetmt.csv.gz', compression='gzip', on_bad_lines='warn')
df_backup = df.copy()

# Keep only the columns used below, then restrict to rows whose timestamp sorts
# before '2020-03-02' and whose test target is the single named server.
df = df.loc[:,['unit_id','dtime','target','address','bytes_sec']]
df = df[(df['dtime'] < '2020-03-02') & (df['target']=='samknows1.nyc2.level3.net')]
df.head(10)

In [None]:
# Take the mean throughput measurement for each unit.
# `numeric_only=True` averages only the numeric columns; the non-numeric ones
# cannot be averaged, and pandas >= 2.0 raises on them instead of dropping them.
mean_df = df.groupby(['unit_id']).mean(numeric_only=True)
mean_df = mean_df.sort_values(by=['bytes_sec'])

# Drop garbage data: zeros are converted to NaN so dropna() removes those units.
# reset_index() restores unit_id as a column and renumbers rows in sorted order.
mean_df = mean_df.replace(0, np.nan).dropna().reset_index()

# Create a rank so that we plot in order of increasing throughput; the index was
# just reset on the sorted frame, so it already is that rank.
mean_df['rank'] = mean_df.index

# Create a column for "speed" which is throughput in megabits per second (a common metric).
mean_df['speed'] = (mean_df['bytes_sec']*8)/(1024*1024)

# x is the unit's rank and y is its mean speed, so each axis is labelled accordingly.
plt.plot(mean_df['rank'], mean_df['speed'], '*')
plt.xlabel('Unit rank (by mean throughput)')
plt.ylabel('Throughput (Mbps)')
plt.show()

In [None]:
# Load the raw ping measurements and keep an untouched copy.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in 2.0; malformed rows are still skipped with a warning.
dfp = pd.read_csv('../data/curr_ping.csv.gz', compression='gzip', on_bad_lines='warn')
# copy() must be called: the bare attribute would store the bound method, not a DataFrame.
dfp_backup = dfp.copy()

# Same column selection and row filter as the throughput frame: timestamps that
# sort before '2020-03-02', and a single test target.
dfp = dfp.loc[:,['unit_id','dtime','target','rtt_avg']]
dfp = dfp[(dfp['dtime'] < '2020-03-02') & (dfp['target']=='samknows1.nyc2.level3.net')]

# Mean round-trip time per unit. `numeric_only=True` skips the non-numeric
# columns, which pandas >= 2.0 would otherwise raise on.
mean_dfp = dfp.groupby(['unit_id']).mean(numeric_only=True)
mean_dfp = mean_dfp.sort_values(by=['rtt_avg'])
# Zeros are converted to NaN so dropna() removes those units as well.
mean_dfp = mean_dfp.replace(0, np.nan).dropna().reset_index()

mean_dfp.head(10)

### Latency: 1-D Kernel Density Estimation

In [None]:
# Per-unit mean round-trip times as an (n, 1) column, the layout passed to the estimator.
latency = mean_dfp['rtt_avg'].to_numpy().reshape(-1, 1)

# Fit a Gaussian kernel density estimate to the raw latency values.
kde_l = KernelDensity(kernel='gaussian', bandwidth=10)
kde_l.fit(latency)

# score_samples() yields log-density; exponentiate to recover the density itself.
log_density = kde_l.score_samples(latency)
e_l = np.exp(log_density)

plt.xscale('log')
plt.xlabel('Latency (milliseconds)')
# NOTE(review): the /1000 presumably converts microseconds to the labelled
# milliseconds — confirm the unit of rtt_avg. The KDE itself is fitted on the
# unscaled values.
scaled_latency = latency / 1000
plt.plot(scaled_latency, e_l, '-')
plt.show()

Clustering

In [None]:
# Join per-unit mean throughput with per-unit mean latency on unit_id.
all_df = pd.merge(mean_df, mean_dfp, on='unit_id')

# Column vectors for plotting: latency scaled down by 1000, speed as computed earlier.
l = all_df['rtt_avg'].div(1000).values.reshape(-1, 1)
s = all_df['speed'].values.reshape(-1, 1)

plt.xscale('log')
plt.ylabel('Throughput (Mbps)')
plt.xlabel('Latency (milliseconds)')

plt.plot(l, s, '*')
plt.show()

In [None]:
# Feature matrix: raw mean latency and mean speed for each unit.
X = all_df[['rtt_avg', 'speed']]

# Standardise both features before clustering, so the eps threshold below is
# applied to the scaled values rather than the raw ones.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Density-based clustering on the standardised features.
cluster = DBSCAN(eps=0.5, min_samples=10)
model = cluster.fit(X_std)
labels = model.labels_

# Draw the points in their original units, coloured by cluster label.
plt.xscale('log')
plt.ylabel('Throughput (Mbps)')
plt.xlabel('Latency (milliseconds)')
plt.scatter(l, s, cmap='brg', c=labels)
plt.show()

In [None]:
# Generate some data
from sklearn.datasets import make_blobs

# 400 synthetic points around four centres, seeded for repeatability.
X, y_true = make_blobs(
    n_samples=400,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)
X = X[:, ::-1] # flip axes for better plotting

# Fit a seeded 4-cluster K-means, then assign each point to a cluster.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
labels = kmeans.predict(X)
plt.scatter(X[:, 0], X[:, 1], s=40, c=labels, cmap='viridis');

In [None]:
from scipy.spatial.distance import cdist

def plot_kmeans(kmeans, X, n_clusters=4, rseed=0, ax=None):
    """Fit ``kmeans`` on ``X`` and draw the points plus one circle per cluster.

    Parameters
    ----------
    kmeans : estimator providing ``fit_predict`` and ``cluster_centers_``.
    X : 2-D array; columns 0 and 1 are used as the plot coordinates.
    n_clusters, rseed : accepted for interface compatibility; the body does
        not read either of them.
    ax : axes to draw on; when falsy, the current pyplot axes are used.

    Each circle is centred on a cluster centre, and its radius is the largest
    distance from that centre to any point assigned to the cluster.
    """
    assignments = kmeans.fit_predict(X)

    # plot the input data
    if not ax:
        ax = plt.gca()
    ax.axis('equal')
    ax.scatter(X[:, 0], X[:, 1], c=assignments, s=40, cmap='viridis', zorder=2)

    # plot the representation of the KMeans model
    centroids = kmeans.cluster_centers_

    # Compute every radius first, then draw, so nothing is drawn if a radius fails.
    reaches = []
    for idx, centroid in enumerate(centroids):
        members = X[assignments == idx]
        reaches.append(cdist(members, [centroid]).max())

    for idx in range(min(len(centroids), len(reaches))):
        disc = plt.Circle(centroids[idx], reaches[idx],
                          fc='#CCCCCC', lw=3, alpha=0.5, zorder=1)
        ax.add_patch(disc)


# Build a fresh, seeded 4-cluster K-means and hand it to plot_kmeans, which fits
# it on X and draws the points together with each cluster's circle.
kmeans = KMeans(random_state=0, n_clusters=4)
plot_kmeans(kmeans=kmeans, X=X)

In [None]:
# Fit a 4-component Gaussian mixture to the same points and colour each point by
# its predicted component. `random_state` is fixed so the fit, and therefore the
# component labels, are reproducible across runs, as in the seeded K-means cells.
gmm = GMM(n_components=4, random_state=0).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');