In [None]:
import random

import matplotlib.pyplot as plt
import numpy
import pandas
import sklearn.cluster

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 5]
plt.style.use('bmh')
# plt.style.available

In [None]:
data_file = "longleaf_job_data_fy2018_res.csv"
n = 1000  # keep roughly 1 out of every n data rows
SAMPLE_SEED = 0  # fixed seed so a fresh "Restart & Run All" loads the same sample

# Count physical lines. The context manager closes the handle; the previous
# bare open() inside the generator never did.
# NOTE(review): assumes one record per physical line (no embedded newlines).
with open(data_file) as handle:
    num_lines = sum(1 for _ in handle)

# Line 0 is the header and is always kept, so only lines 1..num_lines-1 are
# candidates for skipping. Sizing the sample from the data rows (not the raw
# line count) keeps random.sample from asking for more indices than exist,
# which raised ValueError whenever the file had fewer than n lines.
num_data_rows = max(num_lines - 1, 0)
sample_size = num_data_rows // n

# skip_index = [x for x in range(1, num_lines) if x % n != 0] # every N lines
# A private Random instance gives a reproducible sample without reseeding the
# global `random` module state.
rng = random.Random(SAMPLE_SEED)
skip_index = rng.sample(range(1, num_lines), num_data_rows - sample_size)  # random sample

df = pandas.read_csv(data_file, sep='|', skiprows=skip_index)
len(df)

In [None]:
# Clean Data 


# select SUM((cpus_req*(time_end - time_start)/3600)) AS sum_cpu_time,SUM((IF(mem_req > 2147483648, IF(mem_req > 9223372036854775808, mem_req-9223372036854775808, mem_req-2147483648), mem_req)/1024)*((time_end - time_start)/3600)) as sum_mem_time  from longleaf_job_table WHERE time_start != 0 AND (time_start > 1498881600 AND time_start < 1517461200) AND (time_end > time_start) AND ((HOUR(FROM_UNIXTIME(((time_start + time_end)/2), '%H:%i:%s')) >= 9 AND HOUR(FROM_UNIXTIME(((time_start + time_end)/2), '%H:%i:%s')) <= 17) OR (HOUR(FROM_UNIXTIME(((time_start + time_end)*(3/4)), '%H:%i:%s')) >= 9 AND HOUR(FROM_UNIXTIME(((time_start + time_end)*(3/4)), '%H:%i:%s')) <= 17) OR (HOUR(FROM_UNIXTIME(((time_start + time_end)/4), '%H:%i:%s')) >= 9 AND HOUR(FROM_UNIXTIME(((time_start + time_end)/4), '%H:%i:%s')) <= 17)) AND ((DAYOFWEEK(FROM_UNIXTIME(((time_start + time_end)/2), '%Y-%m-%d %H:%i:%s')) > 1 AND DAYOFWEEK(FROM_UNIXTIME(((time_start + time_end)/2), '%Y-%m-%d %H:%i:%s')) < 7 ) OR (DAYOFWEEK(FROM_UNIXTIME(((time_start + time_end)*(3/4)), '%Y-%m-%d %H:%i:%s')) > 1 AND DAYOFWEEK(FROM_UNIXTIME(((time_start + time_end)*(3/4)), '%Y-%m-%d %H:%i:%s')) < 7 ) OR (DAYOFWEEK(FROM_UNIXTIME(((time_start + time_end)/4), '%Y-%m-%d %H:%i:%s')) > 1 AND DAYOFWEEK(FROM_UNIXTIME(((time_start + time_end)/4), '%Y-%m-%d %H:%i:%s')) < 7 ))
# Offsets removed from mem_req, mirroring the nested IF(...) in the reference
# SQL query in the comment above.
MEM_OFFSET_HIGH = 9223372036854775808  # 2**63
MEM_OFFSET_LOW = 2147483648  # 2**31


def _strip_mem_offset(mem_req):
    """Return ``mem_req`` with the 2**63 or 2**31 offset removed, if present.

    Values above 2**63 lose 2**63; otherwise values above 2**31 lose 2**31;
    anything else is returned unchanged.
    """
    if mem_req > MEM_OFFSET_HIGH:
        return mem_req - MEM_OFFSET_HIGH
    if mem_req > MEM_OFFSET_LOW:
        return mem_req - MEM_OFFSET_LOW
    return mem_req


# assign() builds a new frame instead of writing into `df`. After the filters
# below `df` used to be a slice of the loaded frame, so re-running this cell
# wrote columns onto a copy and raised SettingWithCopyWarning.
df = df.assign(
    mem_req_mod=df['mem_req'].apply(_strip_mem_offset) / 1024,
    # NOTE(review): rows with nodes_alloc == 0 would yield inf/NaN here - confirm
    # that the time_start > 0 filter below removes all of them.
    cpus_per_node=df['cpus_req'] / df['nodes_alloc'],
    run_time_hrs=(df['time_end'] - df['time_start']) / 3600,
    timelimit_hrs=df['timelimit'] / 60,
)

# One combined mask is equivalent to the three successive filters it replaces:
# mem_req_mod at most 3000, a non-zero start time, and more than 4 time units
# between start and end.
keep_rows = (
    (df['mem_req_mod'] <= 3000)
    & (df['time_start'] > 0)
    & ((df['time_end'] - df['time_start']) > 4)
)
# .copy() detaches the result so later column assignments are unambiguous.
df = df.loc[keep_rows].copy()
df.head()

In [None]:
# Zero-mean jitter spreads overplotted points without moving them. The noise
# was previously centred on 1, which shifted every point by +1 on both axes.
jitter_cpus = df['cpus_per_node'] + numpy.random.normal(0, 0.4, len(df))
jitter_mem = df['mem_req_mod'] + numpy.random.normal(0, 0.4, len(df))

fig, ax = plt.subplots()
ax.scatter(jitter_cpus, jitter_mem, alpha=.2)
ax.set_xlabel('cpus_per_node (jittered)')
ax.set_ylabel('mem_req_mod (jittered)')
ax.set_title('All jobs after cleaning');

In [None]:
# Summary statistics for the requested-resource and runtime columns.
summary_cols = ['cpus_req', 'cpus_per_node', 'mem_req_mod', 'timelimit_hrs', 'run_time_hrs']
df[summary_cols].describe()

In [None]:
# Look only at the "general" partition.
# Boolean indexing keeps the original dtypes and only drops rows that are
# missing one of the columns analysed below. The previous where(...).dropna()
# blanked every non-matching row to NaN and then also discarded any 'general'
# row that had a NaN in an unrelated column.
analysis_cols = ['cpus_req', 'cpus_per_node', 'mem_req_mod', 'timelimit_hrs', 'run_time_hrs']
general_df = (
    df.loc[df['partition'] == 'general']
    .dropna(subset=analysis_cols)
    .copy()  # independent frame, so adding columns later is safe
)
general_df[['cpus_req', 'cpus_per_node', 'mem_req_mod', 'timelimit_hrs']].describe()

In [None]:
# Zero-mean jitter spreads overplotted points without moving them. The noise
# was previously centred on 1, which shifted every point by +1 on both axes.
jitter_cpus = general_df['cpus_per_node'] + numpy.random.normal(0, 0.4, len(general_df))
jitter_mem = general_df['mem_req_mod'] + numpy.random.normal(0, 0.4, len(general_df))

fig, ax = plt.subplots()
ax.scatter(jitter_cpus, jitter_mem, alpha=.2)
ax.set_xlabel('cpus_per_node (jittered)')
ax.set_ylabel('mem_req_mod (jittered)')
ax.set_title('"general" partition jobs');

In [None]:
# Density-normalised, 40-bin histogram of mem_req_mod for the general partition.
# Alternative kept for reference (cumulative step histogram over all of df):
#df['mem_req_mod'].hist(cumulative=True, histtype='step', density=1, bins=100)
general_df['mem_req_mod'].plot.hist(bins=40, density=True)

In [None]:
# Elbow plot: fit KMeans for k = 1..9 on (cpus_per_node, mem_req_mod) and plot
# the within-cluster sum of squares (inertia_) against k.
Nc = range(1,10)

# Same extraction idiom the standardisation cell uses (.loc[...].values).
X = general_df.loc[:, ['cpus_per_node', 'mem_req_mod']].values

# Only referenced by an earlier mean-distance variant of the score; kept so the
# name stays importable for anything else that relies on it.
from scipy.spatial.distance import cdist

# The loop variable is `k`, not `n`: reusing `n` silently overwrote the
# sampling stride defined in the data-loading cell.
# random_state makes the curve reproducible across runs; n_init is pinned so
# the result does not depend on the installed scikit-learn's default.
d = [
    sklearn.cluster.KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
    for k in Nc
]

fig, ax = plt.subplots()
ax.plot(Nc, d, 'x-')
ax.set_xlabel('number of clusters (k)')
ax.set_ylabel('KMeans inertia');

In [None]:
# Cluster the 2-feature matrix X into 3 groups and overlay the centroids.
# random_state / n_init are pinned so labels and centres are reproducible.
kmeans = sklearn.cluster.KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
labels = kmeans.predict(X)
C = kmeans.cluster_centers_

# Zero-mean jitter: with the previous mean of 1 every point was shifted by +1
# on both axes while the centroids were not, so the stars sat off their clusters.
jitter_cpus = general_df['cpus_per_node'] + numpy.random.normal(0, 0.4, len(general_df))
jitter_mem = general_df['mem_req_mod'] + numpy.random.normal(0, 0.4, len(general_df))

fig, ax = plt.subplots()
ax.scatter(jitter_cpus, jitter_mem, alpha=.2, c=labels)
ax.scatter(C[:, 0], C[:, 1], marker='*', c='#050505', s=500)
ax.set_xlabel('cpus_per_node (jittered)')
ax.set_ylabel('mem_req_mod (jittered)')
ax.set_title('KMeans, k=3 (stars = cluster centres)');

In [None]:
# Attach the cluster label to each job and total the numeric columns per cluster.
general_df['labels'] = labels
# numeric_only=True: without it recent pandas also tries to "sum" the
# text columns (e.g. partition) instead of skipping them.
general_df.groupby('labels').sum(numeric_only=True)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3D scatter of the three request dimensions.
# Create the axes through the figure: on current matplotlib a bare Axes3D(fig)
# is no longer attached to the figure automatically and renders nothing.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(general_df['cpus_per_node'], general_df['mem_req_mod'], general_df['timelimit_hrs'])
ax.set_xlabel('cpus_per_node')
ax.set_ylabel('mem_req_mod')
ax.set_zlabel('timelimit_hrs');

In [None]:
# Elbow plot: fit KMeans for k = 1..9 on (cpus_per_node, mem_req_mod,
# run_time_hrs) and plot the within-cluster sum of squares (inertia_) against k.
Nc = range(1,10)

# Same extraction idiom the standardisation cell uses (.loc[...].values).
X = general_df.loc[:, ['cpus_per_node', 'mem_req_mod', 'run_time_hrs']].values

# Only referenced by an earlier mean-distance variant of the score; kept so the
# name stays importable for anything else that relies on it.
from scipy.spatial.distance import cdist

# The loop variable is `k`, not `n`: reusing `n` silently overwrote the
# sampling stride defined in the data-loading cell.
# random_state makes the curve reproducible across runs; n_init is pinned so
# the result does not depend on the installed scikit-learn's default.
d = [
    sklearn.cluster.KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
    for k in Nc
]

fig, ax = plt.subplots()
ax.plot(Nc, d, 'x-')
ax.set_xlabel('number of clusters (k)')
ax.set_ylabel('KMeans inertia');

In [None]:
# Cluster the 3-feature matrix X into 5 groups and overlay the centroids.
# random_state / n_init are pinned so labels and centres are reproducible.
kmeans = sklearn.cluster.KMeans(n_clusters=5, n_init=10, random_state=0).fit(X)
labels = kmeans.predict(X)
C = kmeans.cluster_centers_

# Zero-mean jitter, so points are not shifted away from the un-jittered centroids.
jitter_cpus = general_df['cpus_per_node'] + numpy.random.normal(0, 0.4, len(general_df))
jitter_mem = general_df['mem_req_mod'] + numpy.random.normal(0, 0.4, len(general_df))
# The third axis must be run_time_hrs: that is the column X was built from, so
# it is the space C[:, 2] lives in. Plotting timelimit_hrs here put the points
# and the centroids on different scales.
jitter_time = general_df['run_time_hrs'] + numpy.random.normal(0, 0.4, len(general_df))

# Create the axes through the figure: on current matplotlib a bare Axes3D(fig)
# is no longer attached to the figure automatically and renders nothing.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(jitter_cpus, jitter_mem, jitter_time, alpha=.2, c=labels)
ax.scatter(C[:, 0], C[:, 1], C[:,2], marker='*', c='#050505', s=500)
ax.set_xlabel('cpus_per_node (jittered)')
ax.set_ylabel('mem_req_mod (jittered)')
ax.set_zlabel('run_time_hrs (jittered)');

In [None]:
# Attach the cluster label to each job and total the numeric columns per cluster.
general_df['labels'] = labels
# numeric_only=True: without it recent pandas also tries to "sum" the
# text columns (e.g. partition) instead of skipping them.
general_df.groupby('labels').sum(numeric_only=True)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns passed through the scaler.
features = ['cpus_per_node', 'mem_req_mod', 'run_time_hrs']

# Raw feature matrix, one row per job.
x = general_df[features].values

# No target column is split off here; the template line is left disabled:
#y = general_df.loc[:,['target']].values

# Fit the scaler on x and transform it in the same step.
scaler = StandardScaler()
xx = scaler.fit_transform(x)

# NOTE: rebinds general_df to a frame holding only the scaled feature columns
# (fresh default index); the unscaled frame is no longer reachable by this name.
general_df = pandas.DataFrame(xx, columns=features)

In [None]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on the scaled matrix and keep the projected scores.
pca = PCA(n_components=3)
pc = pca.fit_transform(xx)

# One column per principal component, in component order.
component_names = ['pc1', 'pc2', 'pc3']
p_df = pandas.DataFrame(pc, columns=component_names)
p_df.head()

In [None]:
# Scatter of the 2nd vs 3rd principal component.
# NOTE(review): the old variable names (jitter_pc1 / jitter_pc2) suggest pc1 vs
# pc2 may have been intended; the plotted columns are left as they were - confirm.
# Zero-mean jitter: a mean of 1 shifted every point by a full unit, which is
# large on standardised components. Length comes from p_df, the frame actually
# being plotted.
jitter_pc2 = p_df['pc2'] + numpy.random.normal(0, 0.4, len(p_df))
jitter_pc3 = p_df['pc3'] + numpy.random.normal(0, 0.4, len(p_df))

fig, ax = plt.subplots()
ax.scatter(jitter_pc2, jitter_pc3, alpha=.2)
ax.set_xlabel('pc2 (jittered)')
ax.set_ylabel('pc3 (jittered)');

In [None]:
# Share of total variance captured by the fitted components, as a percentage.
explained_pct = sum(pca.explained_variance_ratio_) * 100
explained_pct

In [None]:
# Peek at the first five rows of general_df as it currently stands.
general_df.head(n=5)