In [None]:
pip install pandas numpy  seaborn missingno matplotlib scikit-learn joblib


In [None]:

import pandas as pd
import json
import os
import missingno as ms
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as skl

In [None]:
# Read the results table that sits one directory above this notebook.
dataCsvPath = os.path.join("..", "results.csv")
data_df = pd.read_csv(dataCsvPath)

# Keep the source path of every row as a plain Python list.
src_paths = data_df["src_path"].tolist()

# Quick overview of the raw table.
print("Number of rows:", len(data_df))
print("Number of columns:", len(data_df.columns))
print("Column names:", data_df.columns.tolist())

In [None]:
# Preview the first five rows of the raw table.
data_df.head(n=5)

## Cleaning the data: filtering out the non-parsed models and removing some columns

In [None]:
# Keep only the rows flagged as parsed AND as system designs.
# `.eq(True)` is the same element-wise comparison as `== True`.
data_df = data_df[data_df["is_parsed"].eq(True) & data_df["is_sys_design"].eq(True)]

# Working frame without the path / extension / documentation columns.
df = data_df.loc[:, ~data_df.columns.isin(['src_path', 'conv_path', "src_ext", "doc_files"])]

# Report the shape of the cleaned frame `df`, so the column count reflects
# the columns that were just removed (reporting `data_df` would not).
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])
df.head()

## Extracting the string data and creating a separate data set for it

In [None]:
# Gather the textual columns into their own table, remove repeated rows,
# and write the result to "data_text.csv" (the row index is written too).
text_columns = ["model_name", "graph_str_rep", "doc_files"]
text_models_data = data_df.loc[:, data_df.columns.isin(text_columns)].drop_duplicates()
text_models_data.to_csv("data_text.csv", index=True)
text_models_data

## Exploratory analysis

In [None]:
# The two boolean flag columns were only needed for filtering; leave them out here.
flag_columns = ["is_parsed", "is_sys_design"]
df_num = df.loc[:, ~df.columns.isin(flag_columns)]
df_num.head()

In [None]:
# Number of missing values in each column.
df_num.isna().sum()

### Filtering by number of components >= 3

In [None]:
# Keep only the models that have at least three components.
df_num = df_num.loc[df_num["no_components"].ge(3)]
df_num

### Dropping duplicates with built-in and with custom similarity function

In [None]:
# Remove rows that are exact duplicates of an earlier row.
# The result is assigned back instead of using `inplace=True`: `df_num` is a
# filtered slice of another frame, and mutating it in place can trigger
# pandas' SettingWithCopyWarning.
df_num = df_num.drop_duplicates()
df_num

## Visualization, metrics and statistics over the data

In [None]:
# `df_data` is the frame used by every plot and statistic below.
# This binds a second name to the same object as `df_num`; no copy is made.
df_data = df_num

In [None]:
# Missing values: missingno bar chart over the columns of `df_data`.
plt.title("Missing Values", fontsize=12)
ms.bar(
    df_data,
    figsize=(10, 3),
    fontsize=8,
)

## Distribution for numerical metrics

### Distributions of the number of components, connections and size of every model

In [None]:
# Histograms (with KDE overlay) of the component and connector counts.
# Each panel is described once as (column, title, upper y-limit) and drawn in
# a loop. The y-axis has a fixed upper limit, so taller bars are clipped.
# NOTE(review): the figure title still mentions "size", but only two panels
# are drawn; the size column is summarised in the next section.
fig, axes = plt.subplots(1, 2, figsize=(20, 6))
fig.suptitle('Distribution of # components, # connections and size', fontsize=12)

panels = [
    ('no_components', '# components:', 100),
    ('no_connectors', '# connections:', 200),
]
for ax, (column, title, y_max) in zip(axes, panels):
    ax.set_title(title, fontsize=10)
    sns.histplot(ax=ax, data=df_data[column], kde=True)
    ax.set_ylim([0, y_max])

plt.show()


### Description for the size parameter

In [None]:
# Line plot of the summary statistics of `no_size`; the "count" row is left
# out of the `describe()` table before plotting.
size_summary = df_data[["no_size"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=size_summary)

In [None]:
# Summary statistics table for the size column.
df_data.loc[:, ["no_size"]].describe()

In [None]:
# Models whose size lies in the inclusive range [1, 13], and the share of the
# data set they represent.
# The mask is built from `df_data` only (the original mixed `df_num` and
# `df_data`), and `_` is no longer assigned, because in IPython `_` holds the
# previous cell's output.
small_size = df_data[df_data["no_size"].between(1, 13)]
x = small_size.shape[0]
total_rows = df_data.shape[0]
# Guard against an empty frame so the cell does not raise ZeroDivisionError.
print("percent: ", (x / total_rows) * 100 if total_rows else float("nan"))
small_size

### Distributions of the number of components per category

In [None]:
# Histograms (with KDE overlay) of the hardware and software component counts.
# Each panel is described once as (column, upper y-limit); the panel title is
# derived from the column name. The fixed y-limit clips taller bars.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
fig.suptitle('Distribution of # category types: ["no_hardware_comp","no_software_comp"]', fontsize=12)

panels = [
    ('no_hardware_comp', 100),
    ('no_software_comp', 200),
]
for ax, (column, y_max) in zip(axes, panels):
    ax.set_title('# ' + column + ':', fontsize=10)
    sns.histplot(ax=ax, data=df_data[column], kde=True, color='orange')
    ax.set_ylim([0, y_max])

plt.show()

### Description for no_hardware_comp and no_software_comp

In [None]:
# Summary statistics table for the hardware component count.
df_data.loc[:, ["no_hardware_comp"]].describe()

In [None]:
# Summary statistics table for the software component count.
df_data.loc[:, ["no_software_comp"]].describe()

In [None]:
# Models with five or more software components.
df_data.loc[df_data["no_software_comp"].ge(5)]

In [None]:
# Models with five or more hardware components.
df_data.loc[df_data["no_hardware_comp"].ge(5)]

In [None]:
# Summary statistics table for the data component count.
df_data.loc[:, ["no_data_comp"]].describe()

In [None]:
# First ten models with five or more data components.
df_data.loc[df_data["no_data_comp"].ge(5)].head(10)

In [None]:
# Models with five or more system components.
df_data.loc[df_data["no_sys_comp"].ge(5)]

In [None]:
# Line plot comparing the summary statistics of the hardware and software
# component counts; the "count" row is left out of the `describe()` table.
hw_sw_summary = df_data[["no_hardware_comp", "no_software_comp"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=hw_sw_summary)

### Distributions for understandability, cohesion, coupling and complexity

In [None]:
# Histograms (with KDE overlay) of the four quality metrics.
# Each panel is described once as (column, upper y-limit); the panel title is
# derived from the column name. The fixed y-limit clips taller bars.
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle('Distribution of # category types: ["understandability","cohesion","coupling","complexity"]', fontsize=12)

panels = [
    ('understandability', 100),
    ('cohesion', 200),
    ('coupling', 200),
    ('complexity', 40),
]
for ax, (column, y_max) in zip(axes, panels):
    ax.set_title('# ' + column + ':', fontsize=10)
    sns.histplot(ax=ax, data=df_data[column], kde=True, color='green')
    ax.set_ylim([0, y_max])

plt.show()

### Description for understandability and cohesion

In [None]:
# Summary statistics of the coupling column (as a Series).
df_data.loc[:, "coupling"].describe()

In [None]:
# First ten models whose complexity lies in the inclusive range [6, 8].
df_data.loc[df_data["complexity"].between(6, 8)].head(10)

In [None]:
# Line plot of the summary statistics of understandability and cohesion
# (without the "count" row), with the y-axis limited to [0, 5].
quality_summary = df_data[["understandability", "cohesion"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=quality_summary)
plt.ylim([0, 5])

### Description for coupling

In [None]:
# Line plot of the summary statistics of coupling (without the "count" row),
# with the y-axis limited to [0, 40].
coupling_summary = df_data[["coupling"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=coupling_summary)
plt.ylim([0, 40])

### Distributions for graph metrics graph_density, avg_shortest_path, avg_clust_coeff, avg_deg_cent

In [None]:
# Histograms (with KDE overlay) of three graph metrics.
# Each panel is described once as (column, upper y-limit); the panel title is
# derived from the column name. The fixed y-limit clips taller bars.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle('Distribution of # category types: ["graph_density","avg_shortest_path","avg_deg_cent"]', fontsize=12)

panels = [
    ('graph_density', 500),
    ('avg_shortest_path', 500),
    ('avg_deg_cent', 800),
]
for ax, (column, y_max) in zip(axes, panels):
    ax.set_title('# ' + column + ':', fontsize=10)
    sns.histplot(ax=ax, data=df_data[column], kde=True, color='blue')
    ax.set_ylim([0, y_max])

plt.show()

### Description for avg_shortest_path, graph_density, avg_deg_cent, avg_clust_coeff

In [None]:
# Line plot of the summary statistics of avg_shortest_path (without "count").
shortest_path_summary = df_data[["avg_shortest_path"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=shortest_path_summary)

In [None]:
# Line plot of the summary statistics of graph_density (without "count").
density_summary = df_data[["graph_density"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=density_summary)

In [None]:
# Line plot of the summary statistics of avg_deg_cent (without "count").
deg_cent_summary = df_data[["avg_deg_cent"]].describe().drop(index="count")
plt.figure(figsize=(8, 4))
plt.grid()
sns.lineplot(data=deg_cent_summary)

## Exploring the relationships between variables

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the selected metrics.
pairplot_columns = [
    "no_size",
    "cohesion",
    "coupling",
    "understandability",
    "complexity",
    "avg_shortest_path",
    "avg_deg_cent",
    "no_hardware_comp",
    "no_software_comp",
]
sns.pairplot(df_data[pairplot_columns])


### Correlation Matrix

In [None]:
# Correlation matrix of the numeric metrics, drawn as an annotated heatmap.
# Only numeric columns are correlated: besides the two name columns, the frame
# still carries at least one text column (`graph_str_rep`), which makes
# `DataFrame.corr()` raise on pandas >= 2.0. On older pandas it is dropped
# silently, leaving the tick labels out of sync with the matrix.
df_num_corr = (
    df_data.loc[:, ~df_data.columns.isin(['model_name', "sys_name"])]
    .select_dtypes(include="number")
)
corrmat = df_num_corr.corr()

plt.figure(figsize=(12, 8))
hm = sns.heatmap(
    corrmat,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 8},
    # Take the labels from the matrix itself so they always line up with it.
    yticklabels=corrmat.index,
    xticklabels=corrmat.columns,
    cmap="Spectral_r",
)
plt.show()

## Clustering

In [None]:
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans

In [None]:
# Feature matrix for clustering: one row per model, numeric columns only.
# Dropping just the two name columns is not enough, because the frame still
# carries a text column (`graph_str_rep`) that KMeans cannot convert to float.
# NOTE(review): KMeans also rejects NaN; confirm that the missing-value check
# earlier in the notebook reports none for these columns.
X = (
    df_data.loc[:, ~df_data.columns.isin(['model_name', "sys_name"])]
    .select_dtypes(include="number")
    .values
)

In [None]:
# Elbow method: fit KMeans for k = 1 .. no_clusters and record the
# within-cluster sum of squares (`inertia_`) of each fit.
# The range is inclusive, so `no_clusters` itself is evaluated
# (`range(1, no_clusters)` stopped one short).
wcss = []
no_clusters = 4
cluster_counts = range(1, no_clusters + 1)
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot the WCSS versus the number of clusters
plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid()
plt.show()

In [None]:
# Final clustering with two clusters.
# `random_state` is fixed (same seed as the elbow loop) so that the cluster
# assignments are reproducible across kernel restarts.
model = KMeans(n_clusters=2, n_init="auto", random_state=0)
model.fit(X)

In [None]:
# Assign every row of X to its nearest cluster, then collect the distinct
# cluster ids and the fitted cluster centres.
labels = model.predict(X)
centroids = model.cluster_centers_
clusters = unique(labels)
for summary in (labels, clusters):
    print(summary)

In [None]:
# Cluster scatter plot: model size against complexity, one colour per cluster.
# The black marker is the mean of each cluster in the two plotted columns.
# `centroids[:, 0]` / `centroids[:, 1]` were drawn before; those are the first
# two feature columns of X and were reused unchanged for every pair of axes.
x_col, y_col = "no_size", "complexity"
for cluster in clusters:
    members = df_data[labels == cluster]
    plt.scatter(members[x_col], members[y_col], label=cluster)
    plt.scatter(members[x_col].mean(), members[y_col].mean(), s=10, color='k', zorder=3)
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.legend()
plt.show()

In [None]:
# Cluster scatter plot: hardware against software component counts, one
# colour per cluster.
# The black marker is the mean of each cluster in the two plotted columns.
# `centroids[:, 0]` / `centroids[:, 1]` were drawn before; those are the first
# two feature columns of X and were reused unchanged for every pair of axes.
x_col, y_col = "no_hardware_comp", "no_software_comp"
for cluster in clusters:
    members = df_data[labels == cluster]
    plt.scatter(members[x_col], members[y_col], label=cluster)
    plt.scatter(members[x_col].mean(), members[y_col].mean(), s=10, color='k', zorder=3)
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.legend()
plt.show()

In [None]:
# Cluster scatter plot: coupling against software component count, one colour
# per cluster.
# The black marker is the mean of each cluster in the two plotted columns.
# `centroids[:, 0]` / `centroids[:, 1]` were drawn before; those are the first
# two feature columns of X and were reused unchanged for every pair of axes.
x_col, y_col = "coupling", "no_software_comp"
for cluster in clusters:
    members = df_data[labels == cluster]
    plt.scatter(members[x_col], members[y_col], label=cluster)
    plt.scatter(members[x_col].mean(), members[y_col].mean(), s=10, color='k', zorder=3)
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.legend()
plt.show()

In [None]:
# Rows assigned to cluster 0.
df_data.loc[labels == 0]

In [None]:
# Rows assigned to cluster 1.
df_data.loc[labels == 1]

In [None]:
# Rows assigned to cluster 2.
# NOTE(review): the model above is fitted with n_clusters=2, so no row carries
# this label and the result is an empty frame unless the model is refitted
# with more clusters.
df_data.loc[labels == 2]

In [None]:
# Rows assigned to cluster 3.
# NOTE(review): the model above is fitted with n_clusters=2, so no row carries
# this label and the result is an empty frame unless the model is refitted
# with more clusters.
df_data.loc[labels == 3]