In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# from matplotlib import cm

In [79]:
# Load the benchmark results, discard the first column of the file and
# keep only rows without missing values.
# NOTE(review): the first column is presumably a saved index — confirm.
df = pd.read_csv('results_corr.csv')
df = df.drop(columns=df.columns[0]).dropna()
df.head()

Unnamed: 0,element_count,column_count,array_time,pandas_df_time,pandas_df_compute,spark_df_time,spark_df_compute
4,10,1,1.8e-05,9.3e-05,0.000277,0.021119,0.48181
5,100,10,1.1e-05,5.2e-05,0.00016,0.024482,0.442082
6,1000,100,2.7e-05,9.6e-05,0.000435,0.047411,0.559718
7,10000,1000,0.000164,9.1e-05,0.025667,0.208266,2.635492
8,100,1,2.5e-05,5.2e-05,0.000286,0.017645,0.342826


In [80]:
# Derive the number of rows per frame: total elements divided by columns.
df['row_count'] = df['element_count'].div(df['column_count'])
# Show the results ordered by frame shape (rows first, then columns).
df.sort_values(by=['row_count', 'column_count'])

Unnamed: 0,element_count,column_count,array_time,pandas_df_time,pandas_df_compute,spark_df_time,spark_df_compute,row_count
4,10,1,1.8e-05,9.3e-05,0.000277,0.021119,0.48181,10.0
5,100,10,1.1e-05,5.2e-05,0.00016,0.024482,0.442082,10.0
6,1000,100,2.7e-05,9.6e-05,0.000435,0.047411,0.559718,10.0
7,10000,1000,0.000164,9.1e-05,0.025667,0.208266,2.635492,10.0
8,100,1,2.5e-05,5.2e-05,0.000286,0.017645,0.342826,100.0
9,1000,10,2.7e-05,8e-05,0.000196,0.024981,0.358848,100.0
10,10000,100,0.000149,8.3e-05,0.002368,0.07977,0.452794,100.0
11,100000,1000,0.001066,5.2e-05,0.209447,0.688059,2.490437,100.0
12,1000,1,1.9e-05,5.2e-05,0.00016,0.030016,0.338364,1000.0
13,10000,10,0.000105,5.4e-05,0.000412,0.077821,0.354641,1000.0


### 2D Stacked Line Charts

In [44]:
# One panel per engine, sharing the y axis so the two are directly comparable.
# Each line is one column_count; both axes show natural logs of the data.
fig, ax = plt.subplots(1, 2, sharey=True)
panels = [('pandas_df_compute', 'Pandas DataFrame'),
          ('spark_df_compute', 'Spark DataFrame')]
for panel_ax, (time_col, panel_title) in zip(ax, panels):
    for each_column_ct in df['column_count'].unique():
        sub_df = df[df['column_count'] == each_column_ct]
        panel_ax.plot(np.log(sub_df['row_count']),
                      np.log(sub_df[time_col]),
                      label=each_column_ct)
    # Without a title the two panels are indistinguishable.
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('log(row_count)')
ax[0].set_ylabel('log(compute time in s)')
# Lines use the same colours in both panels, so one legend is enough.
ax[1].legend(title='column_count')

<matplotlib.legend.Legend at 0x7ff4d5bb9b38>

### 3D Scatter Plots

In [81]:
# (Enable `%matplotlib notebook` here for an interactive, rotatable figure.)

fig = plt.figure()
ax = plt.axes(projection='3d')

# Both point clouds share the same log-scaled x/y coordinates.
log_rows = np.log(df['row_count'])
log_columns = np.log(df['column_count'])

# Pandas timings in red, Spark timings in blue (z is the log of the time).
series_colours = (('pandas_df_compute', 'tab:red'),
                  ('spark_df_compute', 'tab:blue'))
for time_col, colour in series_colours:
    ax.scatter3D(log_rows, log_columns, np.log(df[time_col]), c=colour)

ax.set_xlabel('row_count')
ax.set_ylabel('column_count')
ax.set_zlabel('df_compute')

Text(0.5, 0, 'df_compute')

### 3D Surface Plots

#### Model to Draw Plane - Pandas

In [83]:
# Regress log(pandas compute time) on a degree-2 polynomial expansion of
# log(row_count) and log(column_count).
pandas_x = np.log(df[['row_count', 'column_count']])
pandas_compute = np.log(df['pandas_df_compute'])

pandas_poly = PolynomialFeatures(degree=2)
transformed_pandas_x = pandas_poly.fit_transform(pandas_x)

pandas_lr_poly = LinearRegression().fit(transformed_pandas_x, pandas_compute)
# Score of the fit on the same data it was trained on.
display(pandas_lr_poly.score(transformed_pandas_x, pandas_compute))

# Fitted values next to the observations, with their difference.
pandas_results_poly = pandas_lr_poly.predict(transformed_pandas_x)
pandas_results_df = pd.DataFrame({
    'y': pandas_compute,
    'results_poly': pandas_results_poly,
    'diff_poly': pandas_results_poly - pandas_compute,
})

0.9919412306340483

#### Meshgrid for Plane - Pandas

In [84]:
# Build a planes_count x planes_count grid spanning the observed range of
# log(row_count) and log(column_count), then evaluate the pandas model on it.
planes_count = 10
all_row_counts = df['row_count'].unique()
all_column_counts = df['column_count'].unique()

x_1 = np.linspace(np.log(all_row_counts.min()),
                  np.log(all_row_counts.max()),
                  num=planes_count)
x_2 = np.linspace(np.log(all_column_counts.min()),
                  np.log(all_column_counts.max()),
                  num=planes_count)
X_1, X_2 = np.meshgrid(x_1, x_2)

# One (log_rows, log_columns) pair per grid node, as two column vectors.
r1 = X_1.flatten()[:, np.newaxis]
r2 = X_2.flatten()[:, np.newaxis]
grid = np.column_stack((r1, r2))

# Predict on the expanded grid and fold the result back into grid rows.
transformed_grid = pandas_poly.transform(grid)
pandas_z = pandas_lr_poly.predict(transformed_grid)
pandas_Z = pandas_z.reshape(-1, planes_count)

#### Model to Draw Plane - Spark

In [86]:
# Same model as for pandas, fitted to the log of the Spark compute time:
# degree-2 polynomial features of log(row_count) and log(column_count).
spark_x = np.log(df[['row_count', 'column_count']])
spark_compute = np.log(df['spark_df_compute'])

spark_poly = PolynomialFeatures(degree=2)
transformed_spark_x = spark_poly.fit_transform(spark_x)

spark_lr_poly = LinearRegression().fit(transformed_spark_x, spark_compute)
# Score of the fit on the same data it was trained on.
display(spark_lr_poly.score(transformed_spark_x, spark_compute))

# Fitted values next to the observations, with their difference.
spark_results_poly = spark_lr_poly.predict(transformed_spark_x)
spark_results_df = pd.DataFrame({
    'y': spark_compute,
    'results_poly': spark_results_poly,
    'diff_poly': spark_results_poly - spark_compute,
})

0.9691618615469142

#### Meshgrid for Plane - Spark

In [87]:
# Rebuild the log-space evaluation grid (planes_count points per axis) and
# evaluate the Spark model on every node.
planes_count = 10
all_row_counts = df['row_count'].unique()
all_column_counts = df['column_count'].unique()

x_1 = np.linspace(np.log(all_row_counts.min()),
                  np.log(all_row_counts.max()),
                  num=planes_count)
x_2 = np.linspace(np.log(all_column_counts.min()),
                  np.log(all_column_counts.max()),
                  num=planes_count)
X_1, X_2 = np.meshgrid(x_1, x_2)

# Flatten the grid into two column vectors and pair them up node by node.
r1 = X_1.flatten()[:, np.newaxis]
r2 = X_2.flatten()[:, np.newaxis]
grid = np.column_stack((r1, r2))

# Predict on the expanded grid and fold the result back into grid rows.
transformed_grid = spark_poly.transform(grid)
spark_z = spark_lr_poly.predict(transformed_grid)
spark_Z = spark_z.reshape(-1, planes_count)

#### Plot

In [88]:
%matplotlib

fig=plt.figure(figsize=(7.5, 5))
#====first plot====
# ax=fig.add_subplot(1, 2, 1, projection='3d')
ax=plt.axes(projection='3d')
ax.plot_wireframe(X_1,
                X_2, 
                spark_Z, 
                color='tab:blue', 
                label='Spark DataFrame')

# ax=fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_wireframe(X_1,
                X_2, 
                pandas_Z, 
                color='tab:red', 
                label='Pandas DataFrame')

xlabels_exp=sorted(df['row_count'].unique())
xlabels_exp_red=[label for idx, label in enumerate(xlabels_exp) if idx%2==0]
xlabels=np.log(xlabels_exp_red)
ax.set_xlabel('row_count')
ax.set_xticks(xlabels)
ax.set_xticklabels(xlabels_exp_red)

ylabels_exp=sorted(df['column_count'].unique())
ylabels=np.log(ylabels_exp)
ax.set_ylabel('column_count')
ax.set_yticks(ylabels)
ax.set_yticklabels(ylabels_exp)

min_time=df[['spark_df_compute', 'pandas_df_compute']].min().min()
max_time=df[['spark_df_compute', 'pandas_df_compute']].max().max()
zlabels_log=np.linspace(np.log(min_time), np.log(max_time), 5)
zlabels=np.exp(zlabels_log)
ax.set_zlabel('compute_time (s)')
ax.set_zticks(zlabels_log)
ax.set_zticklabels(zlabels.round(2))

ax.set_title('DataFrame Mean Computation Time (s)', pad=25)
col1_patch = mpatches.Patch(color='tab:blue', label='Spark DataFrame')
col2_patch = mpatches.Patch(color='tab:red', label='Pandas DataFrame')
plt.legend(handles=[col1_patch, col2_patch])

#====second plot====
# ax=plt.fig_subplot(1, 2, 1, projection='3d')
# ax.plot_surface(X_1,
#                 X_2, 
#                 pandas_Z, 
#                 color='tab:red', 
#                 label='Pandas DataFrame')

# xlabels_exp=sorted(df['row_count'].unique())
# xlabels_exp_red=[label for idx, label in enumerate(xlabels_exp) if idx%2==0]
# xlabels=np.log(xlabels_exp_red)
# ax.set_xlabel('row_count')
# ax.set_xticks(xlabels)
# ax.set_xticklabels(xlabels_exp_red)

# ylabels_exp=sorted(df['column_count'].unique())
# ylabels=np.log(ylabels_exp)
# ax.set_ylabel('column_count')
# ax.set_yticks(ylabels)
# ax.set_yticklabels(ylabels_exp)

# min_time=df[['spark_df_compute', 'pandas_df_compute']].min().min()
# max_time=df[['spark_df_compute', 'pandas_df_compute']].max().max()
# zlabels_log=np.linspace(np.log(min_time), np.log(max_time), 5)
# zlabels=np.exp(zlabels_log)
# ax.set_zlabel('compute_time (s)')
# ax.set_zticks(zlabels_log)
# ax.set_zticklabels(zlabels.round(2))

# ax.set_title('DataFrame Count Computation Time (s)', pad=25)
# col1_patch = mpatches.Patch(color='tab:blue', label='Spark DataFrame')
# col2_patch = mpatches.Patch(color='tab:red', label='Pandas DataFrame')
# plt.legend(handles=[col1_patch, col2_patch])

# for ii in range(0,360,1):
#     if ii>=180 and ii<=270 and ii%3==0: 
#         ax.view_init(elev=10., azim=ii)
#         plt.savefig("movie/movie%d.png" % ii)
#     elif (ii<=180 or ii>=270) and ii%10==0: 
#         ax.view_init(elev=10., azim=ii)
#         plt.savefig("movie/movie%d.png" % ii)

#240 puts 0, 0 on left
ax.view_init(elev=5, azim=240)
plt.show()

Using matplotlib backend: Qt5Agg


In [7]:
# NOTE(review): the correlation matrix of a single column is always 1.0 —
# confirm which other columns were meant to be included.
df.loc[:, ['element_count']].corr()

Unnamed: 0,element_count
element_count,1.0
