# Analysis: Pandas vs. Spark DataFrame Computation Time

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

#### Load Correlation Benchmark Results

In [2]:
# Load the correlation benchmark results, drop the first CSV column and any
# incomplete rows, then derive the number of rows per measurement.
corr_raw = pd.read_csv('results_corr.csv')
corr_df = (
    corr_raw
    .drop(columns=corr_raw.columns[0])
    .dropna()
    .assign(row_count=lambda frame: frame['element_count'] / frame['column_count'])
    # sort_values returns a new frame; the result must be kept, otherwise the
    # ordering is silently discarded.
    .sort_values(by=['row_count', 'column_count'])
)
corr_df.head()

Unnamed: 0,element_count,column_count,array_time,pandas_df_time,pandas_df_compute,spark_df_time,spark_df_compute,row_count
4,10,1,1.8e-05,9.3e-05,0.000277,0.021119,0.48181,10.0
5,100,10,1.1e-05,5.2e-05,0.00016,0.024482,0.442082,10.0
6,1000,100,2.7e-05,9.6e-05,0.000435,0.047411,0.559718,10.0
7,10000,1000,0.000164,9.1e-05,0.025667,0.208266,2.635492,10.0
8,100,1,2.5e-05,5.2e-05,0.000286,0.017645,0.342826,100.0


#### Load Mean Benchmark Results

In [3]:
# Load the mean benchmark results, drop the first CSV column and any
# incomplete rows, then derive the number of rows per measurement.
mean_raw = pd.read_csv('results_mean.csv')
mean_df = (
    mean_raw
    .drop(columns=mean_raw.columns[0])
    .dropna()
    .assign(row_count=lambda frame: frame['element_count'] / frame['column_count'])
    # sort_values returns a new frame; the result must be kept, otherwise the
    # ordering is silently discarded.
    .sort_values(by=['row_count', 'column_count'])
)
mean_df.head()

Unnamed: 0,array_time,column_count,element_count,pandas_df_compute,pandas_df_time,spark_df_compute,spark_df_time,row_count
0,1e-05,1,1,0.000111,5.2e-05,0.130022,0.013988,1.0
1,2e-05,10,10,0.000255,9.4e-05,0.152254,0.019682,1.0
2,1.8e-05,100,100,0.000644,0.000102,0.850829,0.033399,1.0
3,2e-05,1000,1000,0.00215,5.3e-05,7.937093,0.154767,1.0
4,1e-05,1,10,0.000107,5.2e-05,0.098161,0.012703,10.0


### 2D Stacked Line Charts

In [13]:
# Compare how compute time grows with row count, one line per column count.
# Both axes are natural-log transformed and the y-axis is shared so the two
# panels can be read against each other.
df = corr_df
fig, ax = plt.subplots(1, 2, sharey=True)
panels = (
    (ax[0], 'pandas_df_compute', 'Pandas DataFrame'),
    (ax[1], 'spark_df_compute', 'Spark DataFrame'),
)
for panel_ax, time_column, panel_title in panels:
    for each_column_ct in df['column_count'].unique():
        # Sort each series by row count so the line is always drawn
        # left-to-right, independent of the row order of the input frame.
        sub_df = df[df['column_count'] == each_column_ct].sort_values('row_count')
        panel_ax.plot(np.log(sub_df['row_count']),
                      np.log(sub_df[time_column]),
                      label=each_column_ct)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('log(row_count)')
ax[0].set_ylabel('log(compute_time (s))')
fig.suptitle('DataFrame Corr Computation Time')
# Line labels are identical in both panels, so one legend is enough.
ax[1].legend(title='column_count')

<matplotlib.legend.Legend at 0x7fa763a6c6a0>

### 3D Scatter Plots

In [18]:
# %matplotlib notebook
# 3D scatter of the mean-computation timings. All three axes are plotted in
# natural-log space; the tick labels show the corresponding original values.
df = mean_df
fig = plt.figure()
ax = plt.axes(projection='3d')
log_row_counts = np.log(df['row_count'])
log_column_counts = np.log(df['column_count'])
ax.scatter3D(log_row_counts,
             log_column_counts,
             np.log(df['pandas_df_compute']),
             c='tab:red')
ax.scatter3D(log_row_counts,
             log_column_counts,
             np.log(df['spark_df_compute']),
             c='tab:blue')

# Tick positions are in log space, labels are the original values.
all_row_counts = df['row_count'].unique()
all_column_counts = df['column_count'].unique()
xlabels_exp = sorted(all_row_counts)
# Keep every second row count so the x-axis labels do not overlap.
xlabels_exp_red = xlabels_exp[::2]
xticks_locs = np.log(xlabels_exp_red)
xticks_labels = xlabels_exp_red
ylabels_exp = sorted(all_column_counts)
yticks_locs = np.log(ylabels_exp)
yticks_labels = ylabels_exp
min_time = df[['spark_df_compute', 'pandas_df_compute']].min().min()
max_time = df[['spark_df_compute', 'pandas_df_compute']].max().max()
zlabels_log = np.linspace(np.log(min_time), np.log(max_time), 5)
zticks_locs = zlabels_log
# Round to two significant figures rather than two decimals: the smallest
# timings are far below 0.01 s and would otherwise all be labelled 0.0.
zticks_labels = np.array([float(f'{seconds:.2g}') for seconds in np.exp(zlabels_log)])

ax.set_xlabel('row_count')
ax.set_xticks(xticks_locs)
ax.set_xticklabels(xticks_labels)
ax.set_ylabel('column_count')
ax.set_yticks(yticks_locs)
ax.set_yticklabels(yticks_labels)
ax.set_zlabel('compute_time (s)')
ax.set_zticks(zticks_locs)
ax.set_zticklabels(zticks_labels)
ax.set_title('DataFrame Mean Computation Time (s)', pad=25)

# Proxy patches identify the two colour-coded series.
ax.legend(handles=[
    mpatches.Patch(color='tab:blue', label='Spark DataFrame'),
    mpatches.Patch(color='tab:red', label='Pandas DataFrame'),
])

plt.show()

### 3D Surface Plots

#### 1) Model to Draw Plane - Pandas & Spark
#### 2) Meshgrid for Plane - Pandas & Spark

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def _fit_log_surface(features, grid_features, compute_times, label, planes_count):
    """Fit log(compute time) on the polynomial features and evaluate it on the grid.

    Prints the R^2 score of the fit on the training points and returns the
    predictions for ``grid_features`` reshaped to rows of ``planes_count``.
    """
    log_times = np.log(compute_times)
    model = LinearRegression()
    model.fit(features, log_times)
    print(f'{label} R^2 Score: {model.score(features, log_times)}')
    return model.predict(grid_features).reshape(-1, planes_count)


def get_plane(input_df, degree, planes_count):
    """Fit polynomial surfaces to the Pandas and Spark compute times in log space.

    Parameters
    ----------
    input_df : DataFrame with the columns ``row_count``, ``column_count``,
        ``pandas_df_compute`` and ``spark_df_compute``.
    degree : degree passed to ``PolynomialFeatures``.
    planes_count : number of grid points along each of the two input axes.

    Returns
    -------
    A 10-tuple ``(X_1, X_2, pandas_Z, spark_Z, xticks_locs, xticks_labels,
    yticks_locs, yticks_labels, zticks_locs, zticks_labels)``:

    * ``X_1``, ``X_2`` -- meshgrid of log(row_count) and log(column_count).
    * ``pandas_Z``, ``spark_Z`` -- predicted log(compute time) on that grid.
    * ``*ticks_locs`` -- tick positions in log space.
    * ``*ticks_labels`` -- the matching values in original (non-log) units.

    Side effect: prints the R^2 score of each fit.
    """
    # Fit on a plain array (not a DataFrame) so that transforming the ndarray
    # grid below does not mix named and unnamed feature inputs.
    log_inputs = np.log(np.asarray(input_df[['row_count', 'column_count']], dtype=float))
    poly = PolynomialFeatures(degree=degree)
    transformed_x = poly.fit_transform(log_inputs)

    # Regular grid in log space spanning the observed row/column counts.
    all_row_counts = input_df['row_count'].unique()
    all_column_counts = input_df['column_count'].unique()
    x_1 = np.linspace(np.log(all_row_counts.min()), np.log(all_row_counts.max()), planes_count)
    x_2 = np.linspace(np.log(all_column_counts.min()), np.log(all_column_counts.max()), planes_count)
    X_1, X_2 = np.meshgrid(x_1, x_2)
    grid = np.column_stack([X_1.ravel(), X_2.ravel()])
    transformed_grid = poly.transform(grid)

    # One regression per engine, both evaluated on the same grid.
    pandas_Z = _fit_log_surface(transformed_x, transformed_grid,
                                input_df['pandas_df_compute'], 'Pandas', planes_count)
    spark_Z = _fit_log_surface(transformed_x, transformed_grid,
                               input_df['spark_df_compute'], 'Spark', planes_count)

    # X ticks: every second row count, to keep the labels from overlapping.
    xticks_labels = sorted(all_row_counts)[::2]
    xticks_locs = np.log(xticks_labels)
    # Y ticks: every column count.
    yticks_labels = sorted(all_column_counts)
    yticks_locs = np.log(yticks_labels)
    # Z ticks: five positions evenly spaced in log space over both engines.
    timings = input_df[['spark_df_compute', 'pandas_df_compute']]
    min_time = timings.min().min()
    max_time = timings.max().max()
    zticks_locs = np.linspace(np.log(min_time), np.log(max_time), 5)
    # Two significant figures instead of two decimals: timings well below
    # 0.01 s would otherwise all be labelled 0.0.
    zticks_labels = np.array([float(f'{seconds:.2g}') for seconds in np.exp(zticks_locs)])

    return X_1, X_2, pandas_Z, spark_Z, xticks_locs, xticks_labels, yticks_locs, yticks_labels, zticks_locs, zticks_labels

#### Plot

In [7]:
%matplotlib

# Side-by-side wireframes of the fitted compute-time surfaces: correlation on
# the left, mean on the right. Axes are in log space with original-unit labels.
fig = plt.figure(figsize=(10, 5))
ax_corr = fig.add_subplot(1, 2, 1, projection='3d')
ax_mean = fig.add_subplot(1, 2, 2, projection='3d')

panels = (
    (ax_corr, corr_df, 'DataFrame Corr Computation Time (s)'),
    (ax_mean, mean_df, 'DataFrame Mean Computation Time (s)'),
)
for panel_ax, panel_df, panel_title in panels:
    # Degree-2 polynomial fit evaluated on a 20 x 20 grid.
    (X_1, X_2, pandas_Z, spark_Z,
     xticks_locs, xticks_labels,
     yticks_locs, yticks_labels,
     zticks_locs, zticks_labels) = get_plane(panel_df, 2, 20)

    panel_ax.plot_wireframe(X_1, X_2, spark_Z,
                            color='tab:blue',
                            label='Spark DataFrame')
    panel_ax.plot_wireframe(X_1, X_2, pandas_Z,
                            color='tab:red',
                            label='Pandas DataFrame')

    panel_ax.set_xlabel('row_count')
    panel_ax.set_xticks(xticks_locs)
    panel_ax.set_xticklabels(xticks_labels)
    panel_ax.set_ylabel('column_count')
    panel_ax.set_yticks(yticks_locs)
    panel_ax.set_yticklabels(yticks_labels)
    panel_ax.set_zlabel('compute_time (s)')
    panel_ax.set_zticks(zticks_locs)
    panel_ax.set_zticklabels(zticks_labels)
    panel_ax.set_title(panel_title, pad=25)

# Proxy patches give the legend solid colour swatches for the two wireframes.
col1_patch = mpatches.Patch(color='tab:blue', label='Spark DataFrame')
col2_patch = mpatches.Patch(color='tab:red', label='Pandas DataFrame')
# Attach the legend to the right-hand panel explicitly instead of relying on
# whichever axes happens to be current.
ax_mean.legend(handles=[col1_patch, col2_patch])

# Optional: uncomment to save rotation frames for a movie. Frames are denser
# (every 3 degrees) between azimuth 180 and 270, and every 10 degrees elsewhere.
# # create movie
# for ii in range(0,360,1):
#     if ii>=180 and ii<=270 and ii%3==0: 
#         ax_corr.view_init(elev=10., azim=ii)
#         ax_mean.view_init(elev=10., azim=ii)
#         plt.savefig("movie/movie%d.png" % ii)
#     elif (ii<=180 or ii>=270) and ii%10==0: 
#         ax_corr.view_init(elev=10., azim=ii)
#         ax_mean.view_init(elev=10., azim=ii)
#         plt.savefig("movie/movie%d.png" % ii)

#240 puts 0, 0 on left
# ax_corr.view_init(elev=5, azim=240)
plt.show()

Using matplotlib backend: Qt5Agg
Pandas R^2 Score: 0.9919412306340483
Spark R^2 Score: 0.9691618615469142
Pandas R^2 Score: 0.9483225634815378
Spark R^2 Score: 0.9624447516279816


In [12]:
# Display the last rows of the mean benchmark, restricted to the two size
# columns and the two compute-time columns.
mean_df.loc[:, ['row_count', 'column_count', 'pandas_df_compute', 'spark_df_compute']].tail()

Unnamed: 0,row_count,column_count,pandas_df_compute,spark_df_compute
21,100000.0,10,0.003276,0.28118
22,100000.0,100,0.056344,2.433687
23,1000000.0,1,0.001849,0.898498
24,1000000.0,10,0.066983,1.903459
25,10000000.0,1,0.017181,7.524165


In [None]:
# Re-initialise the camera of the correlation panel; `ax_corr` must already
# exist from the surface-plot cell.
# NOTE(review): called without arguments, which presumably restores
# matplotlib's default elevation/azimuth — confirm against the Axes3D docs.
ax_corr.view_init()