# Libraries 


In [27]:
#! pip install pandas
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np


In [2]:
import pandas as pd
import glob
import os

path = '/content/Data' # use your path
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so sort
# the file list first; otherwise the row order of df changes between machines/runs.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Files are read in reverse (descending name) order and stacked into one frame
# with a fresh 0..N-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files[::-1]), ignore_index=True)
# 'Unnamed: 0' is dropped right after loading; a new frame is bound instead of
# mutating in place so re-running the cell is idempotent.
df = df.drop(columns=['Unnamed: 0'])

# Basic Insights

In [3]:
df.shape


(10220, 5)

In [4]:
df.tail()

Unnamed: 0,m,n,kernel,n_iter,training_time
10215,1000,9,0,5708,0.042053
10216,1000,6,1,981,0.045363
10217,1000,6,0,5436,0.05162
10218,1000,3,1,758,0.047805
10219,1000,3,0,1173,0.039243


In [5]:
df.head()

Unnamed: 0,m,n,kernel,n_iter,training_time
0,5100,30,1,4328,1.168847
1,5100,30,0,162120,2.086717
2,5100,27,1,4452,1.143359
3,5100,27,0,68016,1.895257
4,5100,24,1,4554,1.130787


In [6]:
df.describe()

Unnamed: 0,m,n,kernel,n_iter,training_time
count,10220.0,10220.0,10220.0,10220.0,10220.0
mean,2551.956947,16.5,0.5,19689.313699,0.421133
std,1472.475751,8.617266,0.500024,31710.968663,0.404353
min,10.0,3.0,0.0,5.0,0.000524
25%,1270.0,9.0,0.0,1856.75,0.07399
50%,2550.0,16.5,0.5,3618.0,0.315645
75%,3830.0,24.0,1.0,24859.25,0.672307
max,5100.0,30.0,1.0,250789.0,2.119709




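1.   m, n and kernel are exactly how we want them to be.
2.   n_iter looks suspicious:
      <br>
      its range is very wide (min 5, max 250,789), and the mean (≈19,689) is far above the median (3,618), so the distribution is heavily right-skewed.
3.   training_time:
      <br>
      it ranges from about 0.0005 to 2.12, with the mean (≈0.42) above the median (≈0.32), so it is also right-skewed, though far less than n_iter.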
      







In [7]:
# Calculating VIFs (variance inflation factors) for the predictor columns
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Predictors only: every column except the target 'training_time'.
X = df.drop(columns=['training_time'])
# Prepend a constant column; this is why a 'const' row appears in the result.
x = add_constant(X)
# One VIF per column of x (computed once; the Series index carries the column names).
vif = pd.Series([variance_inflation_factor(x.values, i) for i in range(x.shape[1])], index=x.columns)
print(vif)

const     8.690205
m         1.145194
n         1.422166
kernel    1.691301
n_iter    2.258661
dtype: float64


  x = pd.concat(x[::order], 1)


In [8]:
# Display the list of colours in Plotly's sequential "Tealgrn" palette.
px.colors.sequential.Tealgrn

['rgb(176, 242, 188)',
 'rgb(137, 232, 172)',
 'rgb(103, 219, 165)',
 'rgb(76, 200, 163)',
 'rgb(56, 178, 163)',
 'rgb(44, 152, 160)',
 'rgb(37, 125, 152)']

In [9]:
class ImagesBank:
    """Build Plotly figures from one DataFrame and remember the latest of each kind.

    Each ``draw_*`` method creates a figure from ``self.df``, applies the shared
    layout from :meth:`layout`, stores the figure on the matching attribute
    (``bubble``, ``boxplot``, ``displot``) and returns it.
    """

    def __init__(self, dataframe) -> None:
        """Store ``dataframe`` as ``self.df`` and reset all figure slots to None."""
        self.df = dataframe
        # Most recent figure of each kind; None until the matching draw_* call.
        self.scatter = None  # no method of this class assigns this slot
        self.displot = None
        self.boxplot = None
        self.bubble = None

    def draw_bubble(self, x: str, y: str, color: str, log_x=False, title= None):
        """Scatter plot of column ``y`` against column ``x``, coloured by ``color``.

        Args:
            x: Column name for the x axis.
            y: Column name for the y axis.
            color: Column name mapped onto the 'tealgrn' continuous colour scale.
            log_x: If True, draw the x axis on a log scale.
            title: Optional figure title.

        Returns:
            The Plotly figure, which is also stored in ``self.bubble``.
        """
        # log_x and title are forwarded to px.scatter so the caller's values
        # take effect; their defaults (False / None) match px.scatter's own.
        fig = px.scatter(
            self.df,
            x=x,
            y=y,
            color=color,
            log_x=log_x,
            title=title,
            size_max=6,
            color_continuous_scale='tealgrn',
        )
        self.layout(fig)
        self.bubble = fig
        return self.bubble


    def draw_box(self, y: str, color = px.colors.sequential.Tealgrn[3:]):
        """Box plot of column ``y``.

        Args:
            y: Column name to plot.
            color: Discrete colour sequence passed to ``px.box``; defaults to
                the Tealgrn palette from its fourth colour onward.

        Returns:
            The Plotly figure, which is also stored in ``self.boxplot``.
        """
        fig = px.box(self.df, y=y, color_discrete_sequence=color)
        self.layout(fig)
        self.boxplot = fig
        return self.boxplot


    def draw_dist(self, column: str, binsize = 0.001, curve= 'normal'):
        """Distribution plot (histogram plus fitted curve) of a single column.

        Args:
            column: Column name to plot; also used as the group label.
            binsize: Histogram bin width, passed as ``bin_size``.
            curve: Curve type passed as ``curve_type`` (e.g. 'normal' or 'kde').

        Returns:
            The Plotly figure, which is also stored in ``self.displot``.
        """
        # create_distplot takes a list of data groups and a parallel list of labels.
        hist_data = [self.df[column]]
        label = [column]
        fig = ff.create_distplot(
            hist_data,
            label,
            bin_size=binsize,
            curve_type=curve,
            colors=px.colors.sequential.Tealgrn[3:],
        )
        self.layout(fig)
        self.displot = fig
        return self.displot

    def layout(self, figure):
        """Apply the shared look (600x600, white template, monospace font) in place.

        Updates ``figure`` via ``update_layout``; returns None.
        """
        figure.update_layout(
            title_x=0.16,
            width=600,
            height=600,
            legend=dict(
                yanchor="top",
                y=1.1,
                xanchor="left",
                x=0.01),
            template='plotly_white',
            plot_bgcolor="rgb(255,255,255)",
            font=dict(family="Courier New, monospace",
                      size=14))

    def add_titles(self, figure, title=None, x_title=None, y_title=None):
        """Set the figure title and axis titles on ``figure`` and return it."""
        figure.update_layout(title_text=title,
                             xaxis_title=x_title,
                             yaxis_title=y_title)
        return figure

In [10]:
df_whole = ImagesBank(df)
df_whole.draw_bubble(x="m", y="training_time", color="n")
df_whole.add_titles(df_whole.bubble, 'Bubble Plot', x_title='m', y_title='training time').show()

**Interpretation :**

Two trends appear; most probably they correspond to the two kernels.

For both kernels, when m is low, different values of n give similar training_time. However, when m is high (around 5000), different values of n give noticeably different training times.




In [12]:
df_whole.draw_box('n_iter',px.colors.sequential.Tealgrn[5:])

In [11]:
df_whole.draw_box('training_time')

In [13]:
df_whole.draw_dist(column='training_time', curve='kde')
df_whole.add_titles(df_whole.displot, 'Distribution of Training Time', x_title='possible values', y_title='density/frequency').show()

## Compare Kernels


In [14]:
# Give each wrapper its own copy of the filtered rows: later cells add columns
# to kernel1.df, and writing into a boolean-mask slice of df is chained
# assignment (pandas emits SettingWithCopyWarning and the write may not stick).
kernel0 = ImagesBank(df[df.kernel == 0].copy())
kernel1 = ImagesBank(df[df.kernel == 1].copy())

In [15]:
kernel0.draw_bubble(x="m", y="training_time", color="n")
kernel0.add_titles(kernel0.bubble, 'Bubble Plot-linear kernel', x_title='m', y_title='training time').show()

In [16]:
kernel1.draw_bubble(x="m", y="training_time", color="n")
kernel1.add_titles(kernel1.bubble, 'Bubble Plot-rbf kernel', x_title='m', y_title='training time').show()

In [17]:
kernel0.draw_box('training_time')

In [18]:
kernel1.draw_box('training_time')

In [19]:
kernel0.draw_box('n_iter',px.colors.sequential.Tealgrn[5:])

In [20]:
kernel1.draw_box('n_iter',px.colors.sequential.Tealgrn[5:])

In [25]:
kernel1.draw_dist('training_time', binsize=0.1, curve='kde')

In [29]:
# log base 10 of the 'training_time' column, stored as a new column on kernel1.df
kernel1.df['training_time_base10'] = np.log10(kernel1.df['training_time'])

In [33]:
# log base 2 of the 'training_time' column, stored as a new column on kernel1.df
kernel1.df['training_time_base2'] = np.log2(kernel1.df['training_time'])

In [41]:
# square root on 'training_time' column
kernel1.df['training_time_square_root'] = np.power(kernel1.df['training_time'], 1/2)

In [42]:
# cube root on 'training_time' column
kernel1.df['training_time_cube_root'] = np.power(kernel1.df['training_time'], 1/3)

In [43]:
kernel1.df.describe()

Unnamed: 0,m,n,kernel,n_iter,training_time,training_time_base10,training_time_base2,training_time_square,training_time_cube,training_time_square_root,training_time_cube_root
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,2551.956947,16.5,1.0,2146.615851,0.36555,-0.789504,-2.622676,0.240396,0.1893908,0.524961,0.621426
std,1472.547803,8.617687,0.0,1217.860586,0.326788,0.741812,2.464246,0.3252656,0.3249012,0.299973,0.264986
min,10.0,3.0,1.0,5.0,0.000524,-3.280789,-10.898544,2.744243e-07,1.437586e-10,0.022888,0.080613
25%,1270.0,9.0,1.0,1098.25,0.065607,-1.18305,-3.930006,0.004304286,0.0002823918,0.256139,0.40332
50%,2550.0,16.5,1.0,2153.0,0.295285,-0.529758,-1.759819,0.0871934,0.02574693,0.543402,0.665908
75%,3830.0,24.0,1.0,3174.5,0.595206,-0.225332,-0.748538,0.3542707,0.2108642,0.771496,0.841181
max,5100.0,30.0,1.0,4691.0,1.196411,0.07788,0.258712,1.431398,1.71254,1.093806,1.061598


In [44]:
kernel1.draw_dist('training_time_base10', binsize=0.1, curve='kde')

In [45]:
kernel1.draw_dist('training_time_base2', binsize=0.1, curve='kde')

In [50]:
kernel1.draw_dist('training_time_square_root', binsize=0.01, curve='kde')

In [51]:
kernel1.draw_dist('training_time_cube_root', binsize=0.01, curve='kde')

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from imblearn.pipeline import Pipeline

def log_transform(x):
    """Return ``log(1 + x)`` element-wise.

    Uses ``np.log1p``, which stays accurate for values of ``x`` close to zero.
    Inputs below -1 produce NaN and -1 produces -inf, as with any logarithm.
    """
    return np.log1p(x)


scaler = StandardScaler()
transformer = FunctionTransformer(log_transform)
# Log-transform BEFORE scaling: log_transform takes log(x + 1), which is NaN for
# inputs below -1, and standardized features are centred on 0 and routinely fall
# below -1. Applying the log to the raw values and scaling afterwards avoids
# feeding NaNs to the regressor (this assumes the raw features are > -1).
# memory='sklearn_tmp_memory' caches fitted transformers in that directory.
# NOTE(review): your_regressor, X_train, y_train, X_test and y_test are not
# defined anywhere in the visible notebook; supply them before running this cell.
pipe = Pipeline(
    steps=[('transformer', transformer), ('scaler', scaler), ('regressor', your_regressor)],
    memory='sklearn_tmp_memory',
)

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

In [None]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of the column. Rows with a missing value in
    ``col_name`` are dropped because they compare False against both fences.

    Args:
        df_in: DataFrame to filter (not modified).
        col_name: Name of the numeric column used to detect outliers.

    Returns:
        A DataFrame with the retained rows, keeping their original index.
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Inclusive bounds: a value lying exactly on a fence is not an outlier. This
    # also keeps every row when the IQR is 0 (e.g. a constant column), where
    # strict inequalities would discard the whole frame.
    within_fences = df_in[col_name].between(fence_low, fence_high)
    df_out = df_in.loc[within_fences]
    return df_out

In [None]:
# remove_outlier indexes its first argument like a DataFrame, but kernel1 and
# kernel0 are ImagesBank wrappers, so pass the frames they hold in .df.
k1c = remove_outlier(kernel1.df, col_name='training_time')
k0c = remove_outlier(kernel0.df, col_name='training_time')

In [None]:
data_no_outlier = pd.concat([k1c,k0c])

In [None]:
data_no_outlier.describe()

In [None]:
# Seed NumPy's global generator so the sample, and therefore the histogram,
# is identical on every Restart & Run All.
np.random.seed(42)
mu, sigma = 0, 0.1
s = np.random.normal(mu, sigma, size=(1000))
import matplotlib.pyplot as plt
# Density-normalized histogram of the sample with 30 bins.
count, bins, ignored = plt.hist(s, 30, density=True)
# Overlay the N(mu, sigma) probability density evaluated at the bin edges.
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
               np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
         linewidth=2, color='r')
plt.show()

In [None]:
mu=0.0
# Noise standard deviation is set to 90% of the sample standard deviation of s.
# NOTE(review): the earlier comment here read "for %5 Gaussian noise", which
# does not match the 0.90 factor — confirm the intended noise level.
std = 0.90 * np.std(s)
def gaussian_noise(x,mu,std):
    """Return ``x`` with independent Gaussian noise added to every element.

    Args:
        x: Array-like of numbers (NumPy array, pandas object, list or scalar).
        mu: Mean of the noise.
        std: Standard deviation of the noise (must be >= 0).

    Returns:
        ``x + noise``, where ``noise`` is drawn from N(mu, std) with the same
        shape as ``x``.
    """
    # np.shape works for any array-like, so plain lists and scalars (which
    # have no .shape attribute) are accepted as well as arrays.
    noise = np.random.normal(mu, std, size=np.shape(x))
    x_noisy = x + noise
    return x_noisy

# Add Gaussian noise (mean mu, standard deviation std) to the sample s.
x_noisy = gaussian_noise(s, mu=mu, std=std) 

# Density-normalized histogram of the noisy sample with 30 bins.
count, bins, ignored = plt.hist(x_noisy, 30, density=True)
# The red curve is the normal density built from mu and sigma (not from std or
# from the spread of x_noisy), evaluated at the bin edges.
# NOTE(review): presumably meant as a reference to the noise-free distribution — confirm.
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
               np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
         linewidth=2, color='r')
plt.show()