In [1]:
#imports
import pandas as pd
from datetime import datetime
import functions as fx
import numpy as np
import researchpy as rp
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.diagnostic import het_white

import inspect
from collections import Counter

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

module://matplotlib_inline.backend_inline


In [2]:
sessions = pd.read_csv("export_student_sessions.csv", delimiter=';')
switched = pd.read_csv("export_level_switched_log.csv", delimiter=';')
students = pd.read_csv("export_students.csv", delimiter=';')
users = pd.read_csv("export_users.csv", delimiter=';')
students_filtered = pd.read_csv("students_filtered.csv", delimiter=',')

## Put the sessions of the students in the timeframe in one dataframe 

In [3]:
def str_to_datetime(string):
    return datetime.strptime(string, '%Y-%m-%d %H:%M:%S')


sessions
sessions_2 = pd.DataFrame(columns=['timestamp','id','interval','stars','score','clippy','block_try_counter','student_id','datetime'])
participating_students = sessions[sessions['student_id'].isin(students_filtered['user_id'])]

for stu_id in participating_students['student_id'].unique():
    # take mean of all sessions on same date and time
    sessions_student_x = participating_students[participating_students['student_id']==stu_id].groupby('timestamp').mean()
    # .groupby() takes away the student id, get it back. 
    sessions_student_x['student_id'] = stu_id
    #make the index a column in the dataframe
    sessions_student_x.reset_index(level=0, inplace=True)
    #give a datetime variable
    sessions_student_x['datetime'] = sessions_student_x['timestamp'].apply(str_to_datetime)
    
    sessions_2 = pd.merge(sessions_2,
                    sessions_student_x,
                    how='outer')
    
cutoff = str_to_datetime("2021-10-20 13:00:00")
cutoff

# Keep only rows with a datetime after the start of the experiment (2021-10-20 13:00:00)
sessions_to_analyze = sessions_2[sessions_2['datetime']>cutoff]
print(sessions_2.shape, sessions_to_analyze.shape)

# sessions_to_analyze
print(sessions_to_analyze.shape)

sessions_to_analyze.to_csv("sessions_to_analyze.csv", sep=';')


sessions_to_analyze

(671, 11) (325, 11)
(325, 11)


Unnamed: 0,timestamp,id,interval,stars,score,clippy,block_try_counter,created_at,updated_at,student_id,datetime
1,2021-10-21 10:28:24,35457.738095,41.124648,0.001942,0.611454,0.000000,1.162308,,,227b548a-4e80-4333-9fd8-f783c577475e,2021-10-21 10:28:24
8,2021-10-21 08:17:55,34250.678788,44.721962,3.000000,0.833722,0.000000,1.000000,,,ace9664f-f79c-411d-87d3-4aeaf3962ffb,2021-10-21 08:17:55
9,2021-11-03 18:40:42,131089.000000,103.352941,0.000000,0.750000,0.000000,1.000000,,,ace9664f-f79c-411d-87d3-4aeaf3962ffb,2021-11-03 18:40:42
10,2021-11-04 09:22:00,150719.172619,83.993860,0.000000,0.558061,0.000000,1.000000,,,ace9664f-f79c-411d-87d3-4aeaf3962ffb,2021-11-04 09:22:00
13,2021-11-03 20:50:29,141693.594595,12.076923,2.000000,0.497222,0.000000,1.153846,,,e2179d2e-8989-4f3d-943b-1c7435a6daa6,2021-11-03 20:50:29
...,...,...,...,...,...,...,...,...,...,...,...
666,2021-11-12 13:11:26,203822.000000,58.570261,2.000000,0.612302,0.041667,1.000000,,,a6bda0a3-f162-47d1-9ec7-ef697a014278,2021-11-12 13:11:26
667,2021-11-12 13:26:56,206940.553366,36.853405,0.016260,0.668499,0.039737,1.000000,,,a6bda0a3-f162-47d1-9ec7-ef697a014278,2021-11-12 13:26:56
668,2021-11-12 13:14:13,203826.500000,77.342857,1.000000,0.542604,0.000000,1.171429,,,838c4e69-f68b-47d3-aa07-48e2f4b3f721,2021-11-12 13:14:13
669,2021-11-12 13:35:38,206948.575480,54.757823,0.500000,0.677629,0.000000,1.059957,,,838c4e69-f68b-47d3-aa07-48e2f4b3f721,2021-11-12 13:35:38


# Inspect the data

In [4]:
print("There are {} students in the data that had at least 1 session during the experimental period".format(sessions_to_analyze['student_id'].nunique()))

There are 188 students in the data that had at least 1 session during the experimental period


In [5]:
def txt_to_lvl(txt):
    return {'GYMNASIUM':6, 'HAVO':4, 'HAVO_VWO':5, 'VMBOK':2, 'VMBO_HAVO':3}[txt]

switched.head()
# the initialization is also already in this dataframe
# Drop all the initial switches and keep only switches that happened during the experiment (non-startup phase)
real_switched = switched[switched['from_level'].isin(list(switched['from_level'].value_counts().keys()))].reset_index()

real_switched["from_no"] = real_switched['from_level'].map(txt_to_lvl)
real_switched["to_no"] = real_switched['to_level'].map(txt_to_lvl)

# Indicating if the switch was up or down
updown = []
for row in real_switched.iterrows():
    if txt_to_lvl(row[1]['from_level'])<txt_to_lvl(row[1]['to_level']):
        switch = 1
    elif txt_to_lvl(row[1]['from_level'])>txt_to_lvl(row[1]['to_level']):
        switch = -1
    else:
        print("ERRROR")
    updown.append(switch)
    
# The lowest 'from_no'  is the initial level of the student
initial_lvl = {uid:real_switched[real_switched['user_id']==uid]['from_no'].min() for uid in list(set(real_switched['user_id']))}

real_switched['up_down'] = updown
real_switched.head()


Unnamed: 0,index,id,user_id,from_level,to_level,created_at,updated_at,from_no,to_no,up_down
0,146,147,e5a00790-10d0-4e79-ba66-d9e8836e12ac,VMBO_HAVO,HAVO,,,3,4,1
1,147,148,ff0bfe14-f346-4af8-958a-0debdd764cf4,VMBOK,VMBO_HAVO,,,2,3,1
2,148,149,c3077517-cd98-4b76-801c-eb63ea574305,VMBOK,VMBO_HAVO,,,2,3,1
3,149,150,142f73ec-66b6-461f-83fe-948e4ed37bc7,VMBOK,VMBO_HAVO,,,2,3,1
4,150,151,10d31ad6-94e2-4115-92b8-f399cd7497b7,VMBOK,VMBO_HAVO,,,2,3,1


In [6]:
for sid in switched_ids:
    student = real_switched[real_switched['user_id']==sid].reset_index()
    if len(student) > 6:
        student['from_no'].plot()

    plt.title("Level of Switched Students")
    plt.ylim(0,7)
    plt.xlabel('Number of Switch')
    plt.ylabel('Level of Difficulty')
plt.grid(which='major')    
# plt.legend()
plt.savefig('students_switching_levels.png')

NameError: name 'switched_ids' is not defined

In [None]:
for sid in switched_ids:
    student = real_switched[real_switched['user_id']==sid].reset_index()
    if len(student) == 6:
        student['from_no'].plot()

    plt.title("Calculation of slopes (grey lines are sessions)")
    plt.ylim(0,7)
    plt.xlabel('Number of Switch')
    plt.ylabel('Level of Difficulty')
    plt.vlines(x=[i*(6/8) for i in range(7)], ymin=0, ymax=7, linestyles='dashed', colors='grey')
plt.grid(which='major')    
# plt.legend()
plt.savefig('explain_slope_without_paint.png')

In [None]:
# I need a counter of the i-th session after a switch 
# and what the level is relative to the initial level
sessions = sessions_to_analyze.sort_values(by=['student_id', "timestamp"]).reset_index()
sessions

# Now i am going to make a counter of the I-th session of a student
sess_no = 1
sess_list = [1]

for idx in range(1,len(sessions['student_id'])):
    if sessions['student_id'][idx-1]==sessions['student_id'][idx]:
        sess_no +=1
    elif sessions['student_id'][idx-1]!=sessions['student_id'][idx]:
        sess_no = 1
    sess_list.append(sess_no)   
sessions['session_number'] = sess_list

sessions

In [None]:
# In this cell i need to make the switched counters
sessions[sessions['student_id']=='00cdcdce-737d-4511-b48b-69c0102b7b37']

In [None]:
# Splitting up in control/experiment and switched/not_switched

# student IDs from the control group (=1):
control_ids = list(students_filtered[students_filtered['use_adaptive_academic_level']==1]['user_id'])
# student IDs from the experimental group (=2):
experimental_ids = list(students_filtered[students_filtered['use_adaptive_academic_level']==2]['user_id'])
# student IDs from the students who switched:
switched_ids = list(real_switched['user_id'].unique())
# student IDs from the students who did not switch:
not_switched_ids = control_ids+(list(set(experimental_ids)-set(switched_ids)))

control_df = sessions[sessions['student_id'].isin(control_ids)]
experimental_df = sessions[sessions['student_id'].isin(experimental_ids)]

switched_df = sessions[sessions['student_id'].isin(switched_ids)]
not_switched_df = sessions[sessions['student_id'].isin(not_switched_ids)]


In [None]:
# both partitions contain all the 325 sessions
print(len(control_df)+len(experimental_df)), print(len(switched_df)+len(not_switched_df))

## Making plots to visualize the data

In [None]:
fig, axes = plt.subplots(5, 2, figsize=(10, 10), sharex=True)

fig.suptitle('Development of variables over the sessions', fontsize=20)
axes[0,0].set_title("Switched students",fontsize=15)
axes[0,1].set_title("Not switched students",fontsize=15)

sns.boxplot(ax=axes[0,0], data=switched_df, x='session_number', y='interval')
sns.boxplot(ax=axes[0,1], data=not_switched_df, x='session_number', y='interval')

sns.boxplot(ax=axes[1,0], data=switched_df, x='session_number', y='stars')
sns.boxplot(ax=axes[1,1], data=not_switched_df, x='session_number', y='stars')

sns.boxplot(ax=axes[2,0], data=switched_df, x='session_number', y='score')
sns.boxplot(ax=axes[2,1], data=not_switched_df, x='session_number', y='score')

sns.boxplot(ax=axes[3,0], data=switched_df, x='session_number', y='clippy')
sns.boxplot(ax=axes[3,1], data=not_switched_df, x='session_number', y='clippy')

sns.boxplot(ax=axes[4,0], data=switched_df, x='session_number', y='block_try_counter')
sns.boxplot(ax=axes[4,1], data=not_switched_df, x='session_number', y='block_try_counter')

fig.savefig('SNS_per_variable.png')

In [None]:
fig, axes = plt.subplots(5, figsize=(10,10), sharex=True)

red_patch = mpatches.Patch(color='red', label='Not Switched')
blue_patch = mpatches.Patch(color='tab:blue', label='Switched')

axes[0].legend(handles=[red_patch, blue_patch])

axes[0].tick_params(axis='both', which='both', labelbottom=True)
axes[1].tick_params(axis='both', which='both', labelbottom=True)
axes[2].tick_params(axis='both', which='both', labelbottom=True)
axes[3].tick_params(axis='both', which='both', labelbottom=True)
axes[4].tick_params(axis='both', which='both', labelbottom=True)

y = 'interval'
sns.pointplot(data=switched_df, x='session_number', y=y, ax=axes[0])
sns.pointplot(data=not_switched_df, x='session_number', y=y,color='red', ax=axes[0])

y = 'stars'
sns.pointplot(data=switched_df, x='session_number', y=y,ax=axes[1])
sns.pointplot(data=not_switched_df, x='session_number', y=y,color='red', ax=axes[1])

y = 'score'
sns.pointplot(data=switched_df, x='session_number', y=y,  ax=axes[2])
sns.pointplot(data=not_switched_df, x='session_number', y=y,color='red', ax=axes[2])

y = 'clippy'
sns.pointplot(data=switched_df, x='session_number', y=y, ax=axes[3])
sns.pointplot(data=not_switched_df, x='session_number', y=y,color='red', ax=axes[3])

y = 'block_try_counter'
sns.pointplot(data=switched_df, x='session_number', y=y, ax=axes[4]).set_label('Test')
sns.pointplot(data=not_switched_df, x='session_number', y=y,color='red',ax=axes[4])

fig.savefig('SNS_per_variable.png')


# Checking the overlap between switches and all sessions of the switched students

In [None]:
# These are the sessions of students that have switched
switched_df

# These are the switches that these students have made
real_switches = real_switched[real_switched['user_id'].isin(switched_df['student_id'])]
real_switches['up_down'].value_counts()

# Apparently, there are more students that have made switches than that are in the dataset with switches


In [None]:
def find_gradients(stud_x,y,n, if_one):
        slopes = []
        if min(n,len(stud_x)) == 0:
            pass

        else:
            parts = np.array_split(stud_x, min((n),len(stud_x)))
            for i in range(min(len(parts),n)):
                if list(parts[i])==[]:
                    break
                levels = [y[level] for level in parts[i]]
                slope = round((levels[-1]-levels[0])/len(levels),2)
                slopes.append(slope)
        while len(slopes)<n:
            slopes.append(0.0)
        return slopes

gradients = []

for sid in switched_ids:
    
    stud_x = real_switches[real_switches.user_id==sid].reset_index()
    
    sess_stud_x = switched_df[switched_df['student_id']==sid]
    sess_stud_x

    stud_x.index
    stud_x.from_no  
    
    if len(stud_x): 
        if_one=stud_x['up_down'].iloc[0]
    else:
        if_one = 0
    gradients.append(find_gradients(stud_x.index,stud_x.from_no,len(sess_stud_x),if_one))
    


switched_df['slopes'] = [item for sublist in gradients for item in sublist]
switched_df['slopes'].plot()

switched_df.sort_values(by=['student_id','datetime'])
switched_df

# Visualisations

In [None]:
for sid in switched_ids:
    student = switched_df[switched_df['student_id']==sid].reset_index()
    if len(student) > 1:
        student['stars'].plot()
    plt.title("stars of Switched Students")

In [None]:
for sid in switched_ids:
    student = switched_df[switched_df['student_id']==sid].reset_index()
    if len(student) > 1:
        student['score'].plot()
    plt.title("Score of Switched Students")

In [None]:
for sid in switched_ids:
    student = switched_df[switched_df['student_id']==sid].reset_index()
    if len(student) > 1:
        student['clippy'].plot()
    plt.title("Hints of Switched Students")

In [None]:
for sid in switched_ids:
    student = switched_df[switched_df['student_id']==sid].reset_index()
    if len(student) > 1:
        student['block_try_counter'].plot()
    plt.title("Tries of Switched Students")

##  Correlation Table


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

switched_df[["interval","stars","score","clippy","block_try_counter"]].corr().replace(1,np.nan).style.background_gradient(cmap='Blues')

print(switched_df[["interval","stars","score","clippy","block_try_counter"]].corr().replace(1,np.nan).round(decimals=3).to_latex())

X = switched_df[["interval","stars","score","clippy","block_try_counter"]].dropna()
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [round(variance_inflation_factor(X.values, i),2)
                          for i in range(len(X.columns))]
  
print(vif_data)

In [None]:
not_switched_df[["interval","stars","score","clippy","block_try_counter"]].corr().replace(1,np.nan).style.background_gradient(cmap='Blues')

print(not_switched_df[["interval","stars","score","clippy","block_try_counter"]].corr().replace(1,np.nan).round(decimals=3).to_latex())

X = not_switched_df[["interval","stars","score","clippy","block_try_counter"]].dropna()
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [round(variance_inflation_factor(X.values, i),2)
                          for i in range(len(X.columns))]
  
print(vif_data['VIF'])

## DF is in the right format, now going to perform the MLRegressions
https://www.pythonfordatascience.org/mixed-effects-regression-python/

In [None]:
not_switched_df['slopes'] = 0;
switched_df.columns
switched_df['group'] = 1
not_switched_df['group']= 0
all_sessions = pd.concat([switched_df,not_switched_df], axis=0).fillna(0)
all_sessions

In [None]:
all_sessions['session_number'].value_counts().plot()

In [None]:
all_sessions.head()

In [None]:
# visual indication of how the sessions are divided
print("group 1 is switched and group 0 is not switched \n X-axis is the slope indicating a higher/lower difficulty for the student")
all_sessions.plot.scatter('slopes','interval', c='group', cmap='bwr')
all_sessions.plot.scatter('slopes','stars', c='group', cmap='bwr')
all_sessions.plot.scatter('slopes','score', c='group', cmap='bwr')
all_sessions.plot.scatter('slopes','clippy', c='group', cmap='bwr')
all_sessions.plot.scatter('slopes','block_try_counter', c='group', cmap='bwr')

In [None]:
def show_summary(var, df):
    # Now to take a look at the interval of the students based on the group and the slopes.
    df = rp.summary_cont(df.groupby(["group", "slopes"])[var])
    print(df.to_latex())
    return df

def viz_distributions(var, df):
    """visualize the distributions in a boxplot"""
    # Lets visualize the distribution of interval by group and slope
    boxplot = df.boxplot([var],by=['group','slopes'],
                         figsize = (9, 9),
                         showmeans = True,
                         notch = True)
    boxplot.figure.savefig("{}_boxplot.png".format(var))

def fit_model(var, df):
    # Now fit a random intercept model, 
    # recall that this type of model allows for different clusters (a group) to have different intercepts

    # the vc_formula is the formula for the variance components, this is used because the slopes are uncorrelated to the 
    # intercepts
    md = smf.mixedlm("{} ~ C(group)+session_number+slopes".format(var),
                     df,
                     groups=df['student_id'],
                    vc_formula = {'group':"0+C(group)"}).fit()

    print(md.summary())
    return md

def check_normality(md):
    # Lets check for normality 
    fig,(ax1,ax2) = plt.subplots(2,figsize = (9, 9))
    sns.distplot(md.resid, hist = False, kde_kws={'shade':True, 'lw':1}, fit=stats.norm,ax=ax1)
    ax1.set_title("KDE Plot of Model Residuals (Blue) and Normal Distribution (Black)")
    ax1.set_xlabel("Residuals")

    ## Q-Q PLot
    fig = plt.figure(figsize = (16, 9))
    sm.qqplot(md.resid, dist = stats.norm, line = 's', ax = ax2)
    ax2.set_title("Q-Q Plot")

    # SK-test for normality
    labels = ["Statistic (SK-test)", "p-value"]
    norm_res = stats.shapiro(md.resid)

    for key, val in dict(zip(labels, norm_res)).items():
        print(key, val)
    if norm_res[1]<0.05:
        print("the test is significant thus normality of residuals is violated")
    else:
        print('the test is non-significant thus normality of residuals is supported')
        
def check_homoskedasticity(md):
    # Lets check for homoskedasticity
    fig, (ax1,ax2) = plt.subplots(2,figsize = (9, 9))

    sns.scatterplot(y = md.resid, x = md.fittedvalues,ax=ax1)

    ax1.set_title("RVF Plot")
    ax1.set_xlabel("Fitted Values")
    ax1.set_ylabel("Residuals")

    fig = plt.figure(figsize = (16, 9))

    sns.boxplot(x = md.model.groups, y = md.resid, ax=ax2)

    ax2.set_title("Distribution of Residuals for Interval by student")
    ax2.set_ylabel("Residuals")
    ax2.set_xlabel("Student")

    het_white_res = het_white(md.resid, md.model.exog)

    labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]

    for key, val in dict(zip(labels, het_white_res)).items():
        print(key, val)
        
def run_analysis(var, df):
    print(show_summary(var, df))
    viz_distributions(var, df)
    print('\n')
    md = fit_model(var, df)
    check_normality(md)
    print('\n')
    check_homoskedasticity(md)
    


In [None]:
without_outliers

In [None]:
variables = ['interval','score','stars','block_try_counter','clippy']

for var in variables:
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n",var)
    md = fit_model(var, all_sessions)
#     print(md.summary().as_latex())
#     print(type(md))

In [None]:
all_sessions.student_id.value_counts().mean()

In [None]:
run_analysis('interval', all_sessions)
# Assumptions violated

In [None]:
run_analysis('score', all_sessions)
# Assumptions violated

In [None]:
run_analysis('clippy', all_sessions)
# Assumptions violated

In [None]:
run_analysis('stars', all_sessions)
# Assumptions violated
# Looks like a Chi-Square Distribution --> 

In [None]:
run_analysis('block_try_counter', all_sessions)
# Assumptions violated

## normality and homoskedasticity is rejected in all variables
I suspect that this happens because there are a lot of students that have a too little sessions in this data


In [None]:
# See how much students there are with multiple sessions:
all_sessions.student_id.value_counts().value_counts()

# These are the students with more than 3 sessions in the experimental period:
oft_switch_ids = [sid for sid in  all_sessions.student_id.value_counts().keys() if all_sessions.student_id.value_counts()[sid] > 2]
print("there are {} students in the dataset with 3 or more sessions".format(len(oft_switch_ids)))

# This is a dataframe where the students with more than 3 sessions are in 
mult_switch  = all_sessions[all_sessions.student_id.isin(oft_switch_ids)]

# The switched sessions (60) and non switched sessions (50) are nicely divided
mult_switch.group.value_counts()

## Lets run the analysis on students that have 3 or more sessions in the data

In [None]:
run_analysis('interval',mult_switch)
# Assumptions violated

In [None]:
run_analysis('score',mult_switch)
# Here, the data seems good

In [None]:
run_analysis('stars',mult_switch)
# Assumptions violatedv

In [None]:
run_analysis('block_try_counter',mult_switch)
# Assumptions violated

In [None]:
run_analysis('clippy',mult_switch)
# Assumptions violated

## I was wrong, there is insufficient data when filtering on students that only have 3 or more sessions.
This could be due to a sampling error or insufficient data

#### Conclusion
without the outliers, with the sqrt of score, and the central limit theorem i get the best results:

https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_probability/BS704_Probability12.html

In [None]:
all_sessions[variables]


In [None]:
# I want to remove all 'outliers' that er further than 2 SDs from the mean (within th =95% interval)
without_outliers = all_sessions[(np.abs(stats.zscore(all_sessions[variables])) < 2).all(axis=1)]
# These seem to be a bit better, nevertheless all are violated (but in the order of .02 or .03 instead of 10^-6)
# Here the values are close to normality but since the sample size is quite small, i rely on the central limit theorem (not a 
# strong argument but one that holds)

# https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_probability/BS704_Probability12.html

without_outliers

In [None]:
# run_analysis('interval', without_outliers)
# Trying with log/sqrt/reciprocal transformation ==> did not work
# without_outliers['log_int'] = without_outliers['interval'].apply(np.reciprocal)
# run_analysis('log_int', without_outliers)

# Keeping the normal interval variable and stick with it
run_analysis('interval', without_outliers)

In [None]:
# run_analysis('score', without_outliers)

# Trying with log/sqrt/reciprocal transformation ==> sqrt is best!
without_outliers['sqrt_score'] = without_outliers['score'].apply(np.sqrt)
run_analysis('sqrt_score', without_outliers)

In [None]:
run_analysis('stars', without_outliers)

# Trying with log/sqrt/reciprocal transformation ==> is not going to work, sticking with the original
# without_outliers['log_stars'] = without_outliers['stars'].apply(np.reciprocal)
# run_analysis('log_stars', without_outliers)

In [None]:
run_analysis('block_try_counter', without_outliers)

# Trying with log/sqrt/reciprocal transformation ==> is not going to work, sticking with the original
# without_outliers['log_btc'] = without_outliers['block_try_counter'].apply(np.log)
# run_analysis('log_btc', without_outliers)

In [None]:
run_analysis('clippy', without_outliers)

# Trying with log/sqrt/reciprocal transformation ==> is not going to work, sticking with the original
# without_outliers['log_clippy'] = without_outliers['clippy'].apply(np.reciprocal)
# run_analysis('log_clippy', without_outliers)