In [23]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install bokeh



In [None]:
import pandas as pd
from copy import copy
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
import datetime as dt
import numpy as np
import itertools
from statistics import median

In [12]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

from bokeh.models import Range1d

output_notebook()

# Load Data 

In [13]:
df = pd.read_csv("~\\Documents\\Data\\toy_models\\automateusz_minimal_big\\automateusz_minimal.csv")
#get kpi_axis
kpi_axis = 'kpi'
time_axis = 'time'

df[time_axis] = pd.to_datetime(df[time_axis],format = '%d/%m/%Y')
df.head()

Unnamed: 0,time,system,main_activity,kpi,income,married,psyche,user_age,user_country,user_id
0,2019-09-08,win-pc,flying_pigeon,52.026762,medium,no,Geppetto,21-24,france,0
1,2019-09-08,android-mob,magical_fairy,30.206628,low,no,Pinocchio,21-24,poland,1
2,2019-09-08,android-mob,magical_fairy,55.152838,low,no,Pinocchio,18-21,france,2
3,2019-09-08,ios-mob,magical_fairy,49.997715,low,yes,Mangiafuoco,30-35,germany,3
4,2019-09-08,android-tv,magical_fairy,71.797611,low,no,Alidohro,18-21,poland,4


# Generate one-hot encodings

In [14]:
y_before =  df[df[time_axis] <= '2019-09-11'][kpi_axis]
y_after  =  df[df[time_axis]  > '2019-09-11'][kpi_axis]

#one-hot encoding categorical features
for col in df.drop([kpi_axis,time_axis],axis=1).columns:
    one_hot = pd.get_dummies(df[col]) 
    df = df.drop(col,axis = 1) 
    df = df.join(one_hot) 

df.head()

Unnamed: 0,time,kpi,android-mob,android-tv,ios-mob,ios-pc,win-pc,becoming_human,donkey_transformation,flying_pigeon,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,2019-09-08,52.026762,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2019-09-08,30.206628,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-09-08,55.152838,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2019-09-08,49.997715,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2019-09-08,71.797611,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Fit regression model left and right

In [15]:
X_before =  df[df[time_axis] <= '2019-09-11'].drop([kpi_axis,time_axis],axis = 1).to_numpy()
X_after  =  df[df[time_axis] >  '2019-09-11'].drop([kpi_axis,time_axis],axis = 1).to_numpy()

features = df.drop([kpi_axis,time_axis],axis = 1).columns

#training left and right
regression_model_before = LassoCV(cv = 2)
regression_model_after = LassoCV(cv = 2)

regression_model_before.fit(X_before,y_before)
regression_model_after.fit(X_after,y_after)

#plotting results
features = df.drop([kpi_axis,time_axis],axis = 1).columns
dweights = regression_model_after.coef_ - regression_model_before.coef_ 
index = np.argsort(-abs(dweights))
x_axis = features[index[0:3]].to_list()

p = figure(x_range=x_axis,title = "Feature weights difference",plot_width=1000)
p.vbar(x=x_axis, top=(abs(dweights[index[0:3]])),width = 0.8)
show(p)

#export to csv
w = list(abs(dweights[index[0:3]]))
pd.DataFrame(np.array([w,x_axis]).T, columns=['feature_weights_differences','features']).to_csv('linear_regression_weight_differences.csv',index=False)

# Generate plots for mean KPIs

In [16]:
#mean kpi

#data
time =[]
mean_y=[]
for d in range(8,18):
    if d < 10:
        day_str = '2019'+'-09' +'-0' +str(d)
    else:
        day_str = '2019'+'-09' +'-' +str(d)
    mean_y.append(np.mean(df[df[time_axis] == day_str ][kpi_axis]))
    time.append(day_str)
    
#single model
time =[]
mean_y_single=[]
for d in range(8,18):
    if d < 10:
        day_str = '2019'+'-09' +'-0' +str(d)
    else:
        day_str = '2019'+'-09' +'-' +str(d)
    print(day_str)
    x = df[df[time_axis] == day_str ].drop([kpi_axis,time_axis],axis = 1).to_numpy()
    mean_y_single.append(np.mean(regression_model_before.predict(x)))
    time.append(day_str)
    

#two models
time =[]
mean_y_two=[]
for d in range(8,18):
    if d < 10:
        day_str = '2019'+'-09' +'-0' +str(d)
    else:
        day_str = '2019'+'-09' +'-' +str(d)
    x = df[df[time_axis] == day_str ].drop([kpi_axis,time_axis],axis = 1).to_numpy()
    if d<=11:
        mean_y_two.append(np.mean(regression_model_before.predict(x)))
    else:
        mean_y_two.append(np.mean(regression_model_after.predict(x)))
    time.append(day_str)

p = figure(title = 'KPI over time')
p.line(np.arange(0,len(mean_y)),mean_y,color="black",line_width=2)
p.square(np.arange(0,len(mean_y)),mean_y,size=10,color="black",alpha = 1)
show(p)    

p = figure(title = 'KPI over time')
p.line(np.arange(0,len(mean_y)),mean_y,color="black",line_width=2)
p.square(np.arange(0,len(mean_y)),mean_y,size=10,color="black",alpha = 1)
p.line(np.arange(0,len(mean_y)),mean_y_single,color="red",line_width=2,line_dash ='dashed')
p.square(np.arange(0,len(mean_y)),mean_y_single,size=10,color="red",alpha = 1)
show(p)    
    
p = figure(title = 'KPI over time')
p.line(np.arange(0,len(mean_y)),mean_y,color="black",line_width=2)
p.square(np.arange(0,len(mean_y)),mean_y,size=10,color="black",alpha = 1)
p.line(np.arange(0,len(mean_y)),mean_y_two,color="red",line_width=2,line_dash ='dashed')
p.square(np.arange(0,len(mean_y)),mean_y_two,size=10,color="red",alpha = 1)
show(p)    
    
#export to csv
pd.DataFrame(np.array([time,mean_y_single,mean_y_two,mean_y]).T, columns=['time','single_model_pred','two_models_pred','true']).to_csv('regression_model_predictions.csv',index=False)

2019-09-08
2019-09-09
2019-09-10
2019-09-11
2019-09-12
2019-09-13
2019-09-14
2019-09-15
2019-09-16
2019-09-17


# Define function for one-hot encoding higher-order features

In [17]:
def binarize(df,cols,kpi_axis,time_axis,order):
    cols = cols.drop([kpi_axis,time_axis])
    features = []
    for k in range(0,order):
        features.append(cols)
    fs = []
    for f in itertools.product(*features):
      #  list(set(f)).sort()
        f = np.unique(f)
        fs.append(tuple(f))
    fs = tuple(set(i for i in fs))
    print(fs)
    for f in fs:
        print(len(f))
        states =[]
        for d in f:
            states.append(tuple(set(df[d].astype('category'))))
        for state in itertools.product(*states):
            z = 1
            name = str()
            for d in range(0,len(f)):
                z = z*df[f[d]]==state[d]
                name +=  f[d] + " == " +str(state[d])
                if d<len(f)-1:
                   name += " AND "
            df[name] = z
         

    for d in cols:
        df = df.drop([d],axis = 1)
    return df

# Load data again

In [20]:
df = pd.read_csv("~\\Documents\\Data\\toy_models\\automateusz_minimal_big\\automateusz_minimal.csv")
df[time_axis] = pd.to_datetime(df[time_axis],format = '%d/%m/%Y')
df.head()

Unnamed: 0,time,system,main_activity,kpi,income,married,psyche,user_age,user_country
0,2019-09-08,win-pc,flying_pigeon,52.026762,medium,no,Geppetto,21-24,france
1,2019-09-08,android-mob,magical_fairy,30.206628,low,no,Pinocchio,21-24,poland
2,2019-09-08,android-mob,magical_fairy,55.152838,low,no,Pinocchio,18-21,france
3,2019-09-08,ios-mob,magical_fairy,49.997715,low,yes,Mangiafuoco,30-35,germany
4,2019-09-08,android-tv,magical_fairy,71.797611,low,no,Alidohro,18-21,poland


# Fit regression model left and right with high-order features

In [21]:
#get kpi_axis
kpi_axis = 'kpi'
time_axis = 'time'
y_before =  df[df[time_axis] <= '2019-09-11'][kpi_axis]
y_after  =  df[df[time_axis]  > '2019-09-11'][kpi_axis]

#one-hot encoding categorical features
df = binarize(df,df.columns,kpi_axis,time_axis,3)

X_before =  df[df[time_axis] <= '2019-09-11'].drop([kpi_axis,time_axis],axis = 1).to_numpy()
X_after  =  df[df[time_axis] >  '2019-09-11'].drop([kpi_axis,time_axis],axis = 1).to_numpy()

features = df.drop([kpi_axis,time_axis],axis = 1).columns

#training left and right
regression_model_before = LassoCV(cv = 2)
regression_model_after = LassoCV(cv = 2)

regression_model_before.fit(X_before,y_before)
regression_model_after.fit(X_after,y_after)

#plotting results
features = df.drop([kpi_axis,time_axis],axis = 1).columns
dweights = regression_model_after.coef_ - regression_model_before.coef_ 
index = np.argsort(-abs(dweights))
x_axis = features[index[0:3]].to_list()

p = figure(x_range=x_axis,title = "Feature weights difference",plot_width=1000)
p.vbar(x=x_axis, top=(abs(dweights[index[0:3]])),width = 0.8)
show(p)

#export to csv
w = list(abs(dweights[index[0:3]]))
pd.DataFrame(np.array([w,x_axis]).T, columns=['feature_weights_differences','features']).to_csv('linear_regression_weight_differences_third_order.csv',index=False)

(('married', 'user_country'), ('income', 'system', 'user_country'), ('main_activity',), ('user_age',), ('psyche', 'system', 'user_age'), ('income', 'main_activity', 'system'), ('income', 'user_age'), ('income', 'main_activity'), ('income', 'married', 'psyche'), ('main_activity', 'psyche', 'system'), ('income', 'psyche', 'user_age'), ('main_activity', 'married', 'user_age'), ('system', 'user_country'), ('psyche', 'user_age', 'user_country'), ('system', 'user_age', 'user_country'), ('main_activity', 'user_age'), ('married', 'psyche', 'user_age'), ('system',), ('married', 'psyche'), ('income', 'system'), ('income', 'psyche', 'system'), ('main_activity', 'married', 'system'), ('income', 'main_activity', 'user_country'), ('income', 'married', 'user_age'), ('income',), ('main_activity', 'system'), ('main_activity', 'psyche', 'user_country'), ('user_country',), ('psyche', 'system', 'user_country'), ('psyche', 'user_age'), ('married', 'psyche', 'system'), ('income', 'main_activity', 'married')

# CATE for feature extraction

In [15]:
df = pd.read_csv("~\\Documents\\Data\\toy_models\\automateusz_minimal_big\\automateusz_minimal_big.csv")
df[time_axis] = pd.to_datetime(df[time_axis],infer_datetime_format=True)

#get kpi_axis
kpi_axis = 'kpi'
time_axis = 'time'
y_before =  df[df[time_axis] <= '2019-09-11'][kpi_axis]
y_after  =  df[df[time_axis]  > '2019-09-11'][kpi_axis]


df = binarize(df,df.columns,kpi_axis,time_axis,3)
df_before = df[df[time_axis] <= '2019-09-11']
df_after  = df[df[time_axis] > '2019-09-11']
features = copy(df.drop([time_axis,kpi_axis], axis=1).columns)

K = 3 #number of subgroups to detect
subgroups=[]
score=[]
for k in range(0,K):
    CATE = []
    y_before = df_before[kpi_axis]
    y_after= df_after[kpi_axis]
    
    #compute CATEs for all subgroups
    for d in features:
        g = df_before[d] == True
        m_before = np.mean(y_before[g])
        g = df_after[d] == True
        m_after = np.mean(y_after[g])
        CATE.append(m_after-m_before)
    
    #find subgroup with biggest CATE
    index = np.argsort(-abs(np.array(CATE)))
    subgroups.append(features[index[0]])
    score.append(abs( CATE [index[0]]))
    
    #remove found subgroups from dataset
    df_before = df_before[df_before[features[index[0]]] == False]
    df_after = df_after[df_after[features[index[0]]] == False] 
    features = features.drop(features[index[0]])
    
p = figure(x_range=subgroups,title = "Conditional Average Treatment Effect",plot_width=1200,)
p.vbar(x=subgroups, top=score,width = 0.8,color='black')
show(p)    

#export to csv
pd.DataFrame(np.array([score,subgroups]).T, columns=['CATE','features']).to_csv('CATE_scores.csv',index=False)


(('income', 'system', 'user_age'), ('main_activity', 'married', 'user_country'), ('main_activity', 'system', 'user_age'), ('married', 'system', 'user_age'), ('income', 'psyche', 'user_country'), ('income', 'main_activity', 'user_age'), ('income',), ('income', 'user_age', 'user_country'), ('income', 'married', 'psyche'), ('married', 'user_country'), ('psyche',), ('main_activity', 'psyche', 'user_country'), ('main_activity', 'user_age', 'user_country'), ('income', 'psyche'), ('main_activity', 'system'), ('married', 'psyche', 'user_country'), ('system', 'user_age'), ('married', 'user_age', 'user_country'), ('main_activity', 'married', 'psyche'), ('psyche', 'system', 'user_country'), ('psyche', 'user_country'), ('user_age', 'user_country'), ('income', 'system', 'user_country'), ('income', 'married', 'system'), ('main_activity',), ('system',), ('income', 'main_activity'), ('main_activity', 'user_age'), ('married', 'psyche'), ('income', 'system'), ('married', 'system', 'user_country'), ('inc