In [None]:
import pandas as pd
pd.options.display.max_rows=6
import json
import re
import sys

import logging
# Route the progress messages emitted by the long loops below to a log file,
# each line prefixed with a timestamp.
# NOTE(review): absolute, user-specific path -- consider deriving it from a
# configurable base directory.
logging.basicConfig(filename='/home/jubauser1/jzou/dcm_account7252/user_path/model/path_permutation.log', 
                    level=logging.INFO, 
                    format='%(asctime)s %(message)s')

def extract_activity_count(s):
    '''Split one path step into its activity name and its count.

    Args:
        s: str, eg. 'impr(23)'
    Returns:
        tuple, (str, int), eg. ('impr', 23)
    Raises:
        ValueError: if s does not start with '<activity>(<digits>)'.
    '''
    # Raw string: '\(' and '\d' are regex escapes, not Python string escapes.
    m = re.match(r'(.*)\((\d+)\)', s)
    if m is None:
        # Without this guard the failure surfaced as an AttributeError on None.
        raise ValueError('malformed path step: %r' % (s,))
    return m.group(1), int(m.group(2))

def path_to_list(s):
    '''
    Reduce a path string to its ordered activity names, dropping the counts.

    Args:
    s: str, path str, eg. 'impr(1) -> click(2) -> hva(3)'
    Returns:
    list of str, eg. ['impr', 'click', 'hva']
    '''
    activities = []
    for step in s.split(' -> '):
        name, _count = extract_activity_count(step)
        activities.append(name)
    return activities

def path_str_permutaion_counts(s, permutation):
    '''Count each activity of a path in the order given by permutation.

    The path is walked left to right. Each step is written into the first
    slot of permutation, at or after the current position, that holds the same
    activity name. Slots that are skipped keep a count of 0.

    Args:
    s: str, eg. impr(1) -> click(2) -> hva(3)
    permutation: list, eg. [impr, click, impr, hva, transaction]

    Returns:
    list of int with one entry per permutation slot, eg. [1, 2, 0, 3, 0]

    Raises:
    ValueError: if the activities of s do not occur, in order, in permutation.
    '''
    n_slots = len(permutation)
    result = [0] * n_slots
    i = 0  # first slot that has not been consumed or skipped yet
    for x in s.split(' -> '):
        act, count = extract_activity_count(x)
        # Bounded scan: the unbounded version ran past the end of the list and
        # failed with a bare IndexError that gave no hint about the cause.
        while i < n_slots and permutation[i] != act:
            i += 1
        if i == n_slots:
            raise ValueError(
                'activity %r of path %r has no remaining slot in the permutation'
                % (act, s))
        result[i] = count
        i += 1
    return result
        
# Sanity check: 'impr' skips the leading 'click' slot and 'hva' skips the second
# 'impr' slot, so the expected result is [0, 1, 2, 0, 3, 0].
path_str_permutaion_counts('impr(1) -> click(2) -> hva(3)', ['click', 'impr', 'click', 'impr', 'hva', 'transaction'])

In [None]:
# Load the user paths. Later cells read the 'path', 'group' and 'user_id'
# columns of this frame.
# Alternative input file, kept for reference:
#user_path = pd.read_csv('/home/jubauser1/jzou/dcm_account7252/user_path/model/user_path.csv')
user_path = pd.read_csv('/home/jubauser1/jzou/dcm_account7252/user_path/model/user_path_added_branded_nonbranded_no_hva.csv')
user_path

In [None]:
user_path['group'].value_counts()

In [None]:
# Build the modelling sample: every row of groups 1-3 plus a fixed-seed random
# sample of 100000 rows from group 4, stacked in group order.
# A single pd.concat replaces the chained DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0) and always returns a new frame.
group_frames = [
    user_path[user_path['group'] == 1],
    user_path[user_path['group'] == 2],  # .sample(n=4000, random_state=0)
    user_path[user_path['group'] == 3],  # .sample(n=4000, random_state=0)
    user_path[user_path['group'] == 4].sample(n=100000, random_state=0),
]
sampled_user_path = pd.concat(group_frames)

# User ids excluded from the sample.
# NOTE(review): the reason for excluding them is not recorded in the notebook.
excluded_user_ids = {
    'AMsySZYWhOOVUVUkDHI3Jh0S0FdR',
    'AMsySZa0slNn24JQM3dKVY-sh6P1',
    'AMsySZb3VlBhjHZk0zEjIj_9ApoG',
}
sampled_user_path = sampled_user_path[~sampled_user_path['user_id'].isin(excluded_user_ids)]

# Renumber rows 0..n-1. Assigning the result (instead of inplace=True on a
# filtered frame) keeps the cell safe to re-run.
sampled_user_path = sampled_user_path.reset_index(drop=True)

sampled_user_path

In [None]:
# Binary targets derived from 'group': column Mx holds 1 for the groups that
# count as positives for model x and 0 for the others.
# M4 uses the same table as M2; the "4 models" cell drops the group 3 rows
# before fitting m4.
label_tables = (
    ('M1', {1: 1, 2: 0, 3: 0, 4: 0}),
    ('M2', {1: 1, 2: 1, 3: 0, 4: 0}),
    ('M3', {1: 1, 2: 1, 3: 1, 4: 0}),
    ('M4', {1: 1, 2: 1, 3: 0, 4: 0}),
)
for target_column, group_to_label in label_tables:
    # A plain lookup, so an unexpected group value still raises KeyError.
    sampled_user_path[target_column] = sampled_user_path['group'].apply(
        lambda group, table=group_to_label: table[group])

In [None]:
# Persist the labelled sample; index=False leaves the row index out of the file.
sampled_user_path.to_csv('/home/jubauser1/jzou/dcm_account7252/user_path/model/sampled_user_path.csv', index=False)

# path permutation

In [None]:
# Add the activity sequence and its length for every row, order the rows from
# shortest to longest path, and renumber them.
user_path = (
    user_path
    .assign(path_list=user_path['path'].apply(path_to_list))
    .assign(length=lambda frame: frame['path_list'].apply(len))
    .sort_values(by='length')
    .reset_index(drop=True)
)

# keep only users whose path has fewer than 300 steps
user_path = user_path[user_path['length'] < 300]

In [None]:
user_path

In [None]:
# Build the path permutation: one activity sequence that contains every user's
# path as an ordered subsequence.
all_path = list(user_path['path_list'])
path_permutation = list()
processed = 0
for activities in all_path:
    if not activities:
        continue
    # Greedy left-to-right match of this path against the permutation so far.
    matched = 0
    for slot in path_permutation:
        if slot == activities[matched]:
            matched += 1
            if matched == len(activities):
                break
    # Steps that found no slot are appended at the end.
    path_permutation.extend(activities[matched:])
    processed += 1
    logging.info('%d: %d'%(processed, len(path_permutation)))

In [None]:
set(path_permutation), len(path_permutation)

In [None]:
# Remove every 'transaction' slot and put a single one at the very end; the
# list-based count cell slices that last slot off with result[:-1].
path_permutation = [act for act in path_permutation if act != 'transaction'] + ['transaction']

In [None]:
# save path permutation
# The with-block closes the file, so the JSON is fully flushed to disk before
# any later cell reads it back.
with open('/home/jubauser1/jzou/dcm_account7252/user_path/model/path_permutation.json', 'w') as fout:
    json.dump(path_permutation, fout)

In [None]:
# Same permutation written under the "no_hva" file name; the with-block
# closes (and flushes) the file handle.
with open('/home/jubauser1/jzou/dcm_account7252/user_path/model/path_permutation_no_hva.json', 'w') as fout:
    json.dump(path_permutation, fout)

In [None]:
# Reload the saved permutation, closing the file handle once it is parsed.
# NOTE(review): this reads path_permutation.json, not the *_no_hva.json file
# written by the neighbouring cell -- confirm which one is intended.
with open('/home/jubauser1/jzou/dcm_account7252/user_path/model/path_permutation.json', 'r') as fin:
    path_permutation = json.load(fin)

In [None]:
set(path_permutation)

# count by permutation

In [None]:
sampled_user_path

In [None]:
len(path_permutation)

In [None]:
# json
# One list of per-slot counts for each sampled user. The last permutation slot
# is sliced off ('transaction', when the permutation comes from the cells above).
count_by_permutation = list()
for index, pth in sampled_user_path['path'].items():
    logging.info(index)
    counts = path_str_permutaion_counts(pth, path_permutation)
    count_by_permutation.append(counts[:-1])

In [None]:
len(count_by_permutation[0])

In [None]:
# dataframe
# Per-slot counts for every row of user_path, one column per permutation slot.
# The rows are collected in a list and the frame is built once: growing a frame
# with DataFrame.append inside the loop copies it on every iteration, and
# DataFrame.append was removed in pandas 2.0.
# The frame gets its own name so that it no longer overwrites the list
# `count_by_permutation`, which the json.dump and model cells below use as a list.
count_rows = []
for index in user_path.index:
    logging.info(index)
    pth = user_path.loc[index, 'path']
    count_rows.append(path_str_permutaion_counts(pth, path_permutation))
count_by_permutation_df = pd.DataFrame(count_rows, columns=range(len(path_permutation)))

In [None]:
# Save the list-of-lists feature matrix; the with-block closes (and flushes)
# the file handle.
with open('/home/jubauser1/jzou/dcm_account7252/user_path/model/count_by_permutation_no_hva.json', 'w') as fout:
    json.dump(count_by_permutation, fout)

In [None]:
# Size in bytes of the object itself; when count_by_permutation is a list this
# does not include the row lists it references.
sys.getsizeof(count_by_permutation)

# fit model

In [46]:
from sklearn import linear_model, model_selection, metrics, feature_selection
import pickle

In [48]:
len(count_by_permutation)

195313

In [47]:
sampled_user_path['group'].value_counts()

4    100000
3     69674
2     21679
1      3960
Name: group, dtype: int64

### m1

In [None]:
Y1 = list(sampled_user_path['M1']) 
len(Y1)

In [None]:
X_train_1, X_test_1, y_train_1, y_test_1 = model_selection.train_test_split(count_by_permutation, Y1, random_state=0)

In [None]:
m1 = linear_model.LogisticRegression()

In [None]:
m1.fit(X_train_1, y_train_1)

In [None]:
m1.score(X_train_1, y_train_1)

In [None]:
# Hard 0/1 predictions on the held-out split, scored with R^2.
# NOTE(review): r2_score is a regression metric; for a classifier, metrics such
# as accuracy, precision/recall or ROC AUC are the usual choice -- confirm that
# R^2 is really what should be reported here.
y1_pred = m1.predict(X_test_1)
metrics.r2_score(y_test_1, y1_pred)

In [None]:
# Univariate F-statistic and p-value of every feature column against the label,
# computed on the test split.
# NOTE(review): f_regression is meant for a continuous target;
# feature_selection.f_classif is the classification counterpart -- confirm.
f, p = feature_selection.f_regression(X_test_1, y_test_1, center=True)

In [None]:
# Three-row table: permutation slot name, F-value and p-value per column
# (rows 0, 1 and 2). Shorter rows are padded with NaN.
# Built with one constructor call, because DataFrame.append was removed in
# pandas 2.0.
f_p = pd.DataFrame([pd.Series(path_permutation), pd.Series(f), pd.Series(p)])
f_p

In [None]:
f_p.index = ['path_permutation', 'F-Values', 'P-Values']

In [None]:
f_p

In [None]:
len(f)

In [None]:
len(p)

In [None]:
df = pd.DataFrame({'y_test': y_test_1, 'y_pred': y1_pred})
df

In [None]:
# Actual-vs-predicted table: one row per actual class, holding the class size
# and how many of its rows were predicted 0 and 1.
gain_rows = []
for actual, row_label in ((0, 'y_actual_0'), (1, 'y_actual_1')):
    is_actual = df['y_test'] == actual
    gain_rows.append({
        'sample_size': len(df[is_actual]),
        'y_pred_0': len(df[is_actual & (df['y_pred'] == 0)]),
        'y_pred_1': len(df[is_actual & (df['y_pred'] == 1)]),
        'y_actual': row_label,
    })

gain_chart = pd.DataFrame(gain_rows).set_index(keys='y_actual')

gain_chart

In [None]:
gain_chart

In [None]:
# Persist the fitted model; the with-block closes (and flushes) the file handle.
with open('/home/jubauser1/jzou/dcm_account7252/user_path/model/m1.sav', 'wb') as fout:
    pickle.dump(m1, fout)

# 4 models

In [49]:
len(list(sampled_user_path[sampled_user_path['group'] != 3]['M4']) )

125639

In [50]:
# Fit one logistic regression per target (m1..m4). For each model, print the
# test-split R^2 and save the F/p-value table, the gain chart and the pickled
# model.

# m4 is trained without the group 3 rows. A row-wise mask replaces the old
# [:left_index] + [right_index+1:] slicing, which was only correct while the
# group 3 rows formed one contiguous block of index labels.
# count_by_permutation is row-aligned with sampled_user_path (it was built by
# iterating over its index in order).
not_group_3 = sampled_user_path['group'] != 3

X = {
    'm1': count_by_permutation,
    'm2': count_by_permutation,
    'm3': count_by_permutation,
    'm4': [row for row, keep in zip(count_by_permutation, not_group_3) if keep],
}

Y = {
    'm1': list(sampled_user_path['M1']),
    'm2': list(sampled_user_path['M2']),
    'm3': list(sampled_user_path['M3']),
    'm4': list(sampled_user_path.loc[not_group_3, 'M4']),
}

for mx in ['m1', 'm2', 'm3', 'm4']:
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X[mx], Y[mx], random_state=0)
    m = linear_model.LogisticRegression()
    m.fit(X_train, y_train)

    # r_square
    # NOTE(review): r2_score is a regression metric; on hard 0/1 predictions of
    # an imbalanced target it can be negative. Consider a classification
    # metric (accuracy, precision/recall, ROC AUC).
    y_pred = m.predict(X_test)
    r2 = metrics.r2_score(y_test, y_pred)
    print('%s: %f'%(mx, r2))

    # F and P value
    # NOTE(review): f_regression is meant for a continuous target; f_classif
    # is the classification counterpart -- confirm which is intended.
    f_values, p_values = feature_selection.f_regression(X_test, y_test, center=True)
    # Rows 0/1/2 = slot name / F-value / p-value. Built with one constructor
    # call, because DataFrame.append was removed in pandas 2.0.
    f_p = pd.DataFrame([pd.Series(path_permutation),
                        pd.Series(f_values),
                        pd.Series(p_values)])

    # gain_chart: one row per actual class with its predicted-0 / predicted-1 counts
    df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    gain_rows = []
    for actual, row_label in ((0, 'y_actual_0'), (1, 'y_actual_1')):
        is_actual = df['y_test'] == actual
        gain_rows.append({
            'sample_size': len(df[is_actual]),
            'y_pred_0': len(df[is_actual & (df['y_pred'] == 0)]),
            'y_pred_1': len(df[is_actual & (df['y_pred'] == 1)]),
            'y_actual': row_label,
        })
    gain_chart = pd.DataFrame(gain_rows).set_index(keys='y_actual')

    f_p.to_csv('/home/jubauser1/jzou/dcm_account7252/user_path/model/f&p_values_%s.csv'%mx)
    gain_chart.to_csv('/home/jubauser1/jzou/dcm_account7252/user_path/model/gain_chart_%s.csv'%mx)
    # The with-block closes (and flushes) the model file.
    with open('/home/jubauser1/jzou/dcm_account7252/user_path/model/%s.sav'%mx, 'wb') as fout:
        pickle.dump(m, fout)

m1: -0.063807


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


m2: -0.047173
m3: 0.857515
m4: 0.775955


In [None]:
{'m1': -0.063807, 'm2': -0.047173, 'm3': 0.857515, 'm4': 0.775955}

# read back

In [None]:
import pickle

In [None]:
# Read a saved model back, closing the file handle after loading.
# NOTE(review): this is a relative path, whereas the models above were written
# to an absolute directory -- it only resolves if the working directory is
# that directory.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open('m1.sav', 'rb') as fin:
    m1 = pickle.load(fin)

In [None]:
# scikit-learn's LogisticRegression has no describe() method, so the original
# call raised AttributeError. Show the hyper-parameters and the fitted
# coefficients and intercept instead.
m1.get_params(), m1.coef_, m1.intercept_