In [1]:
#   % run all_utilities.ipynb

In [2]:
def unique_counts(col_names, mobile_data):
    '''
    Reports and collects the unique values of each requested column.

    input: col_names - iterable of column labels present in mobile_data
           mobile_data - dataframe to inspect
    output: list with one array of unique values per column, in the same
            order as col_names

    A line with the number of distinct non-null values is printed for every
    column. Note that the returned arrays come from .unique() and therefore
    may contain NaN, while the printed count ignores NaN (dropna = True).
    '''
    # Results are collected in a local list so that the function does not
    # depend on (or keep appending to) notebook-level globals.
    all_uniques = []
    for col_name in col_names:
        column = mobile_data[col_name]
        num_unique = column.nunique(dropna=True)
        print("\nThere are " + str(num_unique) + " unique " + str(col_name) + "s.")
        all_uniques.append(column.unique())
    return all_uniques

In [3]:
def replace_copies(df, replace_this, replace_with):
    '''
    Merges rows whose 'name' is a known duplicate spelling of another name.

    input: df - dataframe with a 'name' column
           replace_this - list of values to look for in the 'name' column
           replace_with - list of replacement values, paired by position
                          with replace_this (unpaired extras are ignored)
    output: new dataframe grouped by 'name' with the remaining columns
            summed, and 'name' restored as a regular column

    Whenever a row's name matches an entry of replace_this, that value is
    replaced with its partner everywhere in the dataframe (not only in the
    'name' column), exactly as DataFrame.replace does.
    '''
    for position in range(len(df)):
        for old_value, new_value in zip(replace_this, replace_with):
            # Positional access: label-based df['name'][i] fails as soon as
            # the index is not the default 0..n-1 (e.g. after filtering).
            if df['name'].iloc[position] == old_value:
                df = df.replace(to_replace=old_value, value=new_value)
    df = df.groupby(['name']).sum()
    return df.reset_index()

In [4]:
def normalize_data(raw_data):
    '''
    Min/max-scales the 'values' column of raw_data.

    input: dataframe with a 'values' column convertible to float
    output: 2-D array (one column) produced by MinMaxScaler.fit_transform
    '''
    scaler = preprocessing.MinMaxScaler()
    # Double brackets keep the data two-dimensional, as the scaler expects.
    values_as_float = raw_data[['values']].values.astype(float)
    return scaler.fit_transform(values_as_float)

In [5]:
def apply_normalizer(normalized_data, device_type, device_list):
    '''
    Wraps normalized values in a dataframe and tags every row with a device.

    input: normalized_data - single-column data accepted by pd.DataFrame
           device_type - label stored for each row
           device_list - list that receives one device_type entry per row
                         (modified in place; its full contents become the
                         'device' column, so it is normally passed in empty)
    output: dataframe with columns 'normalized' and 'device'
    '''
    frame = pd.DataFrame(normalized_data)
    frame.columns = ['normalized']
    # One label per row, added to the caller's list.
    device_list.extend([device_type] * len(frame))
    frame['device'] = device_list
    return frame

In [6]:
def get_top_users(df, NUM_TOP):
    '''
    Returns the NUM_TOP most frequent usernames with their occurrence counts.

    input: df - dataframe with a 'username' column
           NUM_TOP - maximum number of users to return
    output: dataframe with columns 'index' (rank position starting at 0),
            'username' and 'no. clicks', ordered from most to least frequent

    If fewer than NUM_TOP distinct usernames exist, all of them are returned
    instead of raising an error.
    '''
    user_counts = df['username'].value_counts().reset_index()
    user_counts.columns = ['username', 'no. clicks']
    # value_counts already sorts in descending order, so the leading rows are
    # the top users; head() also keeps the counts numeric. max() guards
    # against head()'s "all but the last n" meaning for negative numbers.
    top_users = user_counts.head(max(NUM_TOP, 0))
    return top_users.reset_index()

In [7]:
def normalize_data_users(raw_data):
    '''
    Normalizes 'values' based on the min/max of each individual USER,
    not of the whole dataset.

    input: dataframe with 'username' and numeric 'values' columns
    output: copy of raw_data whose 'values' column is rescaled per user;
            the input dataframe is left unchanged

    A user whose values are all identical has a zero range, so the division
    yields NaN for that user's rows.
    '''
    # Only the 'values' column is transformed; other columns (possibly
    # non-numeric) are carried over untouched.
    per_user_scaled = raw_data.groupby('username')['values'].transform(
        lambda x: (x - x.min()) / (x.max() - x.min())
    )
    return raw_data.assign(values=per_user_scaled)

In [8]:
def convert_time(df):
    '''
    input: dataframe with a 'time' column holding datetimes as strings
    output: dataframe sorted by a new 'timestamp' column that holds the
            parsed datetime objects

    Each string is parsed with the notebook-level datetimeFormat first; when
    it does not match (the original note says: when milliseconds do not
    appear in the time string) the format '%Y-%m-%d %H:%M:%S %Z' is used
    instead. The number of rows that needed the fallback is printed.

    The 'timestamp' column is also added to the dataframe that was passed in.
    '''
    fallback_format = '%Y-%m-%d %H:%M:%S %Z'
    timestamp_list = []
    count = 0

    # Iterate over the values themselves so the row labels do not matter.
    for raw_time in df['time']:
        try:
            parsed = datetime.datetime.strptime(raw_time, datetimeFormat)
        except ValueError:
            # strptime signals a format mismatch with ValueError; anything
            # else (missing format constant, interrupt, ...) must surface.
            parsed = datetime.datetime.strptime(raw_time, fallback_format)
            count += 1
        timestamp_list.append(parsed)
    df['timestamp'] = timestamp_list
    print("No milliseconds count: " + str(count))
    return df.sort_values(by='timestamp', ascending=True)

In [9]:
def create_count_df_user(df):
    '''
    input: dataframe with 'username', 'device' and a datetime-like
           'timestamp' column (each value must expose .week and .year)
    output: dataframe with username, device, week_year, and count columns,
            sorted by week_year

    week_year is the string "WW/YYYY", so the sort is alphabetical (week
    number first) rather than strictly chronological across years.
    The 'week_year' column is also added to the dataframe that was passed in.
    '''
    df['week_year'] = df['timestamp'].apply(lambda x: "%02d/%d" % (x.week, x.year))

    # Occurrences of every week per (username, device) pair; the result is a
    # Series indexed by username / device / week_year.
    week_counts = df.groupby(['username', 'device'])['week_year'].value_counts()

    df_counts = week_counts.to_frame()
    df_counts.columns = ['count']
    df_sorted_counts = df_counts.sort_values(by='week_year', ascending=True)

    return df_sorted_counts.reset_index()

In [10]:
def create_count_df_mobile(df):
    '''
    input: dataframe with 'username' and a datetime-like 'timestamp' column
           (each value must expose .week and .year)
    output: dataframe with username, week_year, and count columns,
            sorted by week_year

    week_year is the string "WW/YYYY", so the sort is alphabetical (week
    number first) rather than strictly chronological across years.
    The 'week_year' column is also added to the dataframe that was passed in.
    '''
    df['week_year'] = df['timestamp'].apply(lambda x: "%02d/%d" % (x.week, x.year))

    # Occurrences of every week per username; the result is a Series indexed
    # by username / week_year.
    week_counts = df.groupby(['username'])['week_year'].value_counts()

    df_counts = week_counts.to_frame()
    df_counts.columns = ['count']
    df_sorted_counts = df_counts.sort_values(by='week_year', ascending=True)

    return df_sorted_counts.reset_index()

In [11]:
def create_count_df_perweek(df):
    '''
    input: dataframe with week_year, count (per user), device type
    output: dataframe with week_year, device and clicks_per_week columns,
            where clicks_per_week is the sum of count over all rows of the
            same week_year and device; sorted by week_year
    '''
    # Sum of the per-user counts for every (week_year, device) pair; the
    # result is a Series indexed by week_year / device.
    weekly_totals = df.groupby(['week_year', 'device'])['count'].sum()

    df_counts = weekly_totals.to_frame()
    df_counts.columns = ['clicks_per_week']
    df_sorted_counts = df_counts.sort_values(by='week_year', ascending=True)

    return df_sorted_counts.reset_index()

In [12]:
def _plot_delta_mean_bars(data, title, x_limits):
    '''
    Draws and shows one horizontal bar chart of delta_mean per question,
    coloured by user_exp.
    '''
    # catplot is a figure-level function: it creates its own figure, so no
    # plt.figure() call is needed (one issued beforehand would stay empty).
    # The plot size is governed by catplot's height/aspect arguments.
    sns.catplot(x='delta_mean', y='question', hue='user_exp',
                data=data, kind="bar", legend=True, aspect=2)
    plt.title(title, fontsize=14)
    plt.xlabel('delta_mean', fontsize=13)
    plt.ylabel('question', fontsize=13)
    plt.xlim(*x_limits)
    plt.xticks(
        rotation=45,
        horizontalalignment='right',
        fontsize='large'
    )

    plt.show()


def plot_distinct_and_correct(df_distinct, df_correct, exp):
    '''
    input: dataframes distinct data and final correctness data (each with
           delta_mean, question and user_exp columns), and number of user
           experience groups (appended to the plot titles)
    output: two horizontal bar plots, each preceded by a printed note on how
            to read the sign of delta_mean
    '''
    print("    *delta_mean (positive = less than num_distinct average / required fewer attempts)")
    _plot_delta_mean_bars(
        df_distinct,
        "Distance From Mean Per Question (Distinct Attempts)" + str(exp),
        (-5, 5),
    )

    print("    *delta_mean (negative = greater than accuracy average / worse accuracy)")
    _plot_delta_mean_bars(
        df_correct,
        "Distance From Mean Per Question (Accuracy)" + str(exp),
        (-.15, .05),
    )

In [13]:
def wilcoxon_test(df_dist_1, df_dist_2, df_corr_1, df_corr_2, alpha=0.05):
    '''
    Runs a Wilcoxon signed-rank test on two pairs of samples and prints the
    outcome of each.

    input: df_dist_1, df_dist_2 - paired samples of delta mean distinct attempts
           df_corr_1, df_corr_2 - paired samples of delta mean correctness
           alpha - significance level compared against each p-value
                   (defaults to 0.05)
    output: nothing is returned; for each pair the statistic, the p-value and
            the decision about H0 are printed
    '''
    comparisons = (
        ("Delta Mean Distinct Attempts:", df_dist_1, df_dist_2),
        ("\nDelta Mean Correctness:", df_corr_1, df_corr_2),
    )
    for heading, first_sample, second_sample in comparisons:
        statistic, p_value = wilcoxon(first_sample, second_sample)
        print(heading)
        print('  Statistic=%.3f, p=%.8f' % (statistic, p_value))

        if p_value > alpha:
            print('  Samples look equal (fail to reject H0)')
        else:
            print('  Samples do not look equal (reject H0)')

In [14]:
def wilcoxon_test_bon(df_dist_1, df_dist_2, df_corr_1, df_corr_2,
                      num_comparisons=3, alpha=0.05):
    '''
    Runs a Wilcoxon signed-rank test on two pairs of samples, judging each
    p-value against a Bonferroni-corrected significance level.

    input: df_dist_1, df_dist_2 - paired samples of delta mean distinct attempts
           df_corr_1, df_corr_2 - paired samples of delta mean correctness
           num_comparisons - number of comparisons the correction accounts
                             for (defaults to 3)
           alpha - uncorrected significance level (defaults to 0.05)
    output: nothing is returned; for each pair the statistic, the p-value and
            the decision about H0 are printed
    '''
    # Bonferroni correction: split the significance level evenly across all
    # comparisons.
    corrected_alpha = alpha / num_comparisons

    comparisons = (
        ("Delta Mean Distinct Attempts:", df_dist_1, df_dist_2),
        ("\nDelta Mean Correctness:", df_corr_1, df_corr_2),
    )
    for heading, first_sample, second_sample in comparisons:
        statistic, p_value = wilcoxon(first_sample, second_sample)
        print(heading)
        print('  Statistic=%.3f, p=%.8f' % (statistic, p_value))

        if p_value > corrected_alpha:
            print('  Samples look equal (fail to reject H0)')
        else:
            print('  Samples do not look equal (reject H0)')