In [None]:
import pandas as pd
import numpy as np
import datetime
import time
from pytz import UTC
from pytz import timezone
import json
from library.utils.setting import DATA_SPLITS_STUDY_START_DATE, DATA_SPLITS_STUDY_END_DATE
from utils.getDataSplits import getDaywiseSplitsForEpoch

In [None]:
# Load the tab-separated calls table with explicit per-column dtypes.
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
data_file = '/Users/yasaman/UWEXP/data/aggregated/calls_test.txt'
# dtype requested for every column that is read from the file
data_types = {
       "_id": "int32", 
      "timestamp": "int64", 
      "device_id": "str", 
      "call_type": "int32", 
      "call_duration": "int32", 
      "trace": "str"
}
# first line of the file is the header; fields are tab-separated
data = pd.read_csv(data_file,
                   header=0,
                   dtype=data_types,
                   sep='\t', 
                   lineterminator='\n',
                   encoding = "ISO-8859-1")
# preview the first 15 rows
data.head(15)

In [None]:
# Load the JSON object that is later passed to combine_ios_records as
# `device_types` (looked up there by device_id and compared against 'iOS').
# NOTE(review): absolute, machine-specific path.
device_types_file = '/Users/yasaman/UWEXP/script-input/sensors/device_id_type.json'
with open(device_types_file, 'r') as fileObj:
    device_types = json.load(fileObj)

In [None]:
# iOS call states as they appear in the `call_type` column
INCOMING_IOS = 1
CONNECTED_IOS = 2
DIALING_IOS = 3
DISCONNECTED_IOS = 4
# Android (reference) call types that iOS sessions are mapped onto
INCOMING = 1
OUTGOING = 2
MISSED = 3
def combine_ios_records(session:pd.DataFrame, columns:list, device_types:dict)->pd.DataFrame:
    """\
    Combines information of iOS records belonging to a certain call session of a certain device_id 
    to produce a single record to represent the call session as incoming, outgoing, or missed in a
    way that resembles how information are recorded for Android calls.

    Parameters
    ----------
    session : records of one (device_id, trace) group; must contain the columns
              'device_id', 'trace', 'call_type' and 'call_duration'.
    columns : names of the columns to keep in the returned frame.
    device_types : mapping from device_id to device type; only the value 'iOS'
                   triggers the combination.

    Returns
    -------
    - For devices that are not 'iOS': `session[columns]` unchanged.
    - For a valid iOS session: a one-row frame built from the record with the
      lowest call_type, with call_type replaced by INCOMING / OUTGOING / MISSED
      and call_duration taken from the record with the highest call_type
      (longest duration first on ties).
    - An empty frame with `columns` (after printing a message) when the device
      type is unknown or the session is inconsistent: no disconnected state,
      both incoming and dialing states, or a connected state with neither an
      incoming nor a dialing state.
    """
    device_id = session['device_id'].iloc[0]
    if device_id not in device_types:
        print('device type not available for {}'.format(device_id))
        return pd.DataFrame(columns=columns)
    device_type = device_types[device_id]
    trace = session['trace'].iloc[0]

    if device_type != 'iOS':
        return session[columns]

    session_states = session['call_type'].unique()

    if DISCONNECTED_IOS not in session_states:
        print('iOS call sessions must have a disconnected state\n\t not the case in {}'.format(trace))
        return pd.DataFrame(columns=columns)

    if INCOMING_IOS in session_states and DIALING_IOS in session_states:
        print('iOS call sessions cannot be both incoming and outgoing\n\t not the case in {}'.format(trace))
        return pd.DataFrame(columns=columns)

    if CONNECTED_IOS not in session_states:
        # NOTE depending on how Android record unsuccessful outgoing calls I may need to
        #      discard records with DIALING_IOS in their sessions
        call_type = MISSED
    elif INCOMING_IOS in session_states:
        call_type = INCOMING
    elif DIALING_IOS in session_states:
        call_type = OUTGOING
    else:
        # A connected call with no incoming/dialing record has no direction;
        # without this guard call_type would be unbound further down.
        print('iOS call sessions that connect must have an incoming or dialing state\n\t not the case in {}'.format(trace))
        return pd.DataFrame(columns=columns)

    if CONNECTED_IOS in session_states:
        # the connected record is not needed for the summary
        session = session[session['call_type'] != CONNECTED_IOS]

    # Report (but tolerate) repeated states within one session.
    for state, label in ((INCOMING_IOS, 'incoming'),
                         (DIALING_IOS, 'dialing'),
                         (DISCONNECTED_IOS, 'disconnected')):
        if (session['call_type'] == state).sum() > 1:
            print('multiple {} records found for iOS call session {}'.format(label, trace))

    # duration comes from the record with the highest call_type; longest first on ties
    call_duration = session.sort_values(by=['call_type', 'call_duration'],
                                        ascending=[False, False])['call_duration'].iloc[0]

    # the record with the lowest call_type represents the session; copy it so
    # the assignments below never write into `session`
    session_summary = session.sort_values(by='call_type').iloc[0].copy()
    session_summary['call_type'] = call_type
    session_summary['call_duration'] = call_duration
    return pd.DataFrame([session_summary[columns]])

#def correct_ios_coding(table:pd.DataFrame, device_types_file:str)->pd.DataFrame:
#    if table.shape[0] == 0:
#        return table
#    with open(device_types_file, 'r') as fileObj:
#        device_types = json.load(fileObj)
#    table = table.groupby(by=['device_id', 'trace']).apply(combine_ios_records, device_types)
#    return table

In [None]:
# Collapse the records of every (device_id, trace) group with combine_ios_records
# and display the result ordered by _id.
org_columns = list(data.columns)
# `columns` (all columns except the two grouping keys) is only used by the
# commented-out variant below; the active call passes `org_columns`.
columns = org_columns.copy()
columns.remove('device_id')
columns.remove('trace')
#data.groupby(by=['device_id', 'trace']).apply(combine_ios_records, columns, device_types).reset_index()[org_columns].sort_values('_id')
data.groupby(by=['device_id', 'trace'], as_index=False).apply(combine_ios_records, org_columns, device_types).sort_values('_id')

In [None]:
def convert_timezone(df:pd.DataFrame, tz_:str='US/Eastern', cols:dict={'timestamp':'datetime_EST'}) -> None:
    """\
    converts columns of dataframe df to timezone and adds them to the dataframe under the associated name
    NOTE that the changes happen in place

    Parameters
    ----------
    df   : frame holding the source columns; the new columns are added to it.
    tz_  : name of the target timezone (e.g. 'US/Pacific'). An unknown name
           makes pandas raise before any column is added.
    cols : mapping {source column: name of the new column}. Source values are
           read as milliseconds since the Unix epoch (UTC); values that cannot
           be parsed become NaT.

    Returns
    -------
    None -- df is modified in place.
    """
    # Parse and convert every column first so that a failure (missing column,
    # unknown timezone) leaves df untouched.
    converted = {}
    for source_col, target_col in cols.items():
        utc_times = pd.to_datetime(df[source_col], unit='ms', errors='coerce', utc=True)
        # vectorized conversion; NaT entries stay NaT
        converted[target_col] = utc_times.dt.tz_convert(tz_)

    for target_col, values in converted.items():
        df[target_col] = values

In [None]:
# Add a US/Pacific version of `timestamp` as column `timestamp_PST`,
# make it the index, then strip the timezone information from that index.
convert_timezone(data, 'US/Pacific', {'timestamp':'timestamp_PST'})
data.set_index("timestamp_PST", inplace=True)
data = data.tz_localize(None)
print(data.shape)
data

In [None]:
# Display `data` with timezone information removed from its index; the result
# is not assigned, so `data` itself is left unchanged by this cell.
data.tz_localize(None)

In [None]:
# Hand-written time ranges: one row per range, columns are [from, to].
# NOTE(review): values look like epoch seconds (they are multiplied by 1000
# before being compared with `timestamp` further down) -- confirm.
from_ = np.array((1516521600, 1516780800))
to_ = np.array((1516780799, 1517039999))
timeranges = np.column_stack((from_,to_))
timeranges

In [None]:
# Replace the hand-written ranges with the ones returned by the project helper
# for the "night" epoch.
# NOTE(review): presumably a 2D [from, to] array like the one built by hand -- confirm.
timeranges = getDaywiseSplitsForEpoch("night")
timeranges

In [None]:
# Read the additional-arguments JSON and keep its 'on_site_periods' entry.
# NOTE(review): absolute, machine-specific path.
additional_arguments_file = '/Users/yasaman/UWEXP/script-input/sensors/location-feature-additionalargs.json'
with open(additional_arguments_file, 'r') as fileObj:
    arguments = json.load(fileObj)
periods = arguments['on_site_periods']

In [None]:
def _period_bound_to_epoch(bound):
    """\
    Converts one period boundary, given as a mapping with the keys 'year',
    'month', 'day', 'hour', 'minute' and 'second', into the value returned by
    time.mktime for that date and time.
    NOTE(review): time.mktime interprets the fields in the local timezone of
    the machine running the notebook -- confirm this is intended.
    """
    moment = datetime.datetime(bound['year'],
                               bound['month'],
                               bound['day'],
                               bound['hour'],
                               bound['minute'],
                               bound['second'])
    return time.mktime(moment.timetuple())

# one row per period: column 0 is the start, column 1 the end (stored as int64)
periodranges = np.ndarray(shape=(len(periods), 2), dtype=np.int64)
for row, period in enumerate(periods):
    periodranges[row] = (_period_bound_to_epoch(period['start']),
                         _period_bound_to_epoch(period['end']))
periodranges

In [None]:
# YSS
def in_range(range_:pd.Series, df:pd.DataFrame)->pd.Series:
    """\
    Returns the boolean indices of rows in dataframe df that fall within range_ (inclusive)

    range_ must provide the bounds under the keys 'from' and 'to'; they are
    compared against the 'timestamp' column of df.
    """
    ind  = (df['timestamp'] >= range_['from']) & (df['timestamp'] <= range_['to'])
    return ind

# YSS
def timerange_filter(df:pd.DataFrame, timeranges:np.ndarray)->pd.DataFrame:
    """\
    Returns the filter that can be used to get all the rows of dataframe df where 
    timestamp column falls in timeranges, a 2D NumPy array with start time in the
    first column and end time in the second column.

    The result is a boolean Series sharing the index of df; a row is True when
    its timestamp lies inside at least one range (bounds inclusive). When
    timeranges has no rows, every entry is False.
    """

    ranges = pd.DataFrame(timeranges, columns=['from', 'to'])
    # Accumulate positionally so the mask always has one entry per row of df,
    # even when there are no ranges.
    selected = np.zeros(len(df), dtype=bool)
    for _, range_ in ranges.iterrows():
        selected |= in_range(range_, df).to_numpy(dtype=bool)
    return pd.Series(selected, index=df.index)

In [None]:
# Boolean filter for rows of `data` inside the epoch ranges; the ranges are
# scaled by 1000 before being compared with the `timestamp` column.
epoch_filter = timerange_filter(data, timeranges * 1000)

In [None]:
# Boolean filter for rows of `data` inside the on-site periods; the ranges are
# scaled by 1000 before being compared with the `timestamp` column.
period_filter = timerange_filter(data, periodranges * 1000)

In [None]:
def badd_data(df:pd.DataFrame) -> pd.DataFrame:
    """\
    Returns the total and average frequency each bluetooth address appears 
    as well as the number of days that address appears. Total and average
    are obtained from the daily frequency values.

    Parameters
    ----------
    df : frame with a 'timestamp' column (milliseconds since the Unix epoch)
         and a 'bt_address' column.

    Returns
    -------
    A frame with the columns 'bt_address', 'avgfreq' (mean daily count),
    'numdays' (number of days with at least one record) and 'freq' (total
    count), sorted by 'freq' in descending order.
    """

    # find the daily frequency of each bluetooth address
    dates = pd.to_datetime(df['timestamp'], unit='ms').dt.date.rename('date')
    freq_table = df.groupby(by=[dates, 'bt_address']).size().reset_index(name='freq')

    # find the total and average frequency as well as the number of days 
    # of each bluetooth address and store them in baddress_freq_data;
    # only the 'freq' column is aggregated so the output always has exactly
    # the three statistics below
    baddress_freq_data = (freq_table.groupby(by='bt_address')['freq']
                                    .agg(['mean', 'size', 'sum'])
                                    .reset_index())
    baddress_freq_data.columns = ['bt_address', 'avgfreq', 'numdays', 'freq']
    return baddress_freq_data.sort_values(by=['freq'], ascending=False)

In [None]:
# Display the per-address frequency summary for `data`.
# NOTE(review): requires `data` to hold a table with a 'bt_address' column at this point.
badd_data(data)

In [None]:
# Android activity names.
# The IN_VEHICLE / ON_BICYCLE strings were swapped; each name now matches its
# constant and the corresponding *_TYPE code below.
# NOTE(review): any priority list built from these two names changes its
# in_vehicle / on_bicycle order with this fix -- re-check the intended order.
ANDROID_WALKING_NAME = 'walking'
ANDROID_RUNNING_NAME = 'running'
ANDROID_ON_FOOT_NAME = 'on_foot'
ANDROID_IN_VEHICLE_NAME = 'in_vehicle'
ANDROID_ON_BICYCLE_NAME = 'on_bicycle'
ANDROID_STILL_NAME = 'still'
ANDROID_TILTING_NAME = 'tilting'
ANDROID_UNKNOWN_NAME = 'unknown'
# Android activity type codes paired with the names above
ANDROID_WALKING_TYPE = 7
ANDROID_RUNNING_TYPE = 8
ANDROID_ON_FOOT_TYPE = 2
ANDROID_IN_VEHICLE_TYPE = 0
ANDROID_ON_BICYCLE_TYPE = 1
ANDROID_STILL_TYPE = 3
ANDROID_TILTING_TYPE = 5
ANDROID_UNKNOWN_TYPE = 4

def androidize_iOSactivity(row):
    """\
    Returns the activity_type and activity_name information for a given row in 
    iOS activity table so iOS tables can be processed using the same library 
    functions developed for Android activity. Note that if there are more than
    a single activity labeled (e.g. in UW phase I we have records with both 
    stationary = 1 and automotive = 1), the one higher up here is considered. 
    That is: walking > running > cycling > automotive > stationary > unknown

    `row` must support lookup by the keys 'walking', 'running', 'cycling',
    'automotive', 'stationary' and 'unknown'. The result is a tuple
    (activity_type, activity_name), or (None, None) when none of the flags
    equals 1.
    """
    if row['walking'] == 1:
        return (ANDROID_WALKING_TYPE, ANDROID_WALKING_NAME)
    if row['running'] == 1:
        return (ANDROID_RUNNING_TYPE, ANDROID_RUNNING_NAME)
    if row['cycling'] == 1:
        return (ANDROID_ON_BICYCLE_TYPE, ANDROID_ON_BICYCLE_NAME)
    if row['automotive'] == 1:
        return (ANDROID_IN_VEHICLE_TYPE, ANDROID_IN_VEHICLE_NAME)
    if row['stationary'] == 1:
        return (ANDROID_STILL_TYPE, ANDROID_STILL_NAME)
    if row['unknown'] == 1:
        return (ANDROID_UNKNOWN_TYPE, ANDROID_UNKNOWN_NAME)
    return (None, None)

In [None]:
# Keep only the rows whose `activities` value is not null.
# NOTE(review): rebinding `data` makes this cell non-repeatable from the raw table.
data = data[data['activities'].notnull()]
data.head()

In [None]:
# Map every iOS activity row to its Android (type, name) pair and store the two
# values as the columns `activity_type` and `activity_name`.
#data[['activity_type', 'activity_name']] = data.apply(lambda x: androidize_iOSactivity(x), 
#                                                      axis=1).apply(pd.Series)
# the second apply expands the returned tuples into columns 0 (type) and 1 (name)
androidized_columns = data.apply(lambda x: androidize_iOSactivity(x), axis=1).apply(pd.Series)
# NOTE(review): `data` is a filtered selection of an earlier frame here, so these
# assignments may trigger pandas' SettingWithCopyWarning -- confirm.
data.loc[:, 'activity_type'] = androidized_columns[0]
data.loc[:, 'activity_name'] = androidized_columns[1]
data.head()

In [None]:
# Distinct activity type codes present after the mapping.
data['activity_type'].unique()

In [None]:
# Distinct activity names present after the mapping.
data['activity_name'].unique()

In [None]:
def highest_confidence_and_priority(df, priority_list, columns):
    """\
    Picks, among the rows of df, the one with the largest `confidence`; ties are
    broken by `activity_name`, where a name placed later in priority_list wins.
    Returns the requested columns of that row. The frame passed in is left
    untouched.
    """
    # work on a new frame whose activity_name is a categorical ordered like priority_list
    prioritized_names = pd.Categorical(df['activity_name'], priority_list)
    ranked = df.assign(activity_name=prioritized_names)
    ranked = ranked.sort_values(by=['confidence', 'activity_name'],
                                ascending=[False, False])
    best_row = ranked[columns].iloc[0]
    return best_row

In [None]:
# Keep a copy of the table before reducing it to one row per timestamp.
data_ = data.copy()
# Listed from highest to lowest priority; the list is reversed when passed on
# because highest_confidence_and_priority treats a larger index as higher priority.
activity_priority_list = [ANDROID_WALKING_NAME, 
                          ANDROID_RUNNING_NAME, 
                          ANDROID_ON_FOOT_NAME, 
                          ANDROID_IN_VEHICLE_NAME, 
                          ANDROID_ON_BICYCLE_NAME, 
                          ANDROID_STILL_NAME, 
                          ANDROID_TILTING_NAME, 
                          ANDROID_UNKNOWN_NAME]
# `timestamp` is the grouping key, so it is restored by reset_index() rather than
# being returned by the applied function
columns_to_keep = list(data.columns)
columns_to_keep.remove('timestamp')
data = data.groupby('timestamp').apply(highest_confidence_and_priority, 
                                       activity_priority_list[::-1],
                                       columns_to_keep).reset_index()[data.columns]

In [None]:
# Small hand-made frame for trying out the priority logic: sixteen records that
# all share timestamp 0, covering every activity label with confidence 1 and 2.
df = pd.DataFrame({
    'activity_type': ['still', 'running', 'walking', 'in_vehicle',
                      'still', 'tilting', 'walking', 'in_vehicle',
                      'unknown', 'running', 'on_bicycle', 'on_foot',
                      'unknown', 'on_bicycle', 'tilting', 'on_foot'],
    'confidence': [2, 1, 2, 1,
                   1, 1, 1, 2,
                   2, 2, 1, 2,
                   1, 2, 2, 1],
    'timestamp': [0] * 16,
})
df

In [None]:
# Same priority list as used for the real data, listed from highest to lowest priority.
activity_priority_list = [ANDROID_WALKING_NAME, 
                          ANDROID_RUNNING_NAME, 
                          ANDROID_ON_FOOT_NAME, 
                          ANDROID_IN_VEHICLE_NAME, 
                          ANDROID_ON_BICYCLE_NAME, 
                          ANDROID_STILL_NAME, 
                          ANDROID_TILTING_NAME, 
                          ANDROID_UNKNOWN_NAME]
# The test frame keeps its labels in `activity_type`; a categorical copy ordered by
# the reversed list is stored as `activity_name`, then the rows are shown sorted by
# confidence and priority (both descending).
df['activity_name'] = pd.Categorical(df['activity_type'], activity_priority_list[::-1])
df.sort_values(by=['confidence', 'activity_name'], ascending=[False, False])

In [None]:
# Keep a copy of the test frame, then reduce it to one row per timestamp with
# highest_confidence_and_priority and restore the original column order.
df_ = df.copy()
columns_to_keep = list(df.columns)
columns_to_keep.remove('timestamp')
df = df.groupby('timestamp').apply(highest_confidence_and_priority, 
                                   activity_priority_list[::-1],
                                   columns_to_keep).reset_index()[df.columns]
df

In [None]:
def most_reliable(df, columns):
    """\
    Selects the most important row of df and returns its requested columns.
    Rows are ranked by, in order:
      (1) inference, smallest first: conversation (-1) > voice (1) > noise (2)
      (2) double_energy, largest first (highest energy / amplitude)
      (3) double_convo_start, largest first (latest start; only applicable to
          conversations)
      (4) double_convo_end, smallest first (earliest end; only applicable to
          conversations)
    """
    # (column, ascending?) pairs in ranking order
    ranking = [('inference', True),
               ('double_energy', False),
               ('double_convo_start', False),
               ('double_convo_end', True)]
    sort_keys = [key for key, _ in ranking]
    sort_ascending = [is_ascending for _, is_ascending in ranking]
    ranked = df.sort_values(by=sort_keys, ascending=sort_ascending)
    top_row = ranked[columns].iloc[0]
    return top_row

In [None]:
# Keep a copy of the table, then reduce it to the most reliable row per timestamp.
data_ = data.copy()
# `timestamp` is the grouping key, so it is restored by reset_index()
columns_to_keep = list(data.columns)
columns_to_keep.remove('timestamp')
data = data.groupby('timestamp').apply(most_reliable, 
                                       columns_to_keep).reset_index()[data.columns]
# Drop the 'double_' prefix from the column names.
# NOTE(review): index=str also converts the index labels to strings -- confirm this is wanted.
data.rename(index=str, columns={'double_energy' : 'energy', 
                                'double_convo_start' : 'convo_start', 
                                'double_convo_end' : 'convo_end'}, inplace=True)

In [None]:
# Number of records per timestamp in the pre-reduction copy; `multi` keeps only
# the timestamps that occur more than once.
records = data_.groupby('timestamp').size()
multi = records[records > 1]

In [None]:
# Rows of the pre-reduction copy whose timestamp occurs more than once.
data_[data_['timestamp'].isin(list(multi.keys()))]


In [None]:
# The same timestamps in the reduced table, for comparison with the copy.
data[data['timestamp'].isin(list(multi.keys()))]

In [None]:
# Total number of rows vs. number of rows with a non-null activity_name,
# followed by the column summary of `data`.
print(data.shape[0], '---', data[data['activity_name'].notnull()].shape[0])
data.info()

In [None]:
# Column summary (names, non-null counts, dtypes) of `data`.
data.info()

In [None]:
# Distinct inference values among the rows whose timestamp occurred more than once.
data[data['timestamp'].isin(list(multi.keys()))]['inference'].unique()

In [None]:
def function_test(df):
    """\
    Demonstrates that a frame passed to a function is modified in place:
    prints two explanatory lines, then sets the column `new_col` of df to 9
    (creating the column when it does not exist). Returns nothing.
    """
    print("you'll see a new column with name new_col added to df.",
          "if your df already has a new_col column its value is going to become 9",
          sep='\n')
    column_name, fill_value = 'new_col', 9
    df[column_name] = fill_value

In [None]:
# Tiny frame (one missing value in col1) used to try out function_test.
df = pd.DataFrame(dict(
    col1=['A', 'A', 'B', np.nan, 'D', 'C'],
    col2=[2, 1, 9, 8, 7, 4],
    col3=[0, 1, 9, 4, 2, 3],
))

In [None]:
# Show the frame's column summary before and after function_test to make the
# in-place change visible. DataFrame.info() prints its report itself and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line after each report.
df.info()
function_test(df)
df.info()