In [None]:
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
import altair as alt
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Load only the columns used below ('Date' is parsed as a datetime), drop rows
# that lack a duration or a participant identifier, and derive the weekday and
# hour of each log entry from 'Date'.
df = (
    pd.read_csv(
        'data/meet_logs_fake.csv',
        parse_dates=['Date'],
        usecols=[
            'Date', 'Meeting Code', 'Participant Identifier',
            'Participant Outside Organisation', 'Client Type', 'Organizer Email',
            'Product Type', 'Duration', 'Participant Name', 'City', 'Country',
        ],
    )
    .dropna(subset=['Duration', 'Participant Identifier'])
    .assign(
        dayofweek=lambda frame: frame['Date'].dt.dayofweek,
        hourofday=lambda frame: frame['Date'].dt.hour,
    )
)

# Split the log at 2020-03-14: rows dated before it go to xdf, rows dated on or
# after it go to ydf.
xdf = df.loc[df['Date'] < '2020-03-14']
ydf = df.loc[df['Date'] >= '2020-03-14']

df.head()

In [None]:
def get_profiles(df):
    """Build one usage profile per participant from a table of call-log rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dayofweek', 'hourofday', 'Duration',
        'Client Type', 'Product Type', 'Country', 'Participant Identifier'
        and 'Organizer Email'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Participant Identifier', in order of first
        appearance. Columns are:

        * ``<feature>_<value>`` -- the fraction of that participant's rows in
          which the feature takes that value, for each of dayofweek,
          hourofday, Client Type, Product Type, Country and is_organizer;
        * ``mean_duration`` -- the mean of 'Duration' over the participant's rows;
        * ``email`` -- the participant identifier.

        A feature value a participant never had is NaN in that row. Rows with
        a missing participant identifier are skipped. If there are no
        participants at all, an empty frame with only the 'email' column is
        returned.

    Notes
    -----
    ``is_organizer`` is 1 when 'Organizer Email' equals the participant
    identifier and -1 otherwise, except that rows whose 'Product Type' is
    'Classic Hangouts' are set to 0.
    """
    categorical = ['dayofweek', 'hourofday', 'Client Type', 'Product Type', 'Country',
                   'is_organizer']
    _df = df[['dayofweek', 'hourofday', 'Duration', 'Client Type', 'Product Type', 'Country',
              'Participant Identifier', 'Organizer Email']]

    profiles = []
    # One grouping pass instead of re-filtering the whole frame per participant.
    # sort=False keeps participants in order of first appearance.
    for person, calls in _df.groupby('Participant Identifier', sort=False):
        is_organizer = (calls['Organizer Email'] == person).astype(int).replace(0, -1)
        is_organizer[calls['Product Type'] == 'Classic Hangouts'] = 0
        calls = calls.assign(is_organizer=is_organizer)

        profile = {}
        for cname in categorical:
            # The mean of a one-hot column is the share of rows with that value.
            shares = pd.get_dummies(calls[cname], prefix=cname).mean(0)
            profile.update(shares.to_dict())
        profile['mean_duration'] = calls['Duration'].mean()
        profile['email'] = person
        profiles.append(profile)

    if not profiles:
        # Keep the 'email' column so callers can still index on it.
        return pd.DataFrame(columns=['email'])
    return pd.DataFrame.from_records(profiles)

### What does a user profile contain?

* Call initiation time - day of the week and hour of the day
* Duration which user usually spends in calls
* Whether the user is the organizer or a participant: {Yes, No, Don't Know}. (This can't be determined when the product is Classic Hangouts.)
* Client Type
* Product Type
* Call rating
* Location

### Build these profiles separately for the pre- and post-lockdown periods

### Profile comparison
* Compare normalized value counts of day of week, hour of day, Client Type, Product Type
* Compare duration distributions
* Return a comparison "report"

In [None]:
# Build the profile tables for both periods and index each by participant
# email; verify_integrity makes a duplicated email raise instead of passing silently.
predf = get_profiles(xdf).set_index('email', verify_integrity=True)
postdf = get_profiles(ydf).set_index('email', verify_integrity=True)

In [None]:
# Give both profile tables the same columns IN THE SAME ORDER.
# Adding only the missing columns leaves each table with its own column order,
# and the distance computed later compares the tables position by position, so
# differing orders would pair up unrelated features.
# The shared order is predf's existing columns followed by the columns that
# only postdf has. Newly added columns are filled with 0; existing values
# (including NaN) are left as they are.
all_columns = predf.columns.append(postdf.columns.difference(predf.columns))
predf = predf.reindex(columns=all_columns, fill_value=0)
postdf = postdf.reindex(columns=all_columns, fill_value=0)

In [None]:
# Keep only the participants that have a profile in both periods, and treat
# every feature value that is still missing as 0.
common = np.intersect1d(predf.index, postdf.index)
X = predf.loc[common].fillna(value=0)
Y = postdf.loc[common].fillna(value=0)

In [None]:
from sklearn.metrics.pairwise import paired_euclidean_distances

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Row-wise Euclidean distance between each participant's X and Y profiles,
# rescaled to [0, 1] across participants and labelled with X's index.
dist = pd.Series(
    MinMaxScaler()
    .fit_transform(paired_euclidean_distances(X.values, Y.values).reshape(-1, 1))
    .ravel(),
    index=X.index,
)

In [None]:
# The ten largest scaled distances, biggest first.
dist.sort_values(ascending=False).iloc[:10]

In [None]:
def compare_profiles(pid, pre=None, post=None):
    """Return one row of two profile tables side by side.

    Parameters
    ----------
    pid : label
        Index label of the row to look up in both tables.
    pre, post : pandas.DataFrame, optional
        Tables to read the row from. When omitted they default to the
        notebook-level ``X`` and ``Y`` respectively, so existing
        ``compare_profiles(pid)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'pre' and 'post', with one row per column of the input tables.
        The two columns are matched on those column names.

    Raises
    ------
    KeyError
        If ``pid`` is not in the index of either table.
    """
    pre = X if pre is None else pre
    post = Y if post is None else post
    return pd.DataFrame({'pre': pre.loc[pid], 'post': post.loc[pid]})

In [None]:
# Side-by-side pre/post profile for one participant.
compare_profiles(pid='david75@salinas-gomez.com')

In [None]:
# Write the two aligned profile tables to CSV (the index is written as well).
for _frame, _path in ((X, 'data/pre.csv'), (Y, 'data/post.csv')):
    _frame.to_csv(_path)

In [None]:
# Histogram of the scaled distances (10 bins, the pandas default).
dist.hist(bins=10)

In [None]:
# Keep one participant's pre/post comparison for the cells below.
x = compare_profiles(pid='amanda46@bishop.org')

In [None]:
# Display the pre/post comparison table stored in x.
x

In [None]:
# Bar chart of the day-of-week rows only, 'pre' next to 'post'.
x.loc[x.index.str.startswith('dayofweek_')].plot.bar()