# Explore Customer Data
  Should be applicable to all customer data sets,
  Explores categories as well as time series


In [None]:
"""
from sklearn.datasets import fetch_openml
from sklearn import datasets, svm, metrics
from pandas import DataFrame
import matplotlib as mpl
"""

from typing import List, Tuple
from dataclasses import dataclass
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib

import glob
import os
from pathlib import Path

# get nltk and corpus
import nltk
from nltk.corpus import stopwords

# get spaCy and corpus
import spacy
import time
from functools import lru_cache
import seaborn as sns
import humanize
import swifter
import dask
import dask.dataframe as dd
from IPython.display import display

In [None]:
# Default figure shape for the notebook: twice as wide as it is tall.
height_in_inches = 10
width_in_inches = 2 * height_in_inches
matplotlib.rc("figure", figsize=(width_in_inches, height_in_inches))

# Load Data set

In [None]:
# NOTE(review): the original note here said the data was exported from HealthKit with the
# [qs-access](https://itunes.apple.com/us/app/qs-access/id920297614?mt=8) app, but the
# active path below points at an iMessage export -- confirm which source is intended.
# Previous input: raw_csv= "/home/idvorkin/data/wamd.all.csv"
raw_csv = "/Users/idvorkin/imessage/all.messages.csv"
# The cleaned frame is cached next to the raw file as "<raw_csv>.pickle".
cleaned_df_pickled = f"{raw_csv}.pickle"


# Intended workflow: Load+Clean+Explore with Dask (multi-core), then convert to a
# pandas DataFrame pickle. The Dask / tab-separated variants are kept for reference:
# df = dd.read_csv(raw_csv,sep='\t' )
# df = df.compute()
# df = pd.read_csv(raw_csv,sep='\t')
# Active loader: plain pandas, pipe-separated fields, one record per "\n".
df = pd.read_csv(raw_csv, sep="|", lineterminator="\n")

In [None]:
# Clean-up: derive the columns the rest of the notebook relies on.

# Parse the raw timestamps; values that cannot be parsed become NaT (errors="coerce").
# NOTE(review): the source column is spelled "date_uct" -- confirm it matches the CSV header.
df["datetime"] = pd.to_datetime(df["date_uct"], errors="coerce")
# Index the rows by timestamp while keeping "datetime" available as a regular column.
df = df.set_index("datetime", drop=False)

# Expose the raw id under the name the analysis cells expect.
df["customer_id"] = df["id"]

In [None]:
# Replace `df` with the cached, cleaned frame. The two lines that would materialise
# and write that cache are commented out, so the pickle file must already exist.
# NOTE(review): this discards the DataFrame built by the cells above -- confirm intended.
# NOTE(security): unpickling can execute arbitrary code; only load pickles you created.
# df = df.compute()
# df.to_pickle(cleaned_df_pickled)
df = pd.read_pickle(cleaned_df_pickled)

# Data Analysis

In [None]:
# One Series per column: each distinct value's share of the rows, as a percentage,
# most frequent value first.
# Each Series is explicitly renamed to its source column: pandas >= 2.0 names the
# result of value_counts(normalize=True) "proportion", which would otherwise make
# every `d.name` used by the report below identical.
distribs = [
    (df[column].value_counts(normalize=True) * 100).rename(column)
    for column in df.columns
]

In [None]:


def isFlatDistribution(d):
    """Return a truthy value when distribution ``d`` has no dominant first entry.

    ``d`` is treated as flat when it is empty, or when its first element
    (``d.iloc[0]``) is below 0.01.

    NOTE(review): the distributions built in this notebook are scaled to
    percentages, so the threshold reads as 0.01% -- confirm that is intended.
    """
    if len(d) == 0:
        return True
    return d.iloc[0] < 0.01


# Split the per-column distributions into flat ones and ones with a dominant value.
skewed_distribs = []
flat_distribs = []
for dist in distribs:
    bucket = flat_distribs if isFlatDistribution(dist) else skewed_distribs
    bucket.append(dist)

# Columns whose top value covers the largest share are reported first.
skewed_distribs.sort(key=lambda dist: -dist.iloc[0])
for dist in skewed_distribs:
    print(f"\n------ {dist.name} ----- ")
    print(f"{dist.head(10)}")

# Flat columns are only listed by name, alphabetically.
print("++Flat distribution++")
for dist in sorted(flat_distribs, key=lambda dist: dist.name):
    print(dist.name)
print("--Flat distribution--")

# Histogram of every column pandas can plot.
df.hist()

# Time series analysis

In [None]:
# Is any time series interesting beyond count()? Perhaps purchases?
# Candidates: df.offer_accepted_flg, df.offer_asin_cnt, has_purchase_flg, device_family, device_type_id
# Background: https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/
sns.set(rc={"figure.figsize": (11, 4)})
# Despite its name, count_hourly holds counts per "M" resample bin, not per hour.
# NOTE(review): newer pandas releases spell this alias "ME" -- confirm the target version.
count_hourly = df.resample("M").count()
# Any single column serves as the row count per bin; use the first one.
interactions_per_bin = count_hourly.iloc[:, 0]
interactions_per_bin.plot(title="Interactions over time")

# Customer Distribution Analysis

In [None]:
def plot_distribution_for(df, columns, minimum_call_count=0):
    """Plot what percentage of customers called K times, as a bar chart.

    Each row of ``df`` counts as one call by the customer in its
    ``customer_id`` column. Customers with ``minimum_call_count`` calls or
    fewer are left out before the percentages are computed.

    Args:
        df: DataFrame with a ``customer_id`` column.
        columns: Maximum number of bars to draw; the most common call
            counts are kept.
        minimum_call_count: Only customers with strictly more calls than
            this are included.

    Returns:
        None. The chart is displayed with ``plt.show()``; when no customer
        passes the filter a short message is printed instead of plotting.
    """
    cid = "customer_id"
    calls_per_customer = df[cid].value_counts()
    # Filtering the per-customer counts directly is equivalent to dropping the
    # excluded customers' rows from df and counting again.
    customer_by_count = calls_per_customer[calls_per_customer > minimum_call_count]
    if customer_by_count.empty:
        # Plotting an empty series raises; report and skip instead.
        print(f"No customers with more than {minimum_call_count} calls; nothing to plot.")
        return

    # Share (in %) of customers for each call count, limited to `columns` bars.
    pct_by_call_count = (customer_by_count.value_counts(normalize=True) * 100).iloc[
        :columns
    ]
    pct_by_call_count.index.name = "% usages by customer"
    # N is the size of the plotted universe, i.e. the number of customers kept.
    N = len(customer_by_count)
    # Whole percent of customers covered by the bars that are drawn.
    graphed = int(pct_by_call_count.sum())
    title = "What % of time do customers call K times? "
    # The filter above is strict, so the label says ">" rather than ">=".
    sub_title = (
        f"Universe customers calling > {minimum_call_count} times, "
        f"N={humanize.intcomma(N)}, Visible={graphed}%"
    )
    ax = pct_by_call_count.plot(kind="bar", title=f"{title}\n{sub_title}")
    ax.set_ylabel("% customers")
    plt.show()  # force the plot to show


# (number of bars, minimum call count) for each chart, from everyone to heavy users only.
for bar_count, call_floor in ((3, 0), (10, 3), (10, 10), (10, 500)):
    plot_distribution_for(df, bar_count, call_floor)

# MAU/WAU/DAU analysis

In [None]:
# throw away all customers who only call once.

# for remaining customers group into time index,
# pivot (cidx time_index)
#      | tindex_1 | tindex_2 | tindex_3| sum
# cid1 |
# sum along cid
# if the sum is close to the column index, call them the right value

In [None]:
# Interactions per customer, busiest customer first.
customer_by_count = df.customer_id.value_counts()
print(f"customer_by_count\n{customer_by_count.head(10)}")
# df.obf_customer_id.value_counts().head(20)

# Window of customers to inspect: ten positions starting at start_range.
start_range = 0
irange = range(start_range, start_range + 10)  # kept for the plot titles drafted below
# A positional slice (unlike indexing with the range itself) simply returns fewer
# rows when the data set has fewer customers than the window asks for.
customers_in_range = customer_by_count.iloc[start_range : start_range + 10]
print(f"customer_in_range\n{customers_in_range}")

# All rows belonging to the selected ("head") customers.
df_hc = df[df.customer_id.isin(customers_in_range.index.values)]
# Drafts for per-customer time series:
# count_hourly = df_hc.customer_id.resample("W").count()
# count_hourly = df_hc['2019-01':'2019-05'][["customer_id"]].groupby("customer_id").resample('D').count()
# count_hourly.plot()
# Displays the counts computed in the time series cell (not a head-customer series).
count_hourly

## TODO: Head customer behavior
## TODO Middle
## TODO: Tail removal

## Look at head vs tail

# See step functions Called O-10
# Top 10 customers,
# trange = range(0, 3)
# customer_by_count.value_counts(normalize=True).apply(lambda d:d).iloc[trange] # .plot(kind='pie', title=f'% customer {trange}')

In [None]:
# pd.pivot_table?

In [None]:
# Display the rows of the head customers selected in the earlier cell.
df_hc

In [None]:
# Interaction counts per head customer (rows) and time bin (columns), 2019 onward.
# Earlier attempts, kept for reference:
# df_hc.pivot_table([cid]).count()
# df_hc.pivot_table(cid,aggfunc='count')
freq = "M"
hc_since_2019 = df_hc["2019":]
cid_by_date = hc_since_2019.pivot_table(
    values="datetime",
    index=["customer_id"],
    columns=pd.Grouper(freq=freq),
    aggfunc="count",
)

# Rank customers by their total across all time bins, busiest first.
total_per_customer = cid_by_date.sum(axis=1)
cid_sorted_by_sum = total_per_customer.sort_values(ascending=False).index
# cid_by_date = cid_by_date.sort_values("customer_id")

# One line per customer for the four busiest, with time on the x axis.
busiest_customers = cid_sorted_by_sum[:4]
cid_by_date.T[busiest_customers].plot()
# cid_by_date.T.count().sort_values()
# df_hc.pivot_table(index=["customer_id"], columns=pd.Grouper(freq=freq), aggfunc="count")[ "customer_id" ].T.plot(title=f"customer {irange} by freq={freq}", figsize=(12, 8))