In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql
import getpass

%matplotlib inline

In [2]:
# Open the MySQL connection used by the queries in this notebook.
# `password` / `database` are the documented keyword names; `passwd` / `db`
# are deprecated aliases in PyMySQL. The password is prompted for interactively
# so it never ends up in the notebook source.
conn = pymysql.connect(
    host="localhost",
    port=3306,
    user="root",
    password=getpass.getpass(),
    database="olist",
)

········


In [3]:
# One row per distinct (purchase month, seller) pair found by joining orders to
# their order items. month_id is returned as a "YYYY-MM-01" string and the rows
# come back ordered by month, then seller.
q = """
SELECT DISTINCT
    DATE_FORMAT(o.order_purchase_timestamp, "%Y-%m-01") AS month_id,
    oi.seller_id
FROM orders o
    JOIN order_items oi
    ON o.order_id = oi.order_id
ORDER BY 1,2
"""

df = pd.read_sql(sql=q, con=conn)

In [4]:
# Preview the first rows of the seller-month table returned by the query.
df.head()

Unnamed: 0,month_id,seller_id
0,2016-09-01,1554a68530182680ad5c8b042c3ab563
1,2016-09-01,a425f92c199eb576938df686728acd20
2,2016-09-01,ecccfa2bb93b34a3bf033cc5d1dcdc69
3,2016-10-01,011b0eaba87386a2ae96a7d32bb531d1
4,2016-10-01,01cf7e3d21494c41fb86034f2e714fa1


In [5]:
# Inspect column dtypes; month_id arrives from SQL as a plain string (object).
df.dtypes

month_id     object
seller_id    object
dtype: object

In [6]:
# Parse the month strings into datetimes so that date comparisons and
# offsets can be applied to month_id.
df = df.assign(month_id=pd.to_datetime(df["month_id"]))

In [None]:
# Filter timerange: keep January 2017 through August 2018, endpoints included.
# `Series.between` is inclusive on both sides by default. The boolean form
# `inclusive=True` was deprecated in pandas 1.3 and is rejected by pandas 2.x,
# so the argument is omitted to work on both old and new versions.
df = df[df["month_id"].between("2017-01-01", "2018-08-01")]

In [None]:
# Preview the table after restricting it to the analysis time range.
df.head()

In [None]:
# Pull out the rows of a single seller to prototype the classification on.
seller_mask = df["seller_id"] == "054694fa03fe82cec4b7551487331d74"
sample = df.loc[seller_mask]
sample

In [None]:
# Attach the month from the preceding row (rows are ordered by month in the
# query), i.e. this seller's previous active month. The first row has no
# predecessor and gets a missing value.
previous_active_month = sample["month_id"].shift()
sample = sample.assign(shifted_month_id=previous_active_month)
sample

In [None]:
# identify new: a row without a previous active month is the seller's first.
# np.where(cond, "new", np.nan) would produce a string array in which the
# missing marker becomes the literal text "nan", so a later .isna() check
# would match nothing. Using None keeps an object array with real missing
# values for the rows that still need a label.
sample = sample.assign(
    growth_type=np.where(sample["shifted_month_id"].isna(), "new", None)
)
sample

In [None]:
# Whole calendar months between each active month and the previous active one.
# A month is not a fixed-length duration, so dividing a timedelta by
# np.timedelta64(1, "M") is only an approximation on older pandas and is
# rejected by newer pandas. Year/month arithmetic gives the exact count
# (missing where there is no previous month).
(
    (sample["month_id"].dt.year - sample["shifted_month_id"].dt.year) * 12
    + (sample["month_id"].dt.month - sample["shifted_month_id"].dt.month)
)

In [None]:
# Month-of-year component of each active month, via the .dt accessor.
sample["month_id"].dt.month

In [None]:
# Exploratory check: type of the object returned by bracket column access.
type(sample["month_id"])

In [None]:
# Exploratory check: type of the object returned by attribute column access.
type(sample.month_id)

In [None]:
# Show the current state of the single-seller sample.
sample

In [None]:
# Start of the calendar month before each active month: step back to the
# previous month end, then back to that month's first day (month_id values
# are month starts).
to_prior_month_end = pd.offsets.MonthEnd()
to_month_start = pd.offsets.MonthBegin()
sample = sample.assign(
    previous_month=sample["month_id"] - to_prior_month_end - to_month_start
)
sample

In [None]:
# Label every row that has a previous active month: "repeat" when that month
# is the immediately preceding calendar month, otherwise "return".
# The labels are computed for all rows, so they are restricted to the rows
# being assigned; passing .loc a full-length array for a subset of rows is a
# length mismatch. Selecting on shifted_month_id (instead of on a missing
# growth_type) leaves the first, "new" row untouched regardless of how the
# placeholder for unlabeled rows is stored.
has_previous = sample["shifted_month_id"].notna()
labels = np.where(
    sample["shifted_month_id"].eq(sample["previous_month"]), "repeat", "return"
)
sample.loc[has_previous, "growth_type"] = labels[has_previous.to_numpy()]
sample

---

In [None]:
# Back to the full seller-month table; preview it before the vectorised version.
df.head()

In [None]:
# Copy of the activity table with every active month also projected one month
# forward (shifted_month = month_id + 1 month).
one_month = pd.DateOffset(months=1)
df_shifted = df.assign(shifted_month=df["month_id"] + one_month)
df_shifted.head()

In [None]:
# Earliest active month per seller within the filtered table, exposed as
# first_month next to seller_id.
df_first = (df
            .groupby("seller_id")["month_id"]
            .min()
            .rename("first_month")
            .reset_index())

df_first

In [None]:
# Attach each seller's first active month to every one of their rows.
df = pd.merge(df, df_first, how="left", on="seller_id")
df.head()

In [None]:
# Outer-join activity in month m with activity projected from month m-1
# (same seller). After the join:
#   - matched rows: the seller was active in both m-1 and m;
#   - left-only rows (shifted_month missing): active in m but not in m-1;
#   - right-only rows (left month missing): active in m-1 but not in m.
# Both sides carry a month_id column, so pandas suffixes them _x / _y.
df = pd.merge(
    df,
    df_shifted,
    how="outer",
    left_on=["month_id", "seller_id"],
    right_on=["shifted_month", "seller_id"],
)

df

In [None]:
# remove the rows outside the daterange (after aug 2018): the latest
# shifted_month is one month past the last observed month, so rows carrying it
# are dropped. Rows with a missing shifted_month compare as "not equal" and
# are kept.
latest_shifted_month = df["shifted_month"].max()
df = df[df["shifted_month"].ne(latest_shifted_month)].copy()

In [None]:
# Preview the merged table after dropping the out-of-range rows.
df.head()

In [None]:
# Count missing values per column of the merged table.
df.isna().sum()

In [None]:
# coalesce month_id: take the left-side month when present, otherwise fall
# back to the projected month from the right side of the join.
coalesced_month = df["month_id_x"].fillna(df["shifted_month"])
df = df.assign(month_id=coalesced_month)
df.head()

In [None]:
# 0/1 indicator columns derived from the left-side month (month_id_x):
#   inactive_1m - no left-side month on this row;
#   new         - left-side month equals the seller's first_month;
#   repeat      - left-side month equals the month projected from the prior one.
active_month = df["month_id_x"]
df = df.assign(
    inactive_1m=active_month.isna().astype(int),
    new=active_month.eq(df["first_month"]).astype(int),
    repeat=active_month.eq(df["shifted_month"]).astype(int),
)

In [None]:
# Whatever is not inactive, new or repeat is counted as returning:
# the flag is 1 minus the other three indicators.
df = df.assign(returning=1 - df["inactive_1m"] - df["new"] - df["repeat"])

In [None]:
# Spot-check the four indicator columns for the row labelled 134.
df.loc[[134], ["inactive_1m", "new", "repeat", "returning"]]

In [None]:
# Name of the indicator column holding the largest value for row 134.
df.loc[[134], ["inactive_1m", "new", "repeat", "returning"]].idxmax(axis="columns")

In [None]:
# Spot-check the four indicator columns for the row labelled 5923.
df.loc[[5923], ["inactive_1m", "new", "repeat", "returning"]]

In [None]:
# Name of the indicator column holding the largest value for row 5923.
df.loc[[5923], ["inactive_1m", "new", "repeat", "returning"]].idxmax(axis="columns")

---

In [None]:
# Collapse the four indicators into one label per row (the name of the column
# with the largest value), then keep only the columns needed downstream.
flag_columns = ["inactive_1m", "new", "repeat", "returning"]
df = df.assign(growth_type=df[flag_columns].idxmax(axis="columns"))
df = df.loc[:, ["month_id", "seller_id", "growth_type"]].copy()

In [None]:
# Preview the labelled seller-month table.
df.head()

In [None]:
# Number of seller rows per month and growth type, stored as active_seller.
df_agg = (df
          .groupby(["month_id", "growth_type"])["seller_id"]
          .count()
          .reset_index(name="active_seller"))

In [None]:
# Display the monthly counts per growth type.
df_agg

In [None]:
# One line per growth type showing the monthly seller count.
# A title and axis labels are set so the figure can be read on its own when
# the notebook is skimmed; the trailing semicolon hides the return-value repr.
fig, ax = plt.subplots(figsize=(16, 9))

sns.lineplot(data=df_agg,
             x="month_id",
             y="active_seller",
             hue="growth_type",
             ax=ax)

ax.set(title="Sellers per month by growth type",
       xlabel="Month",
       ylabel="Number of sellers");
