In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import plotly as pltly
import numpy as np 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import gc 
import plotly.express as px

In [None]:

def read_whole_frame(path="LUMEN_DS.csv"):
    """Load the dataset from a pipe-separated, UTF-16-LE encoded CSV file.

    Parameters
    ----------
    path : str or path-like, default "LUMEN_DS.csv"
        Location of the CSV file. The default keeps existing no-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents (first line used as header) without the
        ``sales_channel_grouping`` column.

    Raises
    ------
    KeyError
        If the file has no ``sales_channel_grouping`` column.
    """
    raw_frame = pd.read_csv(path, sep='|', header=0, encoding='utf_16_le')
    # Build the result as a new frame rather than mutating in place.
    return raw_frame.drop(columns="sales_channel_grouping")

In [None]:
# Load the full dataset only long enough to extract the customer-level columns.
data_frame = read_whole_frame()
# drop_duplicates() returns a new frame, so its result must be kept;
# calling it without assigning leaves every duplicated row in place.
customer_frame = data_frame[["customer_id","top_customer_group","customer_region"
                             ,"customer_industry","customer_first_invoice_date"]].drop_duplicates()

# Release the large raw frame.
del data_frame
gc.collect()

In [None]:
# Dataset without outliers for better visualization 
def remove_outliers(feature, dataset, factor=4):
    """Drop rows whose ``feature`` value lies above the upper IQR fence.

    The fence is ``Q3 + factor * IQR``, where Q1 and Q3 are the 0.25 and 0.75
    quantiles of ``dataset[feature]``. Only high values are removed; no lower
    fence is applied.

    Parameters
    ----------
    feature : str
        Label of the column to inspect.
    dataset : pandas.DataFrame
        Frame to filter. It is modified IN PLACE; pass a copy to keep the
        original intact.
    factor : float, default 4
        Multiplier applied to the IQR when computing the fence.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, after the rows were dropped.
    """
    q1 = dataset[feature].quantile(0.25)
    q3 = dataset[feature].quantile(0.75)
    upper_fence = q3 + factor * (q3 - q1)

    # Evaluate the mask once and drop by index label.
    outlier_labels = dataset.index[dataset[feature] > upper_fence]
    dataset.drop(outlier_labels, inplace=True)

    return dataset


In [None]:
# Percentage histogram of customer_industry, one trace each for the OTHER and STAR groups.
fig = make_subplots(rows=1, cols=1, subplot_titles=["customer industry"])

# add_trace(row=, col=) replaces the deprecated Figure.append_trace.
for group_name, bar_color in (("OTHER", "red"), ("STAR", "yellow")):
    group_industries = customer_frame.loc[
        customer_frame["top_customer_group"] == group_name, "customer_industry"
    ]
    fig.add_trace(
        go.Histogram(histnorm="percent", x=group_industries, marker={'color': bar_color},
                     name=group_name, legendgroup="group1", nbinsx=4),
        row=1, col=1,
    )

fig.show()
# customer_frame is rebuilt from the raw file in a later cell, so free it here.
del customer_frame
gc.collect()

In [None]:
# Reload the raw data and keep the customer columns plus order number and invoice date.
data_frame = read_whole_frame()
# drop_duplicates() is not in-place: keep its return value, otherwise the call has no effect.
customer_frame = data_frame[["customer_id","top_customer_group","customer_region",
                             "customer_industry","customer_first_invoice_date","order_num","invoice_date"]].drop_duplicates()

# Release the large raw frame.
del data_frame
gc.collect()


In [None]:
# Parse the invoice date, then derive a "YYYY-MM" month key from the parsed column.
customer_frame["invoice_date"] = pd.to_datetime(
    customer_frame["invoice_date"],
    format='%Y-%m-%d',
)
customer_frame["invoice_ym"] = customer_frame["invoice_date"].dt.strftime('%Y-%m')

In [None]:
# Count the distinct orders of every customer in every month.
counted_orders = (
    customer_frame[["customer_id", "top_customer_group", "order_num", "invoice_ym"]]
    .drop_duplicates()
    .groupby(by=["customer_id", "top_customer_group", "invoice_ym"])
    .count()
    .reset_index()
)

counted_orders


In [None]:
counted_orders[counted_orders["order_num"] ==0 ]

In [None]:
# Median of the per-customer order counts, per customer group and month.
counted_orders = (
    counted_orders[["top_customer_group", "invoice_ym", "order_num"]]
    .groupby(by=["top_customer_group", "invoice_ym"])
    .median()
    .reset_index()
)
counted_orders



In [None]:
# One line per customer group: median order count over the invoice months.
fig = px.line(
    counted_orders,
    x="invoice_ym",
    y="order_num",
    color='top_customer_group',
    title="Median number of orders for customer groups per month",
)
fig.show()

In [None]:
# Same two-step aggregation as for the median plot, but averaging in the second step:
# 1) distinct orders per customer and month, 2) mean of those counts per group and month.
counted_orders = (
    customer_frame[["customer_id", "top_customer_group", "order_num", "invoice_ym"]]
    .drop_duplicates()
    .groupby(by=["customer_id", "top_customer_group", "invoice_ym"])
    .count()
    .reset_index()
)

counted_orders = (
    counted_orders[["top_customer_group", "invoice_ym", "order_num"]]
    .groupby(by=["top_customer_group", "invoice_ym"])
    .mean()
    .reset_index()
)
counted_orders



In [None]:
# One line per customer group: mean order count over the invoice months.
fig = px.line(
    counted_orders,
    x="invoice_ym",
    y="order_num",
    color='top_customer_group',
    title="Mean number of orders for customer groups per month",
)
fig.show()

In [None]:
customer_frame[customer_frame["customer_id"]==-99]

In [None]:
del counted_orders 
del customer_frame 
gc.collect() 

In [None]:
data_frame = read_whole_frame()
data_frame

In [None]:
# "revenue" here is (invoiced price - part cost) * ordered quantity for each line.
data_frame["revenue"] = (
    data_frame["invoiced_price"]
    .sub(data_frame["cost_of_part"])
    .mul(data_frame["ordered_qty"])
)
# Spot-check a single row by its index label.
data_frame.loc[1294957, "revenue"]


In [None]:
# Month key ("YYYY-MM") derived from the parsed invoice date.
data_frame["invoice_date"] = pd.to_datetime(data_frame["invoice_date"], format='%Y-%m-%d')
data_frame["invoice_ym"] = data_frame["invoice_date"].dt.strftime('%Y-%m')

revenue_frame = (
    data_frame[["customer_id", "top_customer_group", "order_num", "revenue", "invoice_ym"]]
    # 1) total revenue of each order
    .groupby(by=["customer_id", "top_customer_group", "order_num", "invoice_ym"], as_index=False)["revenue"]
    .sum()
    # 2) mean order revenue of each customer per month
    .groupby(by=["customer_id", "top_customer_group", "invoice_ym"], as_index=False)["revenue"]
    .mean()
    # 3) median of those customer means per group and month
    .groupby(by=["top_customer_group", "invoice_ym"], as_index=False)["revenue"]
    .median()
)
revenue_frame



In [None]:
del data_frame 
gc.collect() 

In [None]:
# One line per customer group: median customer revenue over the invoice months.
fig = px.line(
    revenue_frame,
    x="invoice_ym",
    y="revenue",
    color='top_customer_group',
    title="Median revenue of customers per month",
)
fig.show()

In [None]:
from datetime import datetime 

data_frame = read_whole_frame()



In [None]:
from datetime import date

# Label every invoice date as "working day" (Monday-Friday) or "weekend".
# Parsing the whole column at once replaces a Python-level strptime call per row
# and no longer depends on `datetime` having been imported in another cell.
invoice_weekday = pd.to_datetime(data_frame["invoice_date"], format='%Y-%m-%d').dt.weekday
# weekday: Monday == 0 ... Sunday == 6. Kept as a plain list, as before.
is_weekday = np.where(invoice_weekday < 5, "working day", "weekend").tolist()


In [None]:
data_frame["working_day"] = is_weekday
data_frame["working_day"].describe()

In [None]:
# Same labelling as for the invoice date, applied to the order date.
# Vectorised parsing replaces the per-row strptime loop.
order_weekday = pd.to_datetime(data_frame["order_date"], format='%Y-%m-%d').dt.weekday
# weekday: Monday == 0 ... Sunday == 6.
is_weekday = np.where(order_weekday < 5, "working day", "weekend").tolist()
data_frame["working_day_ordered"] = is_weekday

In [None]:
# Weekend-invoiced lines with 0 <= gm <= 1 and positive shipped quantity, price and ordered quantity.
data_frame.loc[
    lambda df: df["gm"].ge(0)
    & df["gm"].le(1)
    & df["invoiced_qty_shipped"].gt(0)
    & df["invoiced_price"].gt(0)
    & df["ordered_qty"].gt(0)
    & df["working_day"].eq("weekend")
]

In [None]:
# GM distribution by invoice-date day type, for lines with 0 < gm <= 1
# and positive shipped quantity, price and ordered quantity.
fig = px.box(
    data_frame.loc[
        lambda df: df["gm"].gt(0)
        & df["gm"].le(1)
        & df["invoiced_qty_shipped"].gt(0)
        & df["invoiced_price"].gt(0)
        & df["ordered_qty"].gt(0)
    ],
    x="working_day",
    y="gm",
    width=800,
    height=600,
    title="GM on working days and weekends using invoiced date",
)
fig.show()




In [None]:
# GM distribution by order-date day type, for lines with 0 <= gm <= 1
# and positive shipped quantity, price and ordered quantity.
fig = px.box(
    data_frame.loc[
        lambda df: df["gm"].ge(0)
        & df["gm"].le(1)
        & df["invoiced_qty_shipped"].gt(0)
        & df["invoiced_price"].gt(0)
        & df["ordered_qty"].gt(0)
    ],
    x="working_day_ordered",
    y="gm",
    title="GM on working days and weekends",
    width=800,
    height=600,
)
fig.show()




In [None]:
# GM distribution of five selected product groups, coloured by invoice-date day type.
# Rows are limited to 0 < gm <= 1 with positive shipped quantity, price and ordered quantity.
fig = px.box(
    data_frame.loc[
        lambda df: df["gm"].gt(0)
        & df["gm"].le(1)
        & df["invoiced_qty_shipped"].gt(0)
        & df["invoiced_price"].gt(0)
        & df["ordered_qty"].gt(0)
        & df["product_group"].isin(["PC019", "PC003", "PC025", "PC006", "PC022"])
    ],
    x="product_group",
    y="gm",
    color="working_day",
    width=1000,
    height=600,
    title="Product groups gm on weekends",
)
fig.show()




In [None]:
# Number of distinct ERP price-modification dates per item; show items with more than one.
grouped_df = (
    data_frame.groupby(by="item_code")["price_last_modified_date_in_the_erp"]
    .nunique()
    .reset_index()
)
grouped_df[grouped_df["price_last_modified_date_in_the_erp"] > 1]

In [None]:
# Side-by-side box plots of ordered quantity and invoiced price per customer group.
fig = make_subplots(rows=1, cols=2, subplot_titles=["ordered quantity", "invoiced price"])

# Keep lines with 0 < gm <= 1 and positive shipped quantity, price and ordered quantity;
# only the three plotted columns are retained.
valid_lines = data_frame.loc[
    (data_frame["gm"] > 0) & (data_frame["gm"] <= 1)
    & (data_frame["invoiced_qty_shipped"] > 0)
    & (data_frame["invoiced_price"] > 0)
    & (data_frame["ordered_qty"] > 0),
    ["top_customer_group", "ordered_qty", "invoiced_price"],
]

# px.box() returns a complete Figure, which add_trace() rejects; a subplot cell
# needs a graph_objects trace, so go.Box is used. The dangling `facet_row=`
# argument (a syntax error) is gone.
fig.add_trace(
    go.Box(x=valid_lines["top_customer_group"], y=valid_lines["ordered_qty"], name="ordered quantity"),
    row=1, col=1,
)
fig.add_trace(
    go.Box(x=valid_lines["top_customer_group"], y=valid_lines["invoiced_price"], name="invoiced price"),
    row=1, col=2,
)

fig.update_layout(autosize=False, width=750, height=750)
fig.show()

In [None]:
data_frame = read_whole_frame()
data_frame = remove_outliers("ordered_qty",data_frame.copy())


In [None]:
import gc 
gc.collect()

In [None]:

# Ordered quantity per customer group, for lines with 0 < gm <= 1
# and positive shipped quantity, price and ordered quantity.
fig = px.box(
    data_frame.loc[
        lambda df: df["gm"].gt(0)
        & df["gm"].le(1)
        & df["invoiced_qty_shipped"].gt(0)
        & df["invoiced_price"].gt(0)
        & df["ordered_qty"].gt(0)
    ],
    x="top_customer_group",
    y="ordered_qty",
    height=500,
    width=500,
)
fig.show()

In [None]:
gc.collect()
data_frame = read_whole_frame() 
data_frame[["invoiced_price","ordered_qty"]].quantile(q=0.99)

In [None]:
# Log-scaled box plots of ordered quantity and invoiced price per customer group.
fig = make_subplots(rows=1, cols=2)

gc.collect()
# A single load is enough: nothing in this cell modifies the frame, so reading
# the CSV again before the second trace only cost time and memory.
data_frame = read_whole_frame()

# Lines with 0 < gm < 1 and positive shipped quantity, price and ordered quantity.
# The mask is evaluated once and shared by the x and y selections.
valid_mask = (
    (data_frame["gm"] > 0) & (data_frame["gm"] < 1)
    & (data_frame["invoiced_qty_shipped"] > 0)
    & (data_frame["invoiced_price"] > 0)
    & (data_frame["ordered_qty"] > 0)
)
fig.add_trace(
    go.Box(x=data_frame.loc[valid_mask, "top_customer_group"],
           y=data_frame.loc[valid_mask, "ordered_qty"],
           orientation='v', name="ordered quantity"),
    row=1, col=1,
)

# The price panel additionally leaves out invoiced prices of 2000 and above.
price_mask = valid_mask & (data_frame["invoiced_price"] < 2000)
fig.add_trace(
    go.Box(x=data_frame.loc[price_mask, "top_customer_group"],
           y=data_frame.loc[price_mask, "invoiced_price"],
           orientation='v', name="invoiced price"),
    row=1, col=2,
)

fig.update_yaxes(title_text="ordered quantity", row=1, col=1)
fig.update_yaxes(title_text="invoiced price", row=1, col=2)
# No row/col given: the log scale applies to both y axes.
fig.update_yaxes(type="log")
fig.show()
gc.collect()

In [None]:
from datetime import datetime 



data_frame = read_whole_frame() 



In [None]:
#print(data_frame["invoice_date"].max() ) 
#print(data_frame["invoice_date"].min())

# Newest and oldest invoice date in the data, parsed from "YYYY-MM-DD" strings.
date_invoice_max = datetime.strptime(data_frame["invoice_date"].max(),"%Y-%m-%d")
date_invoice_min = datetime.strptime(data_frame["invoice_date"].min(),"%Y-%m-%d")

# One row per distinct (customer_id, customer_first_invoice_date) pair.
customer_oldness = data_frame[["customer_id","customer_first_invoice_date"]].groupby(by=["customer_id","customer_first_invoice_date"]).min().reset_index()

# Parse the whole column at once instead of one strptime call per element.
customer_oldness["customer_first_invoice_date"] = pd.to_datetime(
    customer_oldness["customer_first_invoice_date"], format="%Y-%m-%d %H:%M:%S"
)
# Customer age in years at the newest invoice date: days / 365, rounded to the nearest integer.
customer_oldness["customer_age"] = (
    (date_invoice_max - customer_oldness["customer_first_invoice_date"]).dt.days / 365
).round().astype("int64")
customer_oldness.describe()





In [None]:
customer_oldness[customer_oldness["customer_age"] <=3] 

In [None]:
customer_oldness.describe()

In [None]:
# Bucket customers by age in years. "0" is the fallback label for rows matching no bucket.
customer_oldness["customer_age_group"] = "0"
# Use a single .loc[mask, column] assignment per bucket. The chained form
# frame[column][mask] = value writes through a temporary object: it raises
# SettingWithCopyWarning and does nothing at all under pandas Copy-on-Write.
customer_oldness.loc[customer_oldness["customer_age"] < 3, "customer_age_group"] = "0-2"
customer_oldness.loc[
    (customer_oldness["customer_age"] >= 3) & (customer_oldness["customer_age"] < 11),
    "customer_age_group",
] = "3-10"
customer_oldness.loc[customer_oldness["customer_age"] >= 11, "customer_age_group"] = "11-27"
customer_oldness

In [None]:
customer_oldness[customer_oldness["customer_age_group"] == '11-27']

In [None]:
# Copy each customer's age and age group onto every line of that customer.
# A keyed lookup replaces the loop that scanned the whole frame once per customer and
# wrote through chained indexing (frame[column][rows] = value), which is unreliable.
# When a customer_id occurs more than once in customer_oldness the first row is used.
customer_lookup = (
    customer_oldness
    .drop_duplicates(subset="customer_id", keep="first")
    .set_index("customer_id")
)

# Lines whose customer_id has no entry in the lookup get the defaults 0 and "0".
data_frame["customer_age"] = (
    data_frame["customer_id"].map(customer_lookup["customer_age"]).fillna(0).astype("int64")
)
data_frame["customer_age_group"] = (
    data_frame["customer_id"].map(customer_lookup["customer_age_group"]).fillna("0")
)


In [None]:
data_frame[["customer_age","customer_age_group"]]

In [None]:
data_frame[["gm","customer_age","customer_age_group"]]

In [None]:
#data_frame[]

# Invoiced price per customer age group (log scale), for lines with 0 < gm < 1
# and positive shipped quantity, price and ordered quantity.
fig = px.box(data_frame[(data_frame["gm"]>0)&(data_frame["gm"] <1)
                       &(data_frame["invoiced_qty_shipped"]>0) &
                       (data_frame["invoiced_price"]>0)
                       &(data_frame["ordered_qty"]>0)
                       ],x="customer_age_group",y="invoiced_price",width=1000,height=600
                     # The title used to say "ordered quantity" although y is invoiced_price.
                     ,title="invoiced price for customer age groups")

fig.update_yaxes(type="log")
fig.show()




In [None]:
# Customer/order columns with exact duplicate rows removed.
# drop_duplicates() returns a new frame; assigning it keeps the de-duplicated
# data instead of only displaying it.
customer_frame = data_frame[["customer_id",
                             "customer_industry","order_num","customer_age","invoice_date","customer_age_group"]].drop_duplicates()
customer_frame



In [None]:
# Parse the invoice date and add the "YYYY-MM" month key.
customer_frame["invoice_date"] = pd.to_datetime(
    customer_frame["invoice_date"],
    format='%Y-%m-%d',
)
customer_frame["invoice_ym"] = customer_frame["invoice_date"].dt.strftime('%Y-%m')
customer_frame

In [None]:
# Count the distinct orders of every customer per month, keeping the age group as a key.
counted_orders = (
    customer_frame[["customer_id", "customer_age", "customer_age_group", "order_num", "invoice_ym"]]
    .drop_duplicates()
    .groupby(by=["customer_id", "customer_age_group", "invoice_ym"])
    .count()
    .reset_index()
)

counted_orders


In [None]:
counted_orders

In [None]:
# Mean of the per-customer order counts, per age group and month.
counted_orders = (
    counted_orders[["customer_age_group", "invoice_ym", "order_num"]]
    .groupby(by=["customer_age_group", "invoice_ym"])
    .mean()
    .reset_index()
)
counted_orders



In [None]:
# One line per customer age group: mean order count over the invoice months.
fig = px.line(
    counted_orders,
    x="invoice_ym",
    y="order_num",
    color='customer_age_group',
    title="Mean number of orders in age groups over time",
)
fig.show()

In [None]:
# Reload the raw data, then keep only lines with 0 < gm <= 1 and
# positive shipped quantity, invoiced price and ordered quantity.
data_frame = read_whole_frame()

data_frame = data_frame.loc[
    lambda df: df["gm"].gt(0)
    & df["gm"].le(1)
    & df["invoiced_qty_shipped"].gt(0)
    & df["invoiced_price"].gt(0)
    & df["ordered_qty"].gt(0)
]





In [None]:
data_frame

In [None]:
# Replace zero part costs with the most recent known non-zero cost of the same item.
# NOTE(review): invoice_date is compared with <= on the raw column; this is chronological
# only if the values are ISO "YYYY-MM-DD" strings (or real datetimes) - confirm.
problem_indices = data_frame.index[data_frame["cost_of_part"] == 0].tolist()

for index in problem_indices:
    problem_value = data_frame.loc[index, :]
    # Non-zero-cost lines of the same item invoiced on or before this line's date.
    # Rows already filled by earlier iterations count as known costs too.
    last_value_cost_of_part = data_frame[(data_frame["item_code"] == problem_value["item_code"]) &
                                         (data_frame["invoice_date"] <= problem_value["invoice_date"]) &
                                         (data_frame["cost_of_part"] != 0)]

    if len(last_value_cost_of_part) > 0:
        max_invoice_date = last_value_cost_of_part["invoice_date"].max()

        # Average only cost_of_part over the lines of the latest date. Averaging the
        # whole frame per date (groupby(...).mean()) also touches the text columns,
        # which raises TypeError in pandas >= 2.0, and computes means for dates
        # that are never used.
        latest_costs = last_value_cost_of_part.loc[
            last_value_cost_of_part["invoice_date"] == max_invoice_date, "cost_of_part"
        ]
        data_frame.loc[index, "cost_of_part"] = latest_costs.mean()

# Lines that are still at zero cost, i.e. no earlier non-zero cost was found for the item.
data_frame[data_frame["cost_of_part"] == 0]

In [None]:
data_frame["gm"] = (data_frame["invoiced_price"]-data_frame["cost_of_part"])/data_frame["invoiced_price"]

In [None]:
data_frame.loc[[0]]

In [None]:
from datetime import date

# Month and year of every invoice date.
# The column is parsed once instead of calling strptime per row in a Python loop;
# this also removes the dependency on `datetime` imported in another cell.
# The invoice_date column itself is left untouched.
parsed_invoice_dates = pd.to_datetime(data_frame["invoice_date"], format='%Y-%m-%d')
month_no = parsed_invoice_dates.dt.month.tolist()
year_no = parsed_invoice_dates.dt.year.tolist()

data_frame["month_no"] = month_no
data_frame["year_no"] = year_no

data_frame

In [None]:
# Distinct (month, year, order) combinations, grouped by calendar month and year.
# drop_duplicates() returns a new frame, so it is chained into the assignment;
# as a bare statement its result was thrown away.
df_by_month = data_frame[["month_no","year_no","order_num"]].drop_duplicates()

df_by_month = df_by_month.groupby(by=["month_no","year_no"])

In [None]:
# Number of distinct orders in every (month, year) group.
df_by_month = df_by_month["order_num"].nunique().reset_index()
df_by_month.loc[:, ["month_no", "year_no"]]

In [None]:

import seaborn as sns

# Average the per-(month, year) order counts over the years, per calendar month.
df_by_month = df_by_month.groupby(by="month_no").mean().reset_index()

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(df_by_month["month_no"], df_by_month["order_num"])
ax.set_title("Mean number of orders by month", fontsize=15)
ax.set_xlabel("month in year", fontsize=15)
ax.set_ylabel("Number of orders", fontsize=15)
# plt.show() rather than fig.show(): Figure.show() emits a warning on non-GUI
# backends such as the inline notebook backend.
plt.show()

In [None]:
# Mean gm of the invoice lines in every (month, year) group.
df_by_month = (
    data_frame[["month_no", "year_no", "gm"]]
    .groupby(by=["month_no", "year_no"])
    .mean()
    .reset_index()
)
df_by_month

In [None]:
# Mean ordered quantity per (month, year, product family); show the rows of family PF002.
df_by_month = (
    data_frame[["month_no", "year_no", "product_family", "ordered_qty"]]
    .groupby(by=["month_no", "year_no", "product_family"])
    .mean()
    .reset_index()
)
df_by_month.loc[df_by_month["product_family"] == "PF002"]

In [None]:
# "revenue" here is (invoiced price - part cost) * ordered quantity for each line.
data_frame["revenue"] = (
    data_frame["invoiced_price"]
    .sub(data_frame["cost_of_part"])
    .mul(data_frame["ordered_qty"])
)

data_frame["revenue"]

In [None]:
# Mean line revenue in every (month, year) group.
df_by_month = (
    data_frame[["month_no", "year_no", "revenue"]]
    .groupby(by=["month_no", "year_no"])
    .mean()
    .reset_index()
)
df_by_month

In [None]:

# Average the per-(month, year) mean revenue over the years, per calendar month.
df_by_month = df_by_month.groupby(by="month_no").mean().reset_index()

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(df_by_month["month_no"], df_by_month["revenue"])
ax.set_title("Mean revenue by month", fontsize=15)
ax.set_xlabel("month in year", fontsize=15)
ax.set_ylabel("Revenue", fontsize=15)
# plt.show() rather than fig.show(): Figure.show() emits a warning on non-GUI
# backends such as the inline notebook backend.
plt.show()

In [None]:
# Parse the invoice date and add the "YYYY-MM" month key.
data_frame["invoice_date"] = pd.to_datetime(
    data_frame["invoice_date"],
    format='%Y-%m-%d',
)
data_frame["invoice_ym"] = data_frame["invoice_date"].dt.strftime('%Y-%m')

In [None]:
# Mean line revenue for every "YYYY-MM" invoice month.
df_by_month = (
    data_frame[["invoice_ym", "revenue"]]
    .groupby(by=["invoice_ym"])
    .mean()
    .reset_index()
)
df_by_month

In [None]:
# Bar chart of the mean revenue for every "YYYY-MM" invoice month.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(df_by_month["invoice_ym"], df_by_month["revenue"])
ax.set_title("Mean revenue by month", fontsize=15)
ax.set_xlabel("month in year", fontsize=15)
ax.set_ylabel("Revenue", fontsize=15)
# plt.show() rather than fig.show(): Figure.show() emits a warning on non-GUI
# backends such as the inline notebook backend.
plt.show()