In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Display options for this notebook.
# Fully qualified option keys are used throughout: an abbreviated key such as
# "max_rows" is treated as a pattern and raises OptionError when it matches
# more than one registered option.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Read the cleaned file.
# 'Matl Group' holds codes with leading zeros (e.g. '0080') and is compared
# against string values later in this notebook, so it is read as text.
# low_memory=False makes pandas infer every other column's dtype from the whole
# file instead of chunk by chunk, which avoids columns with mixed types.
df = pd.read_csv(
    'preprocessing/SA_clean_withClusterResults.csv',
    dtype={'Matl Group': str},
    low_memory=False,
)
df.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(61870, 47)

In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['Purch.Doc.', 'Item', 'Changed On', 'Short Text', 'Material', 'CoCd',
       'Plnt', 'SLoc', 'Matl Group', 'PO Quantity', 'OUn', 'OPU', 'Eq. To',
       'Net Price', 'Per', 'Net Value', 'Gross value', 'Overdel. Tol.',
       'Underdel.Tol.', 'Item.1', 'BUn', 'Non-deductible', 'Price Date',
       'Effective value', 'PTm', 'Net Weight', 'Un', 'Profit Ctr',
       'Gross Weight', 'Volume', 'RShLi', 'MTyp', 'NCM Code', 'Reb.basis',
       'Priority (Material Required Within)', 'Ordered By', 'Approved By',
       'Priority', 'Section', 'Indenter ID', 'Input Tax Credit', 'Order_yr',
       'Order_mnth', 'NumTimesBought', 'SumGrossValue', 'AvgPerUnitCost',
       'Cluster_km'],
      dtype='object')

# Get the cluster names
'1 High Freq High Value','2 Medium Freq Medium Value'

In [5]:
# Row count per 'Cluster_km' value; dropna=False keeps rows with a missing
# cluster id in the tally.
df['Cluster_km'].value_counts(dropna=False)

1.00    21692
0.00    21584
2.00    14829
3.00     1920
NaN      1845
Name: Cluster_km, dtype: int64

In [6]:
# Translate each numeric cluster id into a ranked, human-readable label.
col = 'Cluster_km'
cluster_labels = {
    0: '4 Very Low Freq Very Low Value',
    1: '2 Medium Freq Medium Value',
    2: '1 High Freq High Value',
    3: '3 Low Freq Low Value',
}
conditions = [df[col] == cluster_id for cluster_id in cluster_labels]
choices = list(cluster_labels.values())
# Rows that match none of the ids (e.g. NaN) receive the default label.
df['Cluster_names'] = np.select(conditions, choices, default='5 Not Clustered')
df['Cluster_names'].value_counts(dropna=False)

2 Medium Freq Medium Value        21692
4 Very Low Freq Very Low Value    21584
1 High Freq High Value            14829
3 Low Freq Low Value               1920
5 Not Clustered                    1845
Name: Cluster_names, dtype: int64

In [7]:
# Distinct 'Short Text' values among rows labelled with the two top-ranked clusters.
top_cluster_names = ['1 High Freq High Value', '2 Medium Freq Medium Value']
in_top_clusters = df['Cluster_names'].isin(top_cluster_names)
mat_in_2_Cluster = df.loc[in_top_clusters, 'Short Text'].unique()
mat_in_2_Cluster

array(['IB Ross Broiler Finisher Feed',
       'IB Ross Broiler Pre-Starter Feed', 'IB Ross Broiler Starter Feed',
       'Soya Bean - (A)', 'Soya Bean - (MP)', 'Soya Bean',
       'Rice Bran Raw', 'Rice Bran Boiled', 'Maize'], dtype=object)

In [8]:
# Explore the Insights from Tableau
# # Purchase price variance

In [9]:
# Net Price distribution for 'IB Ross Broiler Finisher Feed' in January 2019.
is_finisher_jan19 = (
    (df['Short Text'] == 'IB Ross Broiler Finisher Feed')
    & (df['Order_yr'] == 2019)
    & (df['Order_mnth'] == 1)
)
df.loc[is_finisher_jan19, 'Net Price'].describe()

count   2751.00
mean      27.68
std        1.60
min       24.26
25%       26.26
50%       27.78
75%       28.90
max       31.64
Name: Net Price, dtype: float64

In [10]:
# Net Price distribution for 'IB Ross Broiler Finisher Feed' in February 2019.
is_finisher_feb19 = (
    (df['Short Text'] == 'IB Ross Broiler Finisher Feed')
    & (df['Order_yr'] == 2019)
    & (df['Order_mnth'] == 2)
)
df.loc[is_finisher_feb19, 'Net Price'].describe()

count   2480.00
mean      29.64
std        1.38
min       26.30
25%       28.94
50%       29.76
75%       30.64
max       32.46
Name: Net Price, dtype: float64

In [11]:
# January 2019 rows for 'IB Ross Broiler Finisher Feed', kept in df_pp for reuse.
finisher_jan19_mask = (
    (df['Short Text'] == 'IB Ross Broiler Finisher Feed')
    & (df['Order_yr'] == 2019)
    & (df['Order_mnth'] == 1)
)
df_pp = df.loc[finisher_jan19_mask]
# Mean Net Price per 'Plnt' value, lowest first.
(
    df_pp.groupby('Plnt')['Net Price']
    .mean()
    .sort_values(ascending=True)
    .reset_index(name='Mean Net Price')
)

Unnamed: 0,Plnt,Mean Net Price
0,PS22,25.62
1,PS27,26.42
2,PS36,26.46
3,PS23,26.67
4,4513,26.78
5,PS21,26.83
6,PS37,26.91
7,PS24,27.27
8,4512,27.37
9,4511,27.89


In [12]:
# Mean Net Price per 'CoCd' value, lowest first.
(
    df_pp.groupby('CoCd')['Net Price']
    .mean()
    .sort_values(ascending=True)
    .reset_index(name='Mean Net Price')
)

Unnamed: 0,CoCd,Mean Net Price
0,4500,27.26
1,7860,27.69


In [13]:
# For patterns, create date features 

In [14]:
# Parse 'Changed On' as a datetime, then derive the calendar features from it.
df['Changed On'] = pd.to_datetime(df['Changed On'], format='%Y-%m-%d')
df['Order_yr'] = df['Changed On'].dt.year
df['Order_mnth'] = df['Changed On'].dt.month
# day_name is a method and has to be called; without the parentheses the column
# is assigned the method object itself instead of the weekday names.
df['Order_wd'] = df['Changed On'].dt.day_name()
df['Order_dow'] = df['Changed On'].dt.dayofweek
df['Order_day'] = df['Changed On'].dt.day
df['Week_Number'] = df['Changed On'].dt.isocalendar().week

In [15]:
# Explore Jan 2019

In [16]:
# Subset of rows whose order year is 2019 and order month is 1.
is_jan2019 = (df['Order_yr'] == 2019) & (df['Order_mnth'] == 1)
df_jan2019 = df[is_jan2019]
df_jan2019.shape

(21222, 52)

In [17]:
# Ten 'Short Text' values with the largest summed Gross value in df_jan2019.
(
    df_jan2019.groupby('Short Text')['Gross value']
    .sum()
    .sort_values(ascending=False)
    .reset_index(name='Gross value')
    .head(10)
)

Unnamed: 0,Short Text,Gross value
0,IB Ross Broiler Finisher Feed,822027774.7
1,IB Ross Broiler Starter Feed,602549009.0
2,B4 IB Ross Feed,599920000.0
3,B1 IB Ross Feed,541907914.05
4,Maize,534026736.84
5,Soya Bean - (MP),533938199.66
6,PL-3,464724456.25
7,B2 IB Ross Feed,444014416.8
8,Feed P1,424874840.15
9,PL-4,393260137.0


In [18]:
# Total Gross value per week number for 'IB Ross Broiler Finisher Feed'.
finisher_jan = df_jan2019.loc[df_jan2019['Short Text'] == 'IB Ross Broiler Finisher Feed']
finisher_jan.groupby('Week_Number')['Gross value'].sum()

Week_Number
1   137848522.00
2   194785923.20
3   156349157.50
4   175139338.50
5   157904833.50
Name: Gross value, dtype: float64

In [19]:
# Average Net Price per week number for 'IB Ross Broiler Finisher Feed'.
finisher_jan = df_jan2019.loc[df_jan2019['Short Text'] == 'IB Ross Broiler Finisher Feed']
finisher_jan.groupby('Week_Number')['Net Price'].mean()

Week_Number
1   27.04
2   26.82
3   27.28
4   28.55
5   28.80
Name: Net Price, dtype: float64

In [20]:
# Average Net Price per day of the month for 'IB Ross Broiler Finisher Feed'.
finisher_jan = df_jan2019.loc[df_jan2019['Short Text'] == 'IB Ross Broiler Finisher Feed']
finisher_jan.groupby('Order_day')['Net Price'].mean()

Order_day
1    27.11
2    28.61
3    28.11
4    26.32
5    26.77
7    27.86
8    26.33
9    28.20
10   27.99
11   26.73
12   27.52
14   26.98
15   27.31
16   28.60
17   27.37
19   27.89
21   29.16
22   27.57
23   28.29
24   28.63
25   28.87
28   28.69
29   29.71
30   28.05
31   29.41
Name: Net Price, dtype: float64

In [21]:
# Number of rows per day of the month for 'IB Ross Broiler Finisher Feed'.
finisher_jan = df_jan2019.loc[df_jan2019['Short Text'] == 'IB Ross Broiler Finisher Feed']
finisher_jan.groupby('Order_day').size()

Order_day
1     114
2      33
3      92
4     218
5       1
7      16
8     331
9      58
10     71
11    210
12      3
14    246
15     79
16     43
17    123
19      2
21    110
22     92
23    119
24    131
25    135
28    141
29     92
30    179
31    112
dtype: int64

In [22]:
# Per-day summary of the 'IB Ross Broiler Finisher Feed' rows in df_jan2019.
finisher_rows = df_jan2019.loc[df_jan2019['Short Text'] == 'IB Ross Broiler Finisher Feed']
daily_aggregations = {
    # Count of non-null 'Gross value' entries per day
    'NumOrders': ('Gross value', 'count'),
    'TotalGrossValue': ('Gross value', 'sum'),
    # Mean / lowest / highest Net Price seen on the day
    'AvgMeanPrice': ('Net Price', 'mean'),
    'MinMeanPrice': ('Net Price', 'min'),
    'MaxMeanPrice': ('Net Price', 'max'),
}
df_jan_sum = finisher_rows.groupby('Order_day', as_index=False).agg(**daily_aggregations)

df_jan_sum

Unnamed: 0,Order_day,NumOrders,TotalGrossValue,AvgMeanPrice,MinMeanPrice,MaxMeanPrice
0,1,114,37098173.0,27.11,24.26,29.18
1,2,33,12485340.5,28.61,27.4,29.21
2,3,92,32505769.0,28.11,26.81,29.56
3,4,218,55454061.5,26.32,24.26,29.3
4,5,1,305178.0,26.77,26.77,26.77
5,7,16,4469320.0,27.86,27.61,28.12
6,8,331,93079814.0,26.33,24.26,29.3
7,9,58,15451511.5,28.2,26.72,29.24
8,10,71,23478320.2,27.99,26.78,29.3
9,11,210,57487057.5,26.73,24.72,29.34


# Variations in price in a Day 

In [23]:
# Days ordered from the largest to the smallest total Gross value.
df_jan_sum.sort_values(by='TotalGrossValue', ascending=False)

Unnamed: 0,Order_day,NumOrders,TotalGrossValue,AvgMeanPrice,MinMeanPrice,MaxMeanPrice
6,8,331,93079814.0,26.33,24.26,29.3
11,14,246,72817069.0,26.98,24.72,29.34
9,11,210,57487057.5,26.73,24.72,29.34
3,4,218,55454061.5,26.32,24.26,29.3
23,30,179,52983940.0,28.05,26.3,31.28
19,24,131,43684518.0,28.63,26.3,30.43
21,28,141,41584305.0,28.69,26.3,31.64
14,17,123,40293020.5,27.37,24.72,29.9
20,25,135,37651845.5,28.87,26.3,31.64
0,1,114,37098173.0,27.11,24.26,29.18


In [24]:
# Days ordered from the lowest to the highest average Net Price.
df_jan_sum.sort_values(by='AvgMeanPrice', ascending=True)

Unnamed: 0,Order_day,NumOrders,TotalGrossValue,AvgMeanPrice,MinMeanPrice,MaxMeanPrice
3,4,218,55454061.5,26.32,24.26,29.3
6,8,331,93079814.0,26.33,24.26,29.3
9,11,210,57487057.5,26.73,24.72,29.34
4,5,1,305178.0,26.77,26.77,26.77
11,14,246,72817069.0,26.98,24.72,29.34
0,1,114,37098173.0,27.11,24.26,29.18
12,15,79,25041375.0,27.31,25.72,29.34
14,17,123,40293020.5,27.37,24.72,29.9
10,12,3,819900.0,27.52,26.77,27.89
17,22,92,28727808.0,27.57,24.72,30.28


# Write function for above

In [25]:
def func_calc_monSumm(df,yr,mon,mat):
    """Build a per-day purchase summary for one material in one month.

    Rows are kept when 'Order_yr' equals ``yr``, 'Order_mnth' equals ``mon``
    and 'Short Text' equals ``mat``; they are then grouped by 'Order_day'.

    Parameters
    ----------
    df : DataFrame with the columns 'Order_yr', 'Order_mnth', 'Order_day',
        'Short Text', 'Gross value', 'PO Quantity', 'Purch.Doc.' and
        'Net Price'.
    yr : value matched against 'Order_yr'.
    mon : value matched against 'Order_mnth'.
    mat : value matched against 'Short Text'.

    Returns
    -------
    DataFrame with one row per 'Order_day' and the columns
    TotalGrossValue, ToalQuantity (spelling as in the original output),
    NumPORaised, AvgNetPrice, MinNetPrice and MaxNetPrice.
    """
    in_month = (df['Order_yr'] == yr) & (df['Order_mnth'] == mon)
    month_rows = df[in_month]
    material_rows = month_rows[month_rows['Short Text'] == mat]

    daily_aggregations = {
        # Summed gross value for the day
        'TotalGrossValue': ('Gross value', 'sum'),
        # Summed ordered quantity for the day
        'ToalQuantity': ('PO Quantity', 'sum'),
        # Number of distinct purchase documents for the day
        'NumPORaised': ('Purch.Doc.', 'nunique'),
        # Mean, lowest and highest net price for the day
        'AvgNetPrice': ('Net Price', 'mean'),
        'MinNetPrice': ('Net Price', 'min'),
        'MaxNetPrice': ('Net Price', 'max'),
    }
    daily_summary = material_rows.groupby('Order_day', as_index=False).agg(**daily_aggregations)
    return daily_summary

In [26]:
# Daily summary for 'IB Ross Broiler Finisher Feed', year 2019, month 1.
dsum = func_calc_monSumm(df,2019,1,'IB Ross Broiler Finisher Feed')
dsum

Unnamed: 0,Order_day,TotalGrossValue,ToalQuantity,NumPORaised,AvgNetPrice,MinNetPrice,MaxNetPrice
0,1,37098173.0,1353650.0,114,27.11,24.26,29.18
1,2,12485340.5,436300.0,33,28.61,27.4,29.21
2,3,32505769.0,1157150.0,92,28.11,26.81,29.56
3,4,55454061.5,2080900.0,218,26.32,24.26,29.3
4,5,305178.0,11400.0,1,26.77,26.77,26.77
5,7,4469320.0,160500.0,16,27.86,27.61,28.12
6,8,93079814.0,3515850.0,331,26.33,24.26,29.3
7,9,15451511.5,547425.0,57,28.2,26.72,29.24
8,10,23478320.2,839740.0,71,27.99,26.78,29.3
9,11,57487057.5,2139700.0,209,26.73,24.72,29.34


In [27]:
# Daily summary for 'Maize', year 2019, month 1 (overwrites the previous dsum).
dsum = func_calc_monSumm(df,2019,1,'Maize')
dsum

Unnamed: 0,Order_day,TotalGrossValue,ToalQuantity,NumPORaised,AvgNetPrice,MinNetPrice,MaxNetPrice
0,1,31207532.09,1780013.8,86,17.43,16.5,17.91
1,2,22073538.01,1261707.6,64,17.36,16.0,17.7
2,3,20706759.41,1182936.81,64,17.32,15.9,17.7
3,4,30638836.58,1743623.7,85,17.43,15.4,17.7
4,5,21025272.67,1194601.13,51,17.53,16.5,17.7
5,6,18619989.71,1057323.72,42,17.61,17.45,17.7
6,7,53609113.71,3060135.28,154,17.44,16.5,17.7
7,8,45500777.75,2595036.51,116,17.47,16.5,17.7
8,9,36938522.12,2107436.1,100,17.48,16.05,17.65
9,10,35693199.27,2030241.82,91,17.53,17.0,18.0


In [28]:
# Daily summary for 'Soya Bean - (MP)', year 2019, month 1 (overwrites the previous dsum).
dsum = func_calc_monSumm(df,2019,1,'Soya Bean - (MP)')
dsum

Unnamed: 0,Order_day,TotalGrossValue,ToalQuantity,NumPORaised,AvgNetPrice,MinNetPrice,MaxNetPrice
0,1,33044740.9,954.96,74,34602.73,34500.0,35000.0
1,2,16420088.42,476.65,39,34511.79,30500.0,35500.0
2,3,23526833.29,675.36,54,34835.56,34600.0,35010.0
3,4,18456111.12,526.39,42,35045.71,34600.0,35750.0
4,5,15659390.06,444.78,38,35168.94,34600.0,36000.0
5,6,38210478.1,1080.5,88,35362.95,34600.0,36000.0
6,7,38481528.86,1085.39,85,35450.94,34450.0,36000.0
7,8,30009417.94,845.83,64,35480.31,35000.0,36000.0
8,9,23154870.88,652.91,48,35467.92,34450.0,36000.0
9,10,16149115.1,453.34,38,35589.47,35350.0,36500.0


# Get Material Group and check
1500

In [29]:
# List the column labels of the January 2019 subset.
df_jan2019.columns

Index(['Purch.Doc.', 'Item', 'Changed On', 'Short Text', 'Material', 'CoCd',
       'Plnt', 'SLoc', 'Matl Group', 'PO Quantity', 'OUn', 'OPU', 'Eq. To',
       'Net Price', 'Per', 'Net Value', 'Gross value', 'Overdel. Tol.',
       'Underdel.Tol.', 'Item.1', 'BUn', 'Non-deductible', 'Price Date',
       'Effective value', 'PTm', 'Net Weight', 'Un', 'Profit Ctr',
       'Gross Weight', 'Volume', 'RShLi', 'MTyp', 'NCM Code', 'Reb.basis',
       'Priority (Material Required Within)', 'Ordered By', 'Approved By',
       'Priority', 'Section', 'Indenter ID', 'Input Tax Credit', 'Order_yr',
       'Order_mnth', 'NumTimesBought', 'SumGrossValue', 'AvgPerUnitCost',
       'Cluster_km', 'Cluster_names', 'Order_wd', 'Order_dow', 'Order_day',
       'Week_Number'],
      dtype='object')

In [30]:
# Number of distinct 'Short Text' values in the January 2019 subset.
df_jan2019['Short Text'].nunique()

2887

In [31]:
# Number of distinct 'Matl Group' values in the January 2019 subset.
df_jan2019['Matl Group'].nunique()

132

In [32]:
# Row count per 'Matl Group' value, most frequent first.
df_jan2019['Matl Group'].value_counts()

1500    7580
2003    2720
2353    2559
2211    2128
2006     875
        ... 
0080       1
1606       1
1608       1
2402       1
2703       1
Name: Matl Group, Length: 132, dtype: int64

In [33]:
def func_calc_monSumm2(df,yr,mon,colToFilter,colValue):
    """Build a per-day purchase summary for rows matching an arbitrary column value.

    Rows are kept when 'Order_yr' equals ``yr``, 'Order_mnth' equals ``mon``
    and the column named ``colToFilter`` equals ``colValue``; they are then
    grouped by 'Order_day'.

    Parameters
    ----------
    df : DataFrame with the columns 'Order_yr', 'Order_mnth', 'Order_day',
        'Gross value', 'PO Quantity', 'Purch.Doc.', 'Net Price' and the
        column named by ``colToFilter``.
    yr : value matched against 'Order_yr'.
    mon : value matched against 'Order_mnth'.
    colToFilter : name of the column to filter on.
    colValue : value matched against ``df[colToFilter]``.

    Returns
    -------
    DataFrame with one row per 'Order_day' and the columns
    TotalGrossValue, ToalQuantity (spelling as in the original output),
    NumPORaised, AvgNetPrice, MinNetPrice and MaxNetPrice.
    """
    in_month = (df['Order_yr'] == yr) & (df['Order_mnth'] == mon)
    month_rows = df[in_month]
    selected_rows = month_rows[month_rows[colToFilter] == colValue]

    daily_aggregations = {
        # Summed gross value for the day
        'TotalGrossValue': ('Gross value', 'sum'),
        # Summed ordered quantity for the day
        'ToalQuantity': ('PO Quantity', 'sum'),
        # Number of distinct purchase documents for the day
        'NumPORaised': ('Purch.Doc.', 'nunique'),
        # Mean, lowest and highest net price for the day
        'AvgNetPrice': ('Net Price', 'mean'),
        'MinNetPrice': ('Net Price', 'min'),
        'MaxNetPrice': ('Net Price', 'max'),
    }
    daily_summary = selected_rows.groupby('Order_day', as_index=False).agg(**daily_aggregations)
    return daily_summary

In [34]:
# Daily summary for rows whose 'Matl Group' equals the string '1500', year 2019, month 1.
dsum = func_calc_monSumm2(df,2019,1,'Matl Group','1500')
dsum

Unnamed: 0,Order_day,TotalGrossValue,ToalQuantity,NumPORaised,AvgNetPrice,MinNetPrice,MaxNetPrice
0,1,80508412.0,2844400.0,162,28.52,24.26,31.95
1,2,26642189.1,918770.0,49,28.66,1.5,31.48
2,3,65331599.75,2283045.1,116,163.36,1.5,30250.0
3,4,112417816.5,4092767.92,277,83.11,1.5,31200.0
4,5,2371666.0,93760.0,8,20.83,1.5,28.32
5,6,47020.5,15860.0,2,2.75,2.4,3.1
6,7,14183494.5,491116.84,29,584.05,0.6,31200.0
7,8,194800865.35,7151850.0,445,27.61,1.5,31.6
8,9,40383666.4,1401255.0,77,28.82,1.5,31.5
9,10,54464410.7,1910040.0,110,28.47,0.6,31.6
