In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [8]:
df = pd.read_csv("../data/salesdaily.csv")
# df.head()
# df.info()
df['datum'] = pd.to_datetime(df['datum'])
df = df.sort_values('datum')


In [9]:
medicine_cols = [
    'M01AB','M01AE','N02BA','N02BE',
    'N05B','N05C','R03','R06'
]

long_df = df.melt(
    id_vars=['datum'],
    value_vars=medicine_cols,
    var_name='medicine',
    value_name='sales'
)

long_df = long_df.sort_values(['medicine', 'datum'])


In [10]:
def create_features(group):
    group['last_7d'] = group['sales'].rolling(7).sum()
    group['last_30d'] = group['sales'].rolling(30).sum()
    group['avg_30d'] = group['sales'].rolling(30).mean()
    group['std_30d'] = group['sales'].rolling(30).std()

    group['future_30d'] = (
        group['sales']
        .shift(-30)
        .rolling(30)
        .sum()
    )

    group['increase'] = (
        group['future_30d'] >= group['last_30d'] * 1.2
    ).astype(int)

    return group

long_df = long_df.groupby('medicine', group_keys=False).apply(create_features)
long_df.dropna(inplace=True)


  long_df = long_df.groupby('medicine', group_keys=False).apply(create_features)


In [11]:
features = [
    'last_7d',
    'last_30d',
    'avg_30d',
    'std_30d'
]

X = long_df[features]
y = long_df['increase']


In [13]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=False
)

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    class_weight='balanced',
    random_state=42
)

model.fit(X_train, y_train)

In [15]:
pred = model.predict(X_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.79      0.59      0.68      2329
           1       0.38      0.62      0.47       947

    accuracy                           0.60      3276
   macro avg       0.59      0.61      0.58      3276
weighted avg       0.67      0.60      0.62      3276



In [16]:
latest = long_df.groupby('medicine').tail(1)

latest['probability'] = model.predict_proba(
    latest[features]
)[:, 1]

result = latest[['medicine', 'probability']]
result = result.sort_values('probability', ascending=False)

result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest['probability'] = model.predict_proba(


Unnamed: 0,medicine,probability
6287,N02BA,0.608316
4181,M01AE,0.535812
14711,R03,0.491901
16817,R06,0.486629
12605,N05C,0.47581
8393,N02BE,0.46728
10499,N05B,0.428811
2075,M01AB,0.084229


In [17]:
threshold = 0.55

result['action'] = np.where(
    result['probability'] >= threshold,
    'Increase Stock',
    'Normal'
)

result


Unnamed: 0,medicine,probability,action
6287,N02BA,0.608316,Increase Stock
4181,M01AE,0.535812,Normal
14711,R03,0.491901,Normal
16817,R06,0.486629,Normal
12605,N05C,0.47581,Normal
8393,N02BE,0.46728,Normal
10499,N05B,0.428811,Normal
2075,M01AB,0.084229,Normal


In [19]:
import numpy as np

y_prob = model.predict_proba(X_test)[:, 1]


In [20]:
threshold = 0.55
y_pred_custom = (y_prob >= threshold).astype(int)


In [21]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_custom))


              precision    recall  f1-score   support

           0       0.77      0.64      0.70      2329
           1       0.38      0.54      0.44       947

    accuracy                           0.61      3276
   macro avg       0.57      0.59      0.57      3276
weighted avg       0.66      0.61      0.63      3276



In [22]:
from sklearn.metrics import confusion_matrix
import pandas as pd

cm = confusion_matrix(y_test, y_pred_custom)

cm_df = pd.DataFrame(
    cm,
    index=['Actual_No_Increase', 'Actual_Increase'],
    columns=['Pred_No_Increase', 'Pred_Increase']
)

cm_df


Unnamed: 0,Pred_No_Increase,Pred_Increase
Actual_No_Increase,1487,842
Actual_Increase,438,509


In [23]:
test_results = X_test.copy()
test_results['actual'] = y_test.values
test_results['probability'] = y_prob
test_results['predicted'] = y_pred_custom

test_results.sort_values('probability', ascending=False).head(10)


Unnamed: 0,last_7d,last_30d,avg_30d,std_30d,actual,probability,predicted
16283,21.5,75.4,2.513333,2.384972,1,0.926251,1
16284,17.5,75.4,2.513333,2.384972,1,0.92119,1
15210,23.0,73.0,2.433333,2.045741,1,0.920312,1
16277,25.0,65.9,2.196667,2.054848,1,0.919323,1
15209,25.0,72.0,2.4,2.061135,1,0.919233,1
16278,24.0,66.9,2.23,2.042843,1,0.918766,1
15748,18.7,65.8,2.193333,1.288927,0,0.917855,1
15208,24.0,73.0,2.433333,2.062528,1,0.917733,1
16094,20.4,67.3,2.243333,1.257442,0,0.91767,1
13969,23.0,68.0,2.266667,3.004977,1,0.916193,1


In [24]:
test_med = long_df.loc[X_test.index, ['medicine']].copy()
test_med['actual'] = y_test.values
test_med['predicted'] = y_pred_custom

test_med.groupby('medicine').mean()


Unnamed: 0_level_0,actual,predicted
medicine,Unnamed: 1_level_1,Unnamed: 2_level_1
R03,0.307567,0.380797
R06,0.277968,0.431363


In [25]:
import joblib

joblib.dump(model, "medicine_demand_model_v1.joblib")


['medicine_demand_model_v1.joblib']