In [30]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [79]:
df = pd.DataFrame()
for y in range(2010, 2021):
  for m in range(1,13):
    if m < 3: # jan e fev == 750
      i = 750
    elif m == 6: # jun == 2000
      i = 2000
    elif m == 12: # dez == 900
      i = 1500
    else:
      i = 1000
    s = pd.DataFrame(data=[{'year': y, 'month': m, 'y': i}])
    df = pd.concat([df, s], ignore_index=True)

df.head(2)

Unnamed: 0,year,month,y
0,2010,1,750
1,2010,2,750


In [80]:
TARGET = 'y'
y = df[TARGET]
X = df.drop(columns=TARGET)

In [81]:
numerical = X.select_dtypes(include=['number', 'boolean']).columns
categorical = X.select_dtypes(exclude=['number', 'boolean']).columns

In [82]:
print(numerical)
print(categorical)

Index(['year', 'month'], dtype='object')
Index([], dtype='object')


In [70]:
X[categorical] = X[categorical].astype('object')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ds      132 non-null    object
dtypes: object(1)
memory usage: 1.2+ KB


In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42, stratify=y)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [84]:
train.head(2)

Unnamed: 0,year,month,y
24,2012,1,750
119,2019,12,1500


In [85]:
test.head(2)

Unnamed: 0,year,month,y
92,2017,9,1000
36,2013,1,750


In [86]:
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('encoder', OneHotEncoder(sparse=False))
            
        ]), categorical),
        ('num', SimpleImputer(strategy='mean'), numerical)
    ])),
    ('model', RandomForestClassifier(random_state=42))
])

In [87]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='Missing',
                                                                                 strategy='constant')),
                                                                  ('encoder',
                                                                   OneHotEncoder(sparse=False))]),
                                                  Index([], dtype='object')),
                                                 ('num', SimpleImputer(),
                                                  Index(['year', 'month'], dtype='object'))])),
                ('model', RandomForestClassifier(random_state=42))])

In [88]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         750       1.00      1.00      1.00         5
        1000       0.88      1.00      0.94        22
        1500       1.00      1.00      1.00         3
        2000       0.00      0.00      0.00         3

    accuracy                           0.91        33
   macro avg       0.72      0.75      0.73        33
weighted avg       0.83      0.91      0.87        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [91]:
test['Prediction'] = y_pred
test.head(2)

Unnamed: 0,year,month,y,Prediction
92,2017,9,1000,1000
36,2013,1,750,750


In [92]:
test['Probability'] = pipeline.predict_proba(X_test)[:,1]
test.head(2)

Unnamed: 0,year,month,y,Prediction,Probability
92,2017,9,1000,1000,0.99
36,2013,1,750,750,0.1


In [93]:
labels = []
for i, x in enumerate(np.arange(0, 101, 10)):
    if i>0:
        labels.append(f"{previous_x}% to <{x}%")
    previous_x = x
test['Binned probability'] = pd.cut(test['Probability'], len(labels), labels=labels, 
                                    right=False)
test.head(2)                                    

Unnamed: 0,year,month,y,Prediction,Probability,Binned probability
92,2017,9,1000,1000,0.99,90% to <100%
36,2013,1,750,750,0.1,0% to <10%


In [96]:
dfpred = pd.DataFrame()
for y in range(2021, 2023):
  for m in range(1,13):
    pred = pipeline.predict(pd.DataFrame([{'year': y, 'month': m}]))[0]
    s = pd.DataFrame(data=[{'year': y, 'month': m, 'y': pred}])
    dfpred = pd.concat([dfpred, s], ignore_index=True)

dfpred   

Unnamed: 0,year,month,y
0,2021,1,750
1,2021,2,750
2,2021,3,1000
3,2021,4,1000
4,2021,5,1000
5,2021,6,1000
6,2021,7,1000
7,2021,8,1000
8,2021,9,1000
9,2021,10,1000
