In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pickle

def create_pipeline():
    numerical_features = ['X', 'Y']
    categorical_features = ['Dates']

    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

    return pipeline

def train_pipeline(pipeline, df):
    X = df[['Dates', 'X', 'Y']]
    y = df['Category']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline.fit(X_train, y_train)

    score = pipeline.score(X_test, y_test)

    with open('pipeline_model.pkl', 'wb') as file:
        pickle.dump(pipeline, file)
    print("Pipeline sauvegardée dans 'pipeline_model.pkl'.")

    return pipeline, score

def make_prediction(pipeline, input_data: dict) -> dict:
    input_df = pd.DataFrame([input_data])

    prediction = pipeline.predict(input_df)
    return {"prediction": prediction[0]}

def load_csv(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path).sample(frac=0.1)
    df['Dates'] = pd.to_datetime(df['Dates']).dt.to_period('M')
    return df


In [18]:
train_pipeline(create_pipeline(), load_csv('data/train.csv'))

Pipeline sauvegardée dans 'pipeline_model.pkl'.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num', StandardScaler(),
                                                   ['X', 'Y']),
                                                  ('cat',
                                                   OneHotEncoder(handle_unknown='ignore'),
                                                   ['Dates'])])),
                 ('classifier', LogisticRegression())]),
 0.21359831444678548)

In [20]:
!ls -lh

total 25352
-rw-r--r--   1 barbaramichaud  staff    13B Nov 26 17:12 README.md
drwxr-xr-x@  6 barbaramichaud  staff   192B Nov 27 13:50 [34mapp[m[m
drwxr-xr-x@  3 barbaramichaud  staff    96B Nov 26 17:14 [34mdata[m[m
drwxr-xr-x@ 10 barbaramichaud  staff   320B Nov 27 12:31 [34mfrontend[m[m
-rw-r--r--   1 barbaramichaud  staff    23K Nov 26 17:12 newplot.png
-rw-r--r--@  1 barbaramichaud  staff   6.4K Nov 27 14:13 pipeline.ipynb
-rw-r--r--@  1 barbaramichaud  staff    50K Nov 27 14:13 pipeline_model.pkl
-rw-r--r--@  1 barbaramichaud  staff   2.3K Nov 27 11:58 requirements.txt
-rw-r--r--@  1 barbaramichaud  staff    12M Nov 26 20:41 sample.ipynb
