<img width="10%" alt="Naas" src="https://landen.imgix.net/jtci2pxwjczr/assets/5ice39g4.png?w=160"/>

# FEC - Création du dataset "Evolution CA"

**Tags:** #fec #finance #snippet #operation #ca #revenue

**Author:** [Florent Ravenel](https://www.linkedin.com/in/florent-ravenel/)

**Description:** Ce notebook permet de créer le dataset pour visualiser l'évolution du CA entre l'année N et l'année N-1.

## Input

### Import libraries

In [None]:
import naas_data_product

### Setup Variables

In [None]:
# Inputs
# Folder handed to get_last_df() in the Model section to load the input data.
input_folder_path = "/home/ftp/FEC-engine/outputs/FEC/bdd_fin"
# OpenAI API key looked up with naas.secret.get(); the placeholder string is used
# when the lookup returns a falsy value.
# NOTE(review): `naas` is not imported in the visible cells; presumably it is made
# available by `import naas_data_product` - confirm.
openai_api_key = naas.secret.get(name="OPENAI_API_KEY") or "ENTER_YOUR_OPENAI_API_KEY"

# Outputs
# Folder handed to save_df() in the Output section.
output_folder_path = "/home/ftp/FEC-engine/outputs/FEC/dataset_evolution_ca"
# Prompt (in French) passed to get_ia_analysis() together with each per-period
# dataset; it describes the DATE, VALUE_CUM and VALUE columns and asks for a short
# factual analysis of the revenue evolution.
prompt_data = """
Voici le jeu de données sur l'évolution du CA d'une entreprise entre une ou plusieurs années avec:
- les mois dans la colonne DATE
- les valeurs cumulées en colonne VALUE_CUM
- les valeurs décumulées en colonne VALUE.
Peux-tu me donner un bref aperçu factuel de l'évolution du CA et identifier les variations les plus importantes (VALUE)?
Peux-tu commencer ton analyse par "Le CA de l'entreprise a ..."?
"""

## Model

### Récupération du dernier fichier input

In [None]:
# Load the input data through get_last_df(), report its row count and preview one row.
df_input = get_last_df(input_folder_path)
print("✅ Row fetched:", df_input.shape[0])
df_input.head(1)

### Création du dataset "EVOLUTION_CA"

In [None]:
def prep_data(df_init):
    """Build the monthly revenue ("CA") dataset per entity and period.

    Rows whose COMPTE_NUM starts with 70, 71 or 72 are kept, summed per
    (ENTITY, PERIOD, day), made positive, then summed per calendar month.
    Months without any entry between the first and last month of a period
    are added with a value of 0 so the cumulative series has no gaps.

    Args:
        df_init: DataFrame with at least the columns COMPTE_NUM, ENTITY,
            PERIOD, DATE and VALUE. It is not modified.

    Returns:
        DataFrame with columns ENTITY, PERIOD, DATE ("YYYY-MM" string),
        VALUE (monthly amount) and VALUE_CUM (running total within the
        entity/period), ordered by ENTITY ascending, PERIOD descending and
        DATE ascending. Empty (with those columns) when no revenue row exists.
    """
    output_columns = ["ENTITY", "PERIOD", "DATE", "VALUE", "VALUE_CUM"]

    # Revenue accounts only. astype(str) + na=False keep numeric or missing
    # account numbers from breaking the boolean mask.
    account = df_init["COMPTE_NUM"].astype(str)
    df = df_init.loc[account.str.contains(r"^70|^71|^72", na=False)].copy()
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    # Parse dates BEFORE grouping/sorting so ordering is chronological rather
    # than lexical, and drop any time-of-day so entries bucket by calendar day.
    df["DATE"] = pd.to_datetime(df["DATE"]).dt.normalize()

    # Daily totals, made positive before the monthly aggregation.
    daily = df.groupby(["ENTITY", "PERIOD", "DATE"], as_index=False).agg({"VALUE": "sum"})
    daily["VALUE"] = daily["VALUE"].abs()
    daily = daily.sort_values(by=["ENTITY", "PERIOD"], ascending=[True, False])

    frames = []
    # sort=False keeps the ENTITY asc / PERIOD desc order established above.
    for (entity, period), group in daily.groupby(["ENTITY", "PERIOD"], sort=False):
        monthly = group.groupby(group["DATE"].dt.strftime("%Y-%m"))["VALUE"].sum()

        # Every month from the first to the last one observed in this period;
        # missing months are filled with 0.
        first_month = group["DATE"].min().replace(day=1)
        all_months = pd.date_range(first_month, group["DATE"].max(), freq="MS").strftime("%Y-%m")
        monthly = monthly.reindex(all_months, fill_value=0)

        frame = pd.DataFrame(
            {
                "ENTITY": entity,
                "PERIOD": period,
                "DATE": monthly.index.to_numpy(),
                "VALUE": monthly.to_numpy(),
            }
        )
        frame["VALUE_CUM"] = frame["VALUE"].cumsum()
        frames.append(frame)

    if not frames:
        return pd.DataFrame(columns=output_columns)
    # Single concat instead of growing a DataFrame inside the loop.
    return pd.concat(frames, ignore_index=True)

# Build the monthly revenue dataset from the loaded input and preview 12 rows.
data_prep = prep_data(df_input)
print("Row fetched:", data_prep.shape[0])
data_prep.head(12)

### Création des analyses N et N-1 par période

In [None]:
def _format_keur(values):
    """Format amounts as thousands of euros, e.g. 1234567.0 -> '1 234.6 k€'."""
    return (values / 1000).map("{:,.1f} k€".format).str.replace(",", " ", regex=False)


def create_data_ca(df_init):
    """Build the N vs N-1 comparison dataset for every entity and period.

    For each entity, every period is paired with the period that follows it
    in the input's order of appearance (the input is expected to list periods
    from most recent to oldest, so the following one is N-1). Rows of the
    current period are tagged GROUP="N", rows of the comparison period
    GROUP="N-1", and both are stored under the current PERIOD. An analysis
    text obtained from get_ia_analysis() is attached to each pair.

    Args:
        df_init: DataFrame with columns ENTITY, PERIOD, DATE ("YYYY-MM"),
            VALUE and VALUE_CUM. It is not modified.

    Returns:
        DataFrame with the input columns plus GROUP, AI_ANALYSIS, LABEL
        (two-digit month), VALUE_D and VALUE_CUM_D (amounts formatted in k€).
        Empty (with those columns) when the input has no rows.
    """
    df = df_init.copy()
    added_columns = ["GROUP", "AI_ANALYSIS", "LABEL", "VALUE_D", "VALUE_CUM_D"]
    if df.empty:
        # Nothing to compare: return early instead of failing on missing columns.
        return pd.DataFrame(columns=list(df.columns) + added_columns)

    frames = []
    for entity in df["ENTITY"].drop_duplicates():
        entity_df = df[df["ENTITY"] == entity]
        periods = entity_df["PERIOD"].drop_duplicates().tolist()

        # Pair each period with the next one; the last period has no comparison.
        for period, period_comp in zip(periods, periods[1:] + [None]):
            print(f"➡️ Periode: {period_comp} -> {period}")
            # Leave None out of the filter so rows with a missing PERIOD are
            # never pulled into the last period's dataset.
            selected = [period] if period_comp is None else [period, period_comp]
            subset = entity_df[entity_df["PERIOD"].isin(selected)].reset_index(drop=True)
            print("Row fetched:", len(subset))

            subset.loc[subset["PERIOD"] == period, "GROUP"] = "N"
            if period_comp is not None:
                subset.loc[subset["PERIOD"] == period_comp, "GROUP"] = "N-1"
            # Both groups are stored under the current period.
            subset["PERIOD"] = period
            subset["AI_ANALYSIS"] = get_ia_analysis(openai_api_key, subset, prompt_data)
            frames.append(subset)

    if not frames:
        return pd.DataFrame(columns=list(df.columns) + added_columns)
    # Single concat instead of growing a DataFrame inside the loop.
    df_output = pd.concat(frames, ignore_index=True)

    # Display columns
    df_output["LABEL"] = pd.to_datetime(df_output["DATE"], format="%Y-%m").dt.strftime("%m")
    df_output["VALUE_D"] = _format_keur(df_output["VALUE"])
    df_output["VALUE_CUM_D"] = _format_keur(df_output["VALUE_CUM"])
    return df_output

# Build the N / N-1 comparison dataset and preview the first 24 rows.
df_evolution_ca = create_data_ca(data_prep)
print("Row fetched:", df_evolution_ca.shape[0])
df_evolution_ca.head(24)

## Output

### Sauvegarde des fichiers en csv

In [None]:
# Hand the final dataset and the output folder to save_df(), which is not defined
# in the visible cells. Per the section heading the result is stored as CSV;
# the file naming is decided inside save_df - TODO confirm.
save_df(df_evolution_ca, output_folder_path)