In [2]:
import pandas as pd
import numpy as np
import nbimporter
from datetime import datetime, timedelta
import glob
import os
import warnings

# Suppress every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by libraries
# such as pandas - confirm that silencing them globally is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

# 1. Importing Data

In [3]:
def importing_climate_data(temp_path, temp_file, column_name, date_format, station_name=None):
    """Load weather CSV files and split their rows by year of interest.

    Every file matching ``temp_file`` inside ``temp_path`` is read as a
    semicolon-separated CSV. The values of ``column_name`` are parsed into a
    new ``DateTime`` column. For each year of interest (2016, 2018, 2021,
    2024) that has rows in a file, those rows are stored with every column
    except ``DateTime`` prefixed by ``"<station>_"``.

    Parameters
    ----------
    temp_path : str
        Directory that contains the CSV files.
    temp_file : str
        Glob pattern, relative to ``temp_path``, selecting the files.
    column_name : str
        Name of the CSV column that holds the timestamps.
    date_format : str
        ``strftime``-style format of the timestamps in ``column_name``.
    station_name : str, optional
        Prefix for the data columns. When omitted, the module-level variable
        ``station`` is used, which is what the function did before this
        parameter existed (NameError if that global is not defined).

    Returns
    -------
    dict
        Maps each year of interest that has data to its DataFrame.

    Notes
    -----
    If several files contain rows for the same year, the file processed last
    replaces the earlier ones.
    """
    years_of_interest = [2016, 2018, 2021, 2024]
    df_dic = {}

    for file in glob.glob(os.path.join(temp_path, temp_file)):
        df = pd.read_csv(file, delimiter=";")
        df["DateTime"] = pd.to_datetime(df[column_name], format=date_format)

        for year in years_of_interest:
            year_rows = df[df["DateTime"].dt.year == year]
            if year_rows.empty:
                continue

            # Resolve the prefix lazily so the global fallback is only touched
            # when there is actually something to rename.
            prefix = station if station_name is None else station_name
            df_dic[year] = year_rows.rename(
                columns=lambda col: col if col == "DateTime" else f"{prefix}_{col}"
            )

    return df_dic



In [4]:
def _tournament_label(file_path):
    """Return "EM" or "WM" for a fixture file, or None if neither tag occurs."""
    # Look at the file name first: a parent folder such as ".../EMSI/..."
    # contains "EM" and would otherwise label every file as "EM".
    for candidate in (os.path.basename(file_path), file_path):
        if "EM" in candidate:
            return "EM"
        if "WM" in candidate:
            return "WM"
    return None


def _parse_datetimes(values, formats):
    """Parse ``values`` with the first of ``formats`` that fits all of them."""
    if not formats:
        # No explicit format supplied: let pandas infer it.
        return pd.to_datetime(values)

    last_error = None
    for fmt in formats:
        try:
            return pd.to_datetime(values, format=fmt)
        except (ValueError, TypeError) as err:
            last_error = err
    raise ValueError(
        f"none of the date formats {formats!r} matched the values"
    ) from last_error


def importing_football_data(temp_path, temp_file, column_name, *date_format):
    """Load match-schedule CSV files and split their rows by year of interest.

    Every file matching ``temp_file`` inside ``temp_path`` is read as a
    semicolon-separated CSV. The columns ``"Datum"`` and ``"Uhrzeit (MESZ)"``
    are joined with a space into ``DateTime``; then ``DateTime`` is replaced
    by the parsed timestamps of ``column_name``. A ``"Liga"`` column is set to
    ``"EM"`` or ``"WM"`` when that tag occurs in the file name (or, failing
    that, anywhere in the path); otherwise no ``"Liga"`` column is added.

    Parameters
    ----------
    temp_path : str
        Directory that contains the CSV files.
    temp_file : str
        Glob pattern, relative to ``temp_path``, selecting the files.
    column_name : str
        Column whose values are parsed into ``DateTime``.
    *date_format : str
        Candidate ``strftime``-style formats, tried in order. Without any
        format pandas infers one.

    Returns
    -------
    dict
        Maps each year of interest (2016, 2018, 2021, 2024) that has data to
        its DataFrame. If several files cover the same year, the file
        processed last replaces the earlier ones.

    Raises
    ------
    ValueError
        If none of the supplied formats fits the timestamps.
    """
    years_of_interest = [2016, 2018, 2021, 2024]
    df_dic = {}

    for file in glob.glob(os.path.join(temp_path, temp_file)):
        df = pd.read_csv(file, delimiter=";")
        df["DateTime"] = df["Datum"] + " " + df["Uhrzeit (MESZ)"]

        tournament = _tournament_label(file)
        if tournament is not None:
            df["Liga"] = tournament

        df["DateTime"] = _parse_datetimes(df[column_name], date_format)

        for year in years_of_interest:
            year_rows = df[df["DateTime"].dt.year == year]
            if not year_rows.empty:
                df_dic[year] = year_rows

    return df_dic



In [5]:

def importing_electricity_data(temp_path, file_name, date_column, date_format):
    """Load grid-load CSV files and collect their rows per year of interest.

    Every file matching ``file_name`` inside ``temp_path`` is read as a
    semicolon-separated CSV. The values of ``date_column`` are parsed into a
    new ``DateTime`` column. For files that contain at least one row in a year
    of interest (2016, 2018, 2021, 2024), the column
    ``'Gesamt (Netzlast) [MWh] Originalauflösungen'`` is converted from text
    with ``.`` as thousands separator and ``,`` as decimal separator to float.

    Parameters
    ----------
    temp_path : str
        Directory that contains the CSV files.
    file_name : str
        Glob pattern, relative to ``temp_path``, selecting the files.
    date_column : str
        Name of the CSV column that holds the timestamps.
    date_format : str
        ``strftime``-style format of the timestamps in ``date_column``.

    Returns
    -------
    dict
        Maps each year of interest that has data to one DataFrame holding
        only that year's rows. Pieces from several files are ordered by their
        first timestamp and concatenated with a fresh 0..n-1 index;
        ``DateTime`` stays a regular column.
    """
    load_column = 'Gesamt (Netzlast) [MWh] Originalauflösungen'
    years_of_interest = [2016, 2018, 2021, 2024]
    pieces_by_year = {}

    for file in glob.glob(os.path.join(temp_path, file_name)):
        df = pd.read_csv(file, delimiter=";")
        df["DateTime"] = pd.to_datetime(df[date_column], format=date_format)

        row_years = df["DateTime"].dt.year
        if not row_years.isin(years_of_interest).any():
            continue

        # Convert once per file, before filtering. Doing it inside the year
        # loop breaks on the second year because the column is float by then.
        if not pd.api.types.is_numeric_dtype(df[load_column]):
            df[load_column] = (
                df[load_column]
                .str.replace('.', '', regex=False)
                .str.replace(',', '.', regex=False)
                .astype(float)
            )

        for year in years_of_interest:
            year_rows = df[row_years == year]
            if not year_rows.empty:
                pieces_by_year.setdefault(year, []).append(year_rows)

    elec_dic = {}
    for year in sorted(pieces_by_year):
        # Chronological order of the pieces, independent of glob order.
        pieces = sorted(pieces_by_year[year], key=lambda piece: piece["DateTime"].iloc[0])
        elec_dic[year] = pd.concat(pieces, ignore_index=True)

    return elec_dic



# 2. Manipulating Dataframe

In [29]:
def resampling_data(required_dic, resample_step):
    """Resample every DataFrame of a dictionary to a common time step.

    Numeric columns are aggregated with the mean of each bin. All other
    columns take the most frequent value of the bin; if a bin holds only
    missing values its first entry is used, and an empty bin yields NaN.

    Parameters
    ----------
    required_dic : dict
        Maps keys to DataFrames that either have a ``DateTime`` column or are
        already indexed by their timestamps.
    resample_step : str
        Offset alias passed to ``DataFrame.resample``.

    Returns
    -------
    dict
        Same keys as ``required_dic``; each value is the resampled DataFrame
        indexed by the bin timestamps, numeric columns first.

    Notes
    -----
    The input DataFrames are left untouched: the index is set on a new
    object instead of in place.
    """
    def most_frequent(values):
        modes = values.mode()
        if not modes.empty:
            return modes.iloc[0]
        # mode() ignores NaN, so it is empty for all-NaN bins and for bins
        # without any rows; only the former has a first element to return.
        return values.iloc[0] if len(values) else np.nan

    resampled_dic = {}
    for key, frame in required_dic.items():
        # Explicit check instead of a bare try/except around set_index.
        if "DateTime" in frame.columns:
            frame = frame.set_index("DateTime")

        numeric_cols = frame.select_dtypes(include='number').columns
        parts = [frame[numeric_cols].resample(resample_step).mean()]

        non_numeric_cols = frame.select_dtypes(exclude='number').columns
        if len(non_numeric_cols) > 0:
            parts.append(frame[non_numeric_cols].resample(resample_step).agg(most_frequent))

        resampled_dic[key] = pd.concat(parts, axis=1)

    return resampled_dic

In [6]:
def new_col_match(match_set):
    """Collapse matches that kick off at the same time into a single row.

    Only the columns ``Land1``, ``Land2``, ``Runde``, ``Liga`` and
    ``DateTime`` are used; the two team columns are stripped of surrounding
    whitespace. Rows are grouped by ``DateTime``:

    * a kick-off time with one match is passed through unchanged;
    * a kick-off time with several matches becomes one row whose ``Land1`` /
      ``Land2`` come from the first match and whose ``Land3`` / ``Land4`` /
      ``Runde`` / ``Liga`` come from the second one. Any further match at
      that time is not represented in the result.

    Parameters
    ----------
    match_set : pandas.DataFrame
        Match schedule containing at least the five columns listed above.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct kick-off time in ascending order with a 0..n-1
        index. ``Land3`` and ``Land4`` only exist if at least one kick-off
        time had several matches.
    """
    # Work on a copy so stripping does not write into the caller's frame.
    fixtures = match_set.loc[:, ["Land1", "Land2", "Runde", "Liga", "DateTime"]].copy()
    fixtures["Land1"] = fixtures["Land1"].str.strip()
    fixtures["Land2"] = fixtures["Land2"].str.strip()

    # Collect plain records and build the frame once instead of growing it
    # with pd.concat on every iteration.
    records = []
    for kickoff, group in fixtures.groupby("DateTime"):
        first = group.iloc[0]
        if len(group) == 1:
            records.append(first.to_dict())
            continue

        second = group.iloc[1]
        records.append({
            "DateTime": kickoff,
            "Land1": first["Land1"],
            "Land2": first["Land2"],
            "Runde": second["Runde"],
            "Liga": second["Liga"],
            "Land3": second["Land1"],
            "Land4": second["Land2"],
        })

    return pd.DataFrame(records)

# 3. Merging Dataframes

In [32]:
def merging_data(electricity_data, football_data, climate_data, unwantend_col_string):
    """Merge electricity, match and climate data on their ``DateTime`` column.

    All frames are outer-merged on ``DateTime``. For every match the football
    columns are forward-filled from kick-off until 105 minutes later (both
    ends inclusive), so each row inside that window carries the match
    information. Afterwards all columns whose name contains one of the
    unwanted substrings are dropped, and the result is cut to the time span
    between the first and the last row of ``electricity_data``.

    Parameters
    ----------
    electricity_data : pandas.DataFrame
        Load data with a ``DateTime`` column; its first and last rows define
        the time span that is kept.
    football_data : pandas.DataFrame
        Match schedule with a ``DateTime`` column holding the kick-off times.
    climate_data : iterable of pandas.DataFrame
        Further frames with a ``DateTime`` column.
    unwantend_col_string : iterable of str
        Substrings; every column whose name contains one of them is removed.

    Returns
    -------
    pandas.DataFrame
        The merged, filled and trimmed frame.
    """
    # Imported here because the visible import cell of the notebook does not
    # bring functools.reduce into scope.
    from functools import reduce

    frames = [electricity_data, football_data, *climate_data]

    start_elec = electricity_data["DateTime"].iloc[0]
    end_elec = electricity_data["DateTime"].iloc[-1]

    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on='DateTime', how='outer'),
        frames,
    )

    match_columns = football_data.columns.tolist()
    match_window = pd.Timedelta(minutes=105)

    for kickoff in football_data["DateTime"]:
        during_match = (merged_df['DateTime'] >= kickoff) & (merged_df['DateTime'] <= kickoff + match_window)
        # .ffill() replaces the deprecated fillna(method='ffill').
        merged_df.loc[during_match, match_columns] = merged_df.loc[during_match, match_columns].ffill()

    for pattern in unwantend_col_string:
        merged_df = merged_df.drop(columns=merged_df.filter(like=pattern).columns)

    within_elec_span = (merged_df["DateTime"] >= start_elec) & (merged_df["DateTime"] <= end_elec)
    return merged_df[within_elec_span]
    