In [1]:
import glob
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import nbimporter
import numpy as np
import pandas as pd
import seaborn as sns

# Install a process-wide filter that silences every FutureWarning raised from
# here on, so such notices do not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# 1. Correlation Matrix

## 1.1 Detecting features by high correlation

In [2]:

def _plot_correlation_heatmap(corr, title):
    """Draw an annotated 10x8 inch heatmap of the correlation matrix ``corr``."""
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(title)
    plt.show()


def correlation_matrix(endresult_df, categorical_columns, cut_off_limit,
                       target_column="Gesamt (Netzlast) [MWh] Originalauflösungen"):
    """Drop features whose correlation with the target is below a cut-off.

    Two heatmaps are shown: the full correlation matrix and the matrix that
    remains after the weakly correlated columns have been removed. Plotting
    uses ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn) from the notebook
    namespace.

    Parameters
    ----------
    endresult_df : pandas.DataFrame
        Input frame. It is not modified.
    categorical_columns : iterable of str
        Name fragments. Every column whose name *contains* one of these
        fragments is excluded from the correlation analysis and is always
        kept in the result.
    cut_off_limit : float
        Columns with ``abs(correlation with target) < cut_off_limit`` are
        dropped.
    target_column : str, optional
        Column the correlations are measured against. Defaults to the
        grid-load column that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``endresult_df`` without the weakly correlated columns.

    Raises
    ------
    KeyError
        If ``target_column`` is not part of the correlation matrix (missing
        from the frame or excluded via ``categorical_columns``).
    """
    analysed_columns = [
        col for col in endresult_df.columns
        if not any(fragment in col for fragment in categorical_columns)
    ]
    df_relevant = endresult_df[analysed_columns]
    corr = df_relevant.corr()

    # Fail early, and with a readable message, before any figure is drawn.
    if target_column not in corr.columns:
        raise KeyError(
            f"target column {target_column!r} is not among the analysed columns"
        )

    _plot_correlation_heatmap(corr, 'Correlation Matrix Heatmap')

    # A NaN correlation (e.g. for a constant column) compares as False here,
    # so such columns are kept rather than dropped.
    is_weak = corr[target_column].abs() < cut_off_limit
    columns_to_drop = corr.index[is_weak].tolist()

    filtered_corr = df_relevant.drop(columns=columns_to_drop).corr()
    _plot_correlation_heatmap(filtered_corr, 'Filtered Correlation Matrix Heatmap')

    # Drop from the full frame so the excluded (categorical) columns survive.
    return endresult_df.drop(columns=columns_to_drop)

## 1.2 Reducing dimensionality

In [5]:
# Set the correlation threshold
#correlation_threshold = 0.8  # Adjust this threshold as needed
def reducing_dimensionality(df, correlation_threshhold, categorical_columns):
    """Remove features that are highly correlated with an earlier feature.

    For every pair of analysed columns whose absolute correlation exceeds the
    threshold, the column that appears *later* in the frame is dropped. A
    heatmap of the remaining correlation matrix is shown; plotting uses
    ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn) from the notebook
    namespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    correlation_threshhold : float
        Pairs with ``abs(correlation) > correlation_threshhold`` count as
        redundant. (The parameter name keeps its historical spelling so that
        existing keyword calls continue to work.)
    categorical_columns : iterable of str
        Name fragments. Every column whose name *contains* one of these
        fragments is excluded from the analysis and is always kept.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the redundant columns.

    Notes
    -----
    No column is exempt: a target column that sits after a highly correlated
    feature is dropped like any other column.
    """
    analysed_columns = [
        col for col in df.columns
        if not any(fragment in col for fragment in categorical_columns)
    ]
    df_filtered = df[analysed_columns]
    abs_corr = df_filtered.corr().abs()

    # Strict upper triangle: each pair is looked at once, the diagonal never.
    upper_triangle = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)

    # Use the function argument. The previous body read an undefined (or stale
    # global) name `correlation_threshold` instead of the parameter.
    redundant_pairs = (abs_corr.to_numpy() > correlation_threshhold) & upper_triangle

    # A flagged pair (i, j) with i < j drops column j, i.e. every column that
    # has at least one flagged entry above the diagonal.
    columns_to_drop = abs_corr.columns[redundant_pairs.any(axis=0)].tolist()

    reduced_corr = df_filtered.drop(columns=columns_to_drop).corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(reduced_corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Reduced Correlation Matrix Heatmap')
    plt.show()

    # Drop from the full frame so the excluded (categorical) columns survive.
    return df.drop(columns=columns_to_drop)

# 2. Encoding categorical data

## 2.1 Ordinal encoding

In [6]:
def ranking_countries_by_viewership(football_dic_copy):
    """Rank countries by their summed German TV viewership.

    Every match contributes its viewership figure to both participating
    countries ("Land1" and "Land2"). Countries are then ranked by their total,
    rank 1 being the highest.

    Parameters
    ----------
    football_dic_copy : dict
        Maps an arbitrary key to a DataFrame with the columns "Land1",
        "Land2" and "Einschaltquote (Deutschland)". Viewership values may be
        strings such as "12,34 Mio." or already numeric.

    Returns
    -------
    dict
        Maps country name to rank (1 = most viewers). Empty if no frame
        could be used.

    Side effects
    ------------
    The frames in ``football_dic_copy`` are normalised in place: "Land1" and
    "Land2" are whitespace-stripped and the viewership column is converted to
    numeric. Calling the function again on the same frames is safe.

    Raises
    ------
    KeyError
        If a frame lacks "Land1" or "Land2".
    """
    view_col = "Einschaltquote (Deutschland)"
    usable_frames = []

    for key, match_set in football_dic_copy.items():
        match_set["Land1"] = match_set["Land1"].str.strip()
        match_set["Land2"] = match_set["Land2"].str.strip()

        if view_col not in match_set.columns:
            warnings.warn(
                f"Skipping match set {key!r}: column {view_col!r} is missing.",
                stacklevel=2,
            )
            continue

        viewers = match_set[view_col]
        # A numeric column means the frame was already cleaned (e.g. by an
        # earlier call); only raw text needs the unit/decimal-comma clean-up.
        if not pd.api.types.is_numeric_dtype(viewers):
            try:
                # NOTE: strip() takes a *set of characters* (' ', 'M', 'i',
                # 'o', '.'), not the literal suffix " Mio.".
                viewers = viewers.str.strip(" Mio.").str.replace(',', '.', regex=False)
            except AttributeError:
                warnings.warn(
                    f"Skipping match set {key!r}: column {view_col!r} is "
                    "neither text nor numeric.",
                    stacklevel=2,
                )
                continue

        match_set[view_col] = pd.to_numeric(viewers, errors='coerce')
        usable_frames.append(match_set)

    # pd.concat raises on an empty list; no data simply means no ranking.
    if not usable_frames:
        return {}

    all_matches = pd.concat(usable_frames, ignore_index=True)

    # One row per (match, country) so each country gets credited separately.
    per_country = pd.melt(
        all_matches,
        id_vars=[view_col],
        value_vars=["Land1", "Land2"],
        var_name='country_position',
        value_name='country',
    )

    total_views = per_country.groupby('country')[view_col].sum()

    # Stable sort keeps the order of tied countries deterministic.
    ranked = total_views.sort_values(ascending=False, kind="mergesort")

    return {country: rank for rank, country in enumerate(ranked.index, start=1)}
    
    #print(match_set)