# In [27]:
# Relative paths to the three raw CSV inputs. Raw strings keep the backslash
# separators literal. The names mirror the parameters of merge_3_datasets
# defined below.
csv1 = r"..\..\data\raw\Steam_2024_bestRevenue_1500.csv"
csv2 = r"..\..\data\raw\game_api_data.csv"
csv3 = r"..\..\data\raw\final_merged_steamdata.csv"

import pandas as pd
import numpy as np

def merge_3_datasets(csv1, csv2, csv3):
    """Load the three CSV files, left-join them on the Steam id and clean the result.

    csv1 is read with the default separator and is the left side of both
    joins, so its rows drive the output (key column "steamId").
    csv2 is read as a semicolon-separated file and joined on its "steam_id".
    csv3 is read with the default separator and joined on its "steamId".

    Returns the joined frame after it has been passed through clean_cols
    and then further_cleaning.
    """
    base_frame = pd.read_csv(csv1)
    api_frame = pd.read_csv(csv2, sep=';')
    scraped_frame = pd.read_csv(csv3)

    # Two successive left joins: rows of csv1 are kept even when the other
    # files have no matching id.
    with_api = base_frame.merge(
        api_frame, how="left", left_on="steamId", right_on="steam_id"
    )
    combined = with_api.merge(
        scraped_frame, how="left", left_on="steamId", right_on="steamId"
    )

    # Column pruning/renaming first, then the derived feature columns.
    return further_cleaning(clean_cols(combined))

import pandas as pd

def _tidy_comma_list(series):
    """Insert a space after every comma, then collapse double spaces once."""
    return (
        series.str.replace(",", ", ", regex=False)
        .str.replace("  ", " ", regex=False)
    )


def clean_cols(df, max_missing=500):
    """
    Reduce the merged DataFrame to a fixed, renamed and ordered set of columns.

    Steps:
    1. Drop columns that have more than `max_missing` missing values.
    2. Fill gaps in 'genres_x', 'publishers_x_x', 'developers_x_x' and
       'required_age' from their duplicate columns ('genres_y',
       'publishers_y_y', 'developers_y_y', 'required age').
    3. Normalise comma spacing in publishers and developers.
    4. Parse 'releaseDate_x' as day-month-year; unparseable values become NaT.
    5. Select the 30 output columns, rename them and put them in final order.
    6. Drop rows that still have no genres.

    Args:
        df: merged DataFrame containing the suffixed source columns listed
            in `column_map` below plus the fallback columns from step 2.
        max_missing: a column is dropped in step 1 when it has more than this
            many missing values (default 500).

    Returns:
        A new DataFrame; `df` itself is not modified.

    Raises:
        KeyError: if a required column is absent, including when it was
            removed by step 1.
    """
    # .copy() gives an independent frame, so the column assignments below
    # never act on a view/slice of another frame (no SettingWithCopyWarning).
    work = df.dropna(axis=1, thresh=len(df) - max_missing).copy()

    # Read genres from the input frame so this still works when 'genres_x'
    # itself was dropped by the missing-value threshold above.
    work['genres_x'] = df['genres_x'].fillna(df['genres_y'])

    # Fill first, then format, so values taken from the fallback column get
    # the same comma spacing as the primary ones (for both columns alike).
    for primary, fallback in (
        ('publishers_x_x', 'publishers_y_y'),
        ('developers_x_x', 'developers_y_y'),
    ):
        work[primary] = _tidy_comma_list(work[primary].fillna(work[fallback]))

    # 'required age' is the fallback source: it is used where 'required_age'
    # is missing and also where 'required_age' is 0.
    fallback_age = work['required age'].fillna(0)
    required_age = work['required_age'].fillna(fallback_age)
    work['required_age'] = required_age.mask(required_age == 0, fallback_age)

    work['releaseDate_x'] = pd.to_datetime(
        work['releaseDate_x'], format='%d-%m-%Y', errors='coerce'
    )

    # Source column -> output column, listed in the final column order.
    column_map = {
        'steamId': 'steamId',
        'name_x': 'name',
        'releaseDate_x': 'release_date',
        'copiesSold_x': 'copies_sold',
        'price_x_x': 'price_USD',
        'price_y_x': 'price_local',
        'revenue_x': 'revenue',
        'avgPlaytime_x': 'avgPlaytime',
        'publisherClass_x': 'publisherClass',
        'publishers_x_x': 'publishers',
        'developers_x_x': 'developers',
        'required_age': 'required_age',
        'is_free': 'is_free',
        'description': 'description',
        'about the game': 'about the game',
        'languages': 'languages',
        'genres_x': 'genres',
        'categories_x': 'categories',
        'estimated owners': 'estimated owners',
        'peak ccu': 'peak ccu',
        'metacritic score': 'metacritic score',
        'reviewScore_x': 'review_score',
        'user score': 'user score',
        'positive': 'positive',
        'negative': 'negative',
        'recommendations': 'recommendations',
        'average playtime forever': 'average playtime forever',
        'average playtime two weeks': 'average playtime two weeks',
        'median playtime forever': 'median playtime forever',
        'median playtime two weeks': 'median playtime two weeks',
    }
    # Select by source name before renaming so an unrelated column that
    # already carries an output name cannot end up duplicated.
    tidy = work[list(column_map)].rename(columns=column_map)

    # Rows without any genre information are not usable downstream.
    return tidy.dropna(subset=['genres'])

def get_genres_from_col(df):
    """Return the set of distinct genre names found in df["genres"].

    Each string value of the column is split on commas and every piece is
    stripped of surrounding whitespace. Non-string values (such as missing
    entries) are skipped.
    """
    return {
        piece.strip()
        for entry in df["genres"].unique()
        if isinstance(entry, str)
        for piece in entry.split(",")
    }

def get_genre_cols(df, genre_set):
    """Add one boolean column per genre to `df` (in place) and return `df`.

    For every name in `genre_set` a column of that name is written; a row is
    True when the name appears among the comma-separated entries of its
    'genres' value and False otherwise (including when 'genres' is missing).

    Columns are added in sorted genre order so the resulting column layout is
    the same on every run, independent of set iteration order.
    """
    def _entries(cell):
        # Missing genres match nothing.
        if pd.isna(cell):
            return frozenset()
        return frozenset(part.strip() for part in str(cell).split(","))

    # Tokenise each row once instead of once per genre.
    row_entries = [_entries(cell) for cell in df['genres']]

    for genre in sorted(genre_set):
        df[genre] = pd.Series(
            [genre in entries for entries in row_entries],
            index=df.index,
            dtype=bool,
        )
    return df

def remove_low_genre_columns(df, genre_set, min_true_count=10):
    """Drop rarely-set genre columns from `df` in place and return `df`.

    For each name in `genre_set` the column's sum is taken (for a boolean
    column this is the number of True values); the column is removed when
    that sum is below `min_true_count`.
    """
    for genre in genre_set:
        true_total = df[genre].sum()
        if true_total >= min_true_count:
            # Common enough: keep the column.
            continue
        df.drop(columns=[genre], inplace=True)

    return df

def further_cleaning(df):
    """Add derived feature columns to the cleaned frame and return the result.

    The returned frame contains:
    - the columns in `col_list` below; any other input column (for example
      'user score') is discarded,
    - 'language_count': number of commas in 'languages' plus one (missing
      where 'languages' is missing),
    - one boolean column per genre found in 'genres', keeping only genres
      that are True in at least 10 rows,
    - boolean 'Multi-player' and 'Single-player' columns derived from the
      comma-separated 'categories' value.

    The input frame is not modified; a new DataFrame is returned.
    """
    col_list = ['steamId', 'name', 'release_date', 'copies_sold', 'price_USD',
       'price_local', 'revenue', 'avgPlaytime', 'publisherClass', 'publishers',
       'developers', 'required_age', 'is_free', 'description',
       'about the game', 'languages', 'language_count', 'genres', 'categories',
       'estimated owners', 'peak ccu', 'metacritic score', 'review_score',
       'positive', 'negative', 'recommendations',
       'average playtime forever', 'average playtime two weeks',
       'median playtime forever', 'median playtime two weeks']

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    # The explicit copy() makes the column selection an independent frame, so
    # the in-place column writes below do not act on a slice.
    language_count = df['languages'].str.count(",") + 1
    df = df.assign(language_count=language_count)[col_list].copy()

    # Sorted so the genre columns come out in the same order on every run.
    genre_names = sorted(get_genres_from_col(df))
    df = get_genre_cols(df, genre_names)
    df = remove_low_genre_columns(df, genre_names, min_true_count=10)

    # Tokenise each categories value once; missing values match nothing.
    category_entries = [
        frozenset(part.strip() for part in str(cell).split(","))
        if pd.notna(cell) else frozenset()
        for cell in df['categories']
    ]
    # A tuple (not a set) keeps the order of the two columns fixed.
    for cat in ('Multi-player', 'Single-player'):
        df[cat] = pd.Series(
            [cat in entries for entries in category_entries],
            index=df.index,
            dtype=bool,
        )

    return df