In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

steam_data = pd.read_csv(r'C:\Users\Cecilia\Downloads\ironhack\coursework\group work\group project week3\first_project_cfg\data\clean\merged_clean_owen_2.csv')

steam_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Cecilia\\Downloads\\ironhack\\coursework\\group work\\group project week3\\first_project_cfg\\data\\clean\\merged_clean_owen_2.csv'

In [None]:
# Define a function to update genre columns in the DataFrame
def update_genres(df, genre_list, column_name='genres'):
    """
    This function updates the genres column by adding boolean columns indicating the presence of specific genres.
    It also cleans the 'genres' column by removing the genres found in the list.
    
    Parameters:
    - df: pandas DataFrame containing the data.
    - genre_list: List of genres to check and remove from the 'genres' column.
    - column_name: Name of the column containing the genres (default is 'genres').
    
    Returns:
    - The modified DataFrame with new boolean columns and cleaned 'genres' column.
    """
    for genre in genre_list:
        # Create a new column with boolean value indicating the genre presence
        df[genre] = df[column_name].str.contains(genre, case=False, na=False)
        
        # Remove the genre from the 'genres' column if it exists
        df[column_name] = df[column_name].str.replace(genre, '', case=False, regex=False)
        
    # Clean up any unnecessary commas or spaces in the 'genres' column
    df[column_name] = df[column_name].str.replace(r'\s*,\s*', ', ', regex=True)  # Remove extra spaces around commas
    df[column_name] = df[column_name].str.replace(r'^\s*,|\s*,\s*$', '', regex=True)  # Remove leading/trailing commas

    return df

# List of genres to check and remove
genres_list = ['Indie', 'Early Access', 'Free to Play', 'Simulation', 'Sports', 'Action', 
               'Strategy', 'Adventure', 'RPG', 'Massively Multiplayer', 'Casual', 'Racing', 'Animation & Modeling', 
               'Design & Illustration', 'Education', 'Game Development']

# Apply the function to update the DataFrame
steam_data = update_genres(steam_data, genres_list)

steam_data.head()

In [None]:
#filter for indie games by publisher class
indie_df = steam_data[(steam_data['publisherClass'] == 'Indie')]
indie_df.head()

In [None]:
# Perform the melt operation for the genres of interest
melt_by_genre = pd.melt(indie_df, 
                        id_vars=['name', 'revenue', 'copies_sold'], 
                        value_vars=genres_list[3:],  # Starting from index 3 to avoid 'Indie' and 'Free to Play'
                        var_name='genre', value_name='is_in_genre')

# Filter only the rows where the genre is present (True)
melt_by_genre = melt_by_genre[melt_by_genre['is_in_genre']]

# Generate summary statistics by genre
genre_summary = melt_by_genre.groupby('genre').agg(
    count=('name', 'count'),
    total_revenue=('revenue', 'sum'),
    avg_revenue_per_game=('revenue', 'mean'),
    total_copies_sold=('copies_sold', 'sum'),
    min_revenue=('revenue', 'min'),
    Q1_revenue=('revenue', lambda x: x.quantile(0.25)),
    median_revenue=('revenue', 'median'),
    Q3_revenue=('revenue', lambda x: x.quantile(0.75)),
    max_revenue=('revenue', 'max')
).reset_index()

# Calculate the average revenue per copy sold
genre_summary['avg_rev_per_copy_sold'] = genre_summary['total_revenue'] / genre_summary['total_copies_sold']

# Sort the summary by total revenue
genre_summary = genre_summary.sort_values(by=['total_revenue'], ascending=False)

# Display the final genre summary
print(genre_summary)

In [None]:
indie_by_genre = pd.melt(indie_df, id_vars=['name', 'revenue', 'copies_sold'],
                        value_vars=['Simulation', 'RPG', 'Racing', 'Action',
                                     'Adventure', 'Sports'],
                        var_name='genre', value_name='is_in_genre')

indie_by_genre = indie_by_genre[indie_by_genre['is_in_genre'] == True]

indie_by_genre = indie_by_genre[['name', 'revenue', 'copies_sold', 'genre']]

indie_by_genre

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

# Assuming `indie_by_genre` DataFrame is already available
df = indie_by_genre[['name', 'revenue', 'copies_sold']].copy()
df = df.drop_duplicates(subset='name')

# Normalize the 'revenue' and 'copies_sold' columns to apply color mapping
norm_revenue = Normalize(vmin=df['revenue'].min(), vmax=df['revenue'].max())
norm_copies_sold = Normalize(vmin=df['copies_sold'].min(), vmax=df['copies_sold'].max())

# ScalarMappable to use 'viridis' colormap
cmap_revenue = ScalarMappable(norm=norm_revenue, cmap='viridis')
cmap_copies_sold = ScalarMappable(norm=norm_copies_sold, cmap='viridis')

# Apply color to each revenue and copies_sold
df['revenue_color'] = [cmap_revenue.to_rgba(x) for x in df['revenue']]
df['copies_sold_color'] = [cmap_copies_sold.to_rgba(x) for x in df['copies_sold']]

# Plotting Histogram for Revenue with log scale and viridis colormap
plt.figure(figsize=(10, 6))
sns.histplot(df['revenue'], kde=True, bins=30, log_scale=True, color=cmap_revenue.to_rgba(df['revenue'].mean()))  # Apply color based on average
plt.title('Distribution of Game Revenue')

# Save the figure with a white background
plt.savefig(r'C:\Users\Cecilia\Downloads\ironhack\coursework\group work\group project week3\first_project_cfg\figures\game_revenue_distribution_cfg.png', 
            format="png", dpi=300, bbox_inches='tight', transparent=False, facecolor='white')
plt.show()

# Plotting Histogram for Copies Sold with log scale and viridis colormap
plt.figure(figsize=(10, 6))
sns.histplot(df['copies_sold'], kde=True, bins=30, log_scale=True, color=cmap_copies_sold.to_rgba(df['copies_sold'].mean()))  # Apply color based on average
plt.title('Distribution of Copies Sold')

# Save the figure with a white background
plt.savefig(r'..\figures\game_copies_sold_distribution_cfg.png', 
            format="png", dpi=300, bbox_inches='tight', transparent=False, facecolor='white')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

# Assuming `indie_by_genre` DataFrame is already available
df = indie_by_genre[['name', 'genre', 'revenue', 'copies_sold']].copy()

# Aggregating total revenue, total copies sold, and count by genre
genre_performance = df.groupby('genre').agg(
    total_revenue=('revenue', 'sum'),
    total_copies_sold=('copies_sold', 'sum'),
    count=('name', 'nunique')  # count of unique games per genre
).sort_values(by='total_revenue', ascending=False)

# Normalize total revenue for applying color scale
norm = Normalize(vmin=genre_performance['total_revenue'].min(), vmax=genre_performance['total_revenue'].max())
cmap = ScalarMappable(norm=norm, cmap='viridis')

# Plotting Bar plot for Total Revenue by Genre with viridis color map
plt.figure(figsize=(12, 6))

# Apply viridis colormap for bar colors based on total_revenue
bars = plt.bar(genre_performance.index, genre_performance['total_revenue'], color=[cmap.to_rgba(x) for x in genre_performance['total_revenue']])

# Add titles and labels
plt.title('Total Revenue by Genre')
plt.xlabel('Genre')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)

# Save the plot with a white background
plt.savefig(r'..\figures\indie_total_rev_by_genre_cfg.png', format="png", dpi=300, bbox_inches='tight', transparent=False, facecolor='white')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Check the columns in indie_df to make sure we have the right playtime columns
print(indie_df.columns)

# List of playtime columns to be checked
playtime_columns = ['average playtime forever', 'average playtime two weeks', 
                    'median playtime forever', 'median playtime two weeks']

# Check if the required columns exist in indie_df
missing_columns = [col for col in playtime_columns if col not in indie_df.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("All playtime columns are present.")

# Proceed if all columns are present
if not missing_columns:
    # Melt the DataFrame for each playtime column, creating a long-format DataFrame
    melted_df = indie_df.melt(id_vars=['name'], 
                              value_vars=playtime_columns,
                              var_name='playtime_type', 
                              value_name='playtime')

    # Now, we need to map each game to its genres. This is a bit tricky since each game can belong to multiple genres.
    # Assuming you have genre columns where each genre is a separate column, we'll keep track of them.
    genre_columns = ['Early Access', 'Free to Play', 'Simulation', 'Sports', 'Action', 
                     'Strategy', 'Adventure', 'RPG', 'Massively Multiplayer', 'Casual']
    
    # Create a new DataFrame with 'name' and the genre columns
    genre_df = indie_df[['name'] + genre_columns]  # Include the 'name' column here

    # Create a genre column that combines all genre columns (a one-hot encoding style)
    # Convert the genre columns into a single "genre" column with corresponding game names
    genre_melted = genre_df.melt(id_vars=['name'], var_name='genre', value_name='is_genre')

    # Only keep rows where 'is_genre' is 1, i.e., where the game belongs to the genre
    genre_melted = genre_melted[genre_melted['is_genre'] == 1]

    # Merge the playtime data with genre data based on the 'name' column
    merged_df = pd.merge(melted_df, genre_melted[['name', 'genre']], on='name', how='left')

    # Now, let's plot the data for each of the playtime variables for each genre
    plt.figure(figsize=(14, 10))

    # Plot for Average Playtime Forever
    plt.subplot(2, 2, 1)
    sns.barplot(x='genre', y='playtime', hue='playtime_type', data=merged_df[merged_df['playtime_type'] == 'average playtime forever'], palette='viridis')
    plt.title('Average Playtime Forever by Genre')
    plt.xlabel('Genre')
    plt.ylabel('Average Playtime (hours)')
    plt.xticks(rotation=45, ha='right')

    # Plot for Average Playtime 2 Weeks
    plt.subplot(2, 2, 2)
    sns.barplot(x='genre', y='playtime', hue='playtime_type', data=merged_df[merged_df['playtime_type'] == 'average playtime two weeks'], palette='viridis')
    plt.title('Average Playtime (Last 2 Weeks) by Genre')
    plt.xlabel('Genre')
    plt.ylabel('Average Playtime (hours)')
    plt.xticks(rotation=45, ha='right')

    # Plot for Median Playtime Forever
    plt.subplot(2, 2, 3)
    sns.barplot(x='genre', y='playtime', hue='playtime_type', data=merged_df[merged_df['playtime_type'] == 'median playtime forever'], palette='viridis')
    plt.title('Median Playtime Forever by Genre')
    plt.xlabel('Genre')
    plt.ylabel('Median Playtime (hours)')
    plt.xticks(rotation=45, ha='right')

    # Plot for Median Playtime 2 Weeks
    plt.subplot(2, 2, 4)
    sns.barplot(x='genre', y='playtime', hue='playtime_type', data=merged_df[merged_df['playtime_type'] == 'median playtime two weeks'], palette='viridis')
    plt.title('Median Playtime (Last 2 Weeks) by Genre')
    plt.xlabel('Genre')
    plt.ylabel('Median Playtime (hours)')
    plt.xticks(rotation=45, ha='right')

    # Adjust layout for better spacing
    plt.tight_layout()

    # Save the figure with a white background
    plt.savefig(r'..\figures\playtime_by_genre_cfgd.png', format='png', dpi=300, bbox_inches='tight', transparent=False, facecolor='white')
    plt.show()
else:
    print("Please fix missing columns and try again.")
