In [None]:
import os
import json
import matplotlib.pyplot as plt
import scienceplots

plt.style.use('science')
def get_or_create_value(function_to_apply, file_path):
    """Return the JSON value cached at ``file_path``, computing it on a miss.

    When ``file_path`` exists, its contents are parsed as JSON and returned
    without calling ``function_to_apply``. Otherwise ``function_to_apply()``
    is called with no arguments, its result is written to ``file_path`` as
    JSON, and that same result object is returned.

    :param function_to_apply: Zero-argument callable producing a JSON-serialisable value.
    :param file_path: Path of the JSON cache file.
    :return: The cached value, or the freshly computed one.
    """
    if not os.path.exists(file_path):
        # Cache miss: compute the value and persist it for later runs.
        computed = function_to_apply()
        with open(file_path, 'w') as cache_file:
            json.dump(computed, cache_file)
        return computed

    # Cache hit: reuse what an earlier run stored.
    with open(file_path, 'r') as cache_file:
        return json.load(cache_file)

In [None]:
# Load the game records from the JSON cache file.
# If the file is missing, the fallback lambda yields an empty dict, which
# get_or_create_value also writes to that path.
# NOTE(review): later cells (e.g. the overall-average computation) fail on an
# empty dataset — confirm the file exists before running.
game_data = get_or_create_value(lambda: {}, "game_data_with_vader_sentiment.json")

In [None]:
import re
# Derive an integer "year" for every game from the leading digits of its
# "categorie" string, then drop the raw field.
for game, data in game_data.items():
    if "categorie" not in data and "year" in data:
        # Already converted by an earlier run of this cell; skipping keeps the
        # cell safe to re-run without a KeyError on the popped field.
        continue
    match = re.match(r'^\d+', data["categorie"])
    if match is None:
        # Fail loudly, naming the offending record so it can be fixed at the source.
        raise RuntimeError(
            f"No leading year found in 'categorie' of game {game!r}: {data['categorie']!r}"
        )
    data["year"] = int(match.group())
    data.pop("categorie")

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt
# Average violence score per year, drawn as a simple line chart.

# Running totals keyed by year: summed violence scores and number of games
violence_sum_per_year = defaultdict(float)
count_per_year = defaultdict(int)

for entry in game_data.values():
    entry_year = entry['year']
    violence_sum_per_year[entry_year] += entry["sentiment"]['violence']
    count_per_year[entry_year] += 1

# Mean per year = summed score / number of games in that year
average_violence_per_year = {
    entry_year: total / count_per_year[entry_year]
    for entry_year, total in violence_sum_per_year.items()
}

# Chronological x values with their matching means
sorted_years = sorted(average_violence_per_year)
average_violences = [average_violence_per_year[entry_year] for entry_year in sorted_years]

# Draw the chart
plt.figure(figsize=(10, 5))
plt.plot(sorted_years, average_violences, marker='o')

plt.xlabel('Year')
plt.ylabel('Average Violence Score')
plt.title('Average Violence Scores Over the Years')
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

from collections import defaultdict
import numpy as np

def calculate_sentiment_statistics(game_data: dict, sentiment_key: str):
    """
    Calculate the average and standard deviation of a sentiment score per year.

    Each value of ``game_data`` is expected to be a dictionary with a 'year' entry and a
    nested 'sentiment' dictionary. Entries missing 'year', 'sentiment' or ``sentiment_key``
    are reported with a printed message and excluded from BOTH statistics, so the two
    returned dictionaries always share the same set of years.

    :param game_data: Dictionary of dictionaries with 'year' and nested 'sentiment' dictionary.
    :param sentiment_key: Key for the sentiment score to be calculated (e.g., 'violence', 'compound').
    :return: Two dictionaries with years as keys: average scores, and population
             standard deviations (``np.std`` with its default ``ddof=0``).
    """
    scores_per_year = defaultdict(list)

    # Single pass: bucket every usable score under its year.
    for game in game_data.values():
        try:
            # Read both fields before touching the bucket so an incomplete
            # entry never leaves an empty list behind for its year.
            year = game['year']
            sentiment_score = game["sentiment"][sentiment_key]
        except KeyError as e:
            print(f"Missing key in data: {e}")
            continue
        scores_per_year[year].append(sentiment_score)

    # Both statistics come from the same per-year lists, so they cannot disagree
    # on which entries were included.
    average_sentiment_per_year = {year: sum(scores, 0.0) / len(scores)
                                  for year, scores in scores_per_year.items()}
    std_dev_per_year = {year: np.std(scores)
                        for year, scores in scores_per_year.items()}

    return average_sentiment_per_year, std_dev_per_year



def plot_violence_statistics(average_violence, std_deviation, trend_degree=10, score_name='Violence'):
    """
    Plot per-year average scores with standard-deviation error bars and a polynomial trend line.

    :param average_violence: Dictionary of average scores per year.
    :param std_deviation: Dictionary of standard deviations per year; must contain every
        year present in ``average_violence``.
    :param trend_degree: Maximum degree of the fitted trend polynomial. It is lowered to
        ``number_of_years - 1`` when there are too few points for the requested degree.
    :param score_name: Name of the score used in the title and y-axis label.
    :raises KeyError: If a year in ``average_violence`` is missing from ``std_deviation``.
    """
    # Sort the data by year
    sorted_years = sorted(average_violence.keys())
    average_violences = [average_violence[year] for year in sorted_years]
    std_devs = [std_deviation[year] for year in sorted_years]

    plt.figure(figsize=(12, 6))

    # Plot average scores with error bars
    plt.errorbar(sorted_years, average_violences, yerr=std_devs, fmt='-o',
                 label='Average with Std Dev', color='blue')

    # Fit and plot a trend line; a trend needs at least two points.
    if len(sorted_years) >= 2:
        # Never ask for more coefficients than there are points.
        degree = max(0, min(int(trend_degree), len(sorted_years) - 1))
        # Fit on centred years: powers of raw values around 2000 make the
        # least-squares problem badly conditioned.
        years = np.asarray(sorted_years, dtype=float)
        centered_years = years - years.mean()
        coefficients = np.polyfit(centered_years, average_violences, degree)
        plt.plot(sorted_years, np.polyval(coefficients, centered_years), "r--", label='Trend Line')

    # Labels and title
    plt.title(f'{score_name} Scores Over the Years with Trend and Variability')
    plt.xlabel('Year')
    plt.ylabel(f'{score_name} Score')
    plt.legend()
    plt.grid(True)

    # Show the plot
    plt.show()


# Per-year mean and standard deviation of the 'violence' sentiment score, then plot them.
average_violence, std_deviation = calculate_sentiment_statistics(game_data, "violence")
plot_violence_statistics(average_violence, std_deviation)


In [None]:
# Same per-year statistics and plot, for the 'compound' sentiment score.
# NOTE(review): as called here, plot_violence_statistics labels the title and
# y-axis "Violence", so this figure is labelled as violence although it shows
# compound scores.
average_compound, std_dev_compound = calculate_sentiment_statistics(game_data, 'compound')
plot_violence_statistics(average_compound, std_dev_compound)

In [None]:
# Preview the first five records instead of dumping the whole dataset into the cell output.
dict(list(game_data.items())[:5])

In [None]:
from collections import Counter


# Tally how often each country of development occurs, and count the games
# that list more than one country.
unique_categories = Counter()
counter = 0
for content in game_data.values():
    countries = content['country of development']
    if len(countries) > 1:
        counter += 1
    unique_categories.update(countries)

# Show the per-country tally together with the multi-country game count
unique_categories, counter


In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

# Assuming game_data is a dictionary where the key is the game name and the value is another dictionary
# that includes 'country of development' and 'violence' score within a 'sentiment' sub-dictionary.

# A country gets its own bar only with at least this many games ...
MIN_GAMES_PER_COUNTRY = 100
# ... unless it is listed here, in which case it is always shown on its own.
ALWAYS_SHOWN_COUNTRIES = {"Denmark"}
# Label that is always folded into "Others" (presumably a catch-all value in the data — TODO confirm).
OTHER_LABEL = "other"

# Step 1: Extract relevant data
country_violence_data = defaultdict(list)
for game, details in game_data.items():
    # We assume 'violence' scores are within a 'sentiment' sub-dictionary
    violence_score = details["sentiment"]["violence"]
    # A game developed in several countries contributes its score to each of them
    for country in details["country of development"]:
        country_violence_data[country].append(violence_score)

# Step 2: Count the number of games per country
game_counts = Counter({country: len(scores) for country, scores in country_violence_data.items()})

# Steps 3 and 4: every country is either shown on its own or pooled into "Others".
# One loop with an if/else guarantees the two groups are exact complements; the
# previous separate check ended in `or "other"`, which is always truthy and so
# pooled every country's scores into "Others".
average_violence_scores = {}
other_violence_scores = []
for country, violence_scores in country_violence_data.items():
    shown_individually = (
        (game_counts[country] >= MIN_GAMES_PER_COUNTRY or country in ALWAYS_SHOWN_COUNTRIES)
        and country != OTHER_LABEL
    )
    if shown_individually:
        average_violence_scores[country] = sum(violence_scores) / len(violence_scores)
    else:
        other_violence_scores.extend(violence_scores)
if other_violence_scores:
    average_violence_scores["Others"] = sum(other_violence_scores) / len(other_violence_scores)


# Calculate overall average violence score
# (a game listed under several countries is counted once per country here)
all_scores = [score for scores in country_violence_data.values() for score in scores]
overall_average_violence = sum(all_scores) / len(all_scores)

# Step 5: Plot the average violence scores, bars ordered from lowest to highest average
countries = sorted(average_violence_scores, key=lambda x: average_violence_scores[x])
averages = [average_violence_scores[country] for country in countries]

plt.figure(figsize=(10, 5))
plt.bar(countries, averages)
plt.xlabel('Country of Development')
plt.ylabel('Average Violence Score')
plt.title('Average Violence Score by Country of Development')
plt.xticks(rotation=90)  # Rotate country names for better readability
plt.tight_layout()  # Adjust layout to prevent clipping of tick-labels

# Add a line for the overall average violence level
plt.axhline(y=overall_average_violence, color='r', linestyle='-', label=f'Overall Average ({overall_average_violence:.2f})')
plt.legend()
plt.show()


In [None]:
# Count how many games carry each category label
unique_categories = Counter()
for content in game_data.values():
    unique_categories.update(content['categories'])

# Show the tally as the cell output
unique_categories

In [None]:
# Persist the category tally to disk as JSON
with open("categories_counter.json", 'w') as out_file:
    json.dump(unique_categories, out_file)