# Data Collecting    

## Data Scraping

In [None]:
import os
import json
import pywikibot

def get_subcategories_from_category(category_name):
    """Return the titles of the subcategories of an English Wikipedia category.

    :param category_name: Category title handed to ``pywikibot.Category``.
    :return: List of subcategory titles, in the order pywikibot yields them.
    """
    wiki = pywikibot.Site('en', 'wikipedia')
    parent = pywikibot.Category(wiki, category_name)
    subcategories = list(parent.subcategories())
    titles = []
    for subcategory in subcategories:
        titles.append(subcategory.title())
    return titles

def get_games_from_category(category_name):
    """Return the article pages of an English Wikipedia category as a list.

    :param category_name: Category title handed to ``pywikibot.Category``.
    :return: List of the page objects yielded by ``Category.articles()``.
    """
    wiki = pywikibot.Site('en', 'wikipedia')
    return list(pywikibot.Category(wiki, category_name).articles())

def save_to_json(page, subcategory):
    """Write one page to ``./data/<subcategory>/<title>.json``.

    The JSON object holds the page text, the subcategory (under the key
    ``"categorie"``), the page title and its full URL.

    :param page: Object exposing ``text``, ``title()`` and ``full_url()``.
    :param subcategory: Category name; only the part after the last ``:`` is
        used for the directory name, the full value is stored in the file.
    """
    data = {
        "text": page.text,
        "categorie": subcategory,
        "title": page.title(),
        "url": page.full_url(),
    }

    directory = os.path.join('./data', subcategory.split(':')[-1])
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(directory, exist_ok=True)

    # '/' in a title would otherwise be read as a path separator
    filepath = os.path.join(directory, page.title().replace('/', '_') + '.json')
    with open(filepath, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# Root category; its subcategories are fetched below
category_name = 'Category:Video games by year'
game_categories = get_subcategories_from_category(category_name)

# Visit each subcategory and dump every article page it lists to disk
for game_category in game_categories:
    # Skip subcategories that are not game listings (avoiding 'by decade' etc.)
    if 'video games' not in game_category.lower():
        continue
    game_pages = get_games_from_category(game_category)
    for page in game_pages:
        save_to_json(page, game_category.split(':')[-1])



## Data processing 

In [None]:
import os
import json
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def get_all_page_names_from_json(filepath):
    """Return the value stored under ``'title'`` in the JSON file at *filepath*."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return json.load(handle)['title']



base_directory = 'project/data'
all_page_names = []

# Step 1: Get all page names
with ThreadPoolExecutor() as executor:
    # One read task per JSON file found under <base_directory>/<subcategory>/
    futures = [executor.submit(get_all_page_names_from_json, os.path.join(base_directory, subcategory, json_file))
               for subcategory in os.listdir(base_directory)
               for json_file in os.listdir(os.path.join(base_directory, subcategory))]

    # Results are gathered in submission order
    for future in tqdm(futures, desc="Fetching Page Names", unit="files"):
        all_page_names.append(future.result())

# Save all page names to CSV: one header row, then one title per row
with open('all_page_names.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Page Names'])
    writer.writerows([page_name] for page_name in all_page_names)


In [None]:
import os
import json
import re
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def extract_outlinks(text):
    """Extract the target page titles of the wiki links in *text*.

    For ``[[Target|label]]`` the *target* is returned (the label is only
    display text and does not identify a page). A ``#section`` suffix is
    dropped, surrounding whitespace is stripped, and links that point only to a
    section of the current page (``[[#Section]]``) are skipped. Brackets are
    excluded from both parts of the pattern so that a link nested inside
    another construct (e.g. an image caption) is still found on its own.

    :param text: Raw wikitext.
    :return: List of link targets in order of appearance (duplicates kept).
    """
    targets = re.findall(r'\[\[([^\[\]|]+)(?:\|[^\[\]]*)?\]\]', text)
    cleaned = (target.split('#')[0].strip() for target in targets)
    return [title for title in cleaned if title]

def extract_categories(text):
    """Extract category names from ``[[Category:Name]]`` links in *text*.

    An optional sort key (``[[Category:Name|sort key]]``) is not part of the
    category name and is dropped; surrounding whitespace is stripped.

    :param text: Raw wikitext.
    :return: List of category names in order of appearance.
    """
    names = re.findall(r'\[\[Category:([^\]|]*)(?:\|[^\]]*)?\]\]', text)
    return [name.strip() for name in names]

def get_all_page_names_from_json(filepath):
    """Read the JSON file at *filepath* and return its ``'title'`` entry."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        record = json.load(handle)
    title = record['title']
    return title

# Generic function to update JSON files
def update_json(filepath, update_func, *args):
    """Load a JSON file, let *update_func* modify it, and write it back in place.

    :param filepath: Path of the JSON file to rewrite.
    :param update_func: Callable invoked as ``update_func(data, *args)``; it is
        expected to mutate ``data`` since its return value is ignored.
    :param args: Extra positional arguments forwarded to *update_func*.
    """
    with open(filepath, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    update_func(payload, *args)

    with open(filepath, 'w', encoding='utf-8') as target:
        json.dump(payload, target, ensure_ascii=False, indent=4)


def update_json_with_country_of_development(data):
    """Set ``data['country of development']`` from the entry's categories.

    Every category starting with "Video games developed in " contributes the
    remainder of its name. When no category matches (or ``'categories'`` is
    absent) the value is ``["other"]``.

    :param data: Game dictionary, modified in place.
    """
    candidates = (re.match(r'Video games developed in (.+)', category)
                  for category in data.get('categories', []))
    countries = [found.group(1) for found in candidates if found]
    if not countries:
        countries = ["other"]
    data['country of development'] = countries
    
                  
# Specific update methods
def filter_outlinks(data, all_page_names):
    """Store under ``data['outpages']`` the outlinks of ``data['text']`` found in *all_page_names*.

    :param data: Game dictionary with a ``'text'`` entry, modified in place.
    :param all_page_names: Container of known page titles (membership-tested).
    """
    kept = []
    for link in extract_outlinks(data['text']):
        if link in all_page_names:
            kept.append(link)
    data['outpages'] = kept

def add_categories(data):
    """Store the categories extracted from ``data['text']`` under ``data['categories']``."""
    categories = extract_categories(data['text'])
    data['categories'] = categories


def integration_test_on_one_file(base_directory, all_page_names):
    """Run the outlink and category updates on a single file for manual inspection.

    The file used is the first entry of the first subdirectory that
    ``os.listdir`` reports; it is rewritten in place.

    :param base_directory: Directory containing one subdirectory per category.
    :param all_page_names: Known page titles, forwarded to ``filter_outlinks``.
    """
    first_subcategory = os.listdir(base_directory)[0]
    first_file = os.listdir(os.path.join(base_directory, first_subcategory))[0]
    test_filepath = os.path.join(base_directory, first_subcategory, first_file)

    print("Running integration test on:", test_filepath)
    for step in ((filter_outlinks, all_page_names), (add_categories,)):
        update_json(test_filepath, *step)
    print("Test complete. Please verify", test_filepath)


def load_csv_as_list(filepath):
    """Return the data rows of a CSV file, skipping its first (header) row.

    :param filepath: Path of a UTF-8 encoded CSV file.
    :return: List of rows; each row is itself a list of column strings (so a
        one-column file yields ``[["a"], ["b"], ...]``). An empty file gives ``[]``.
    """
    # newline='' lets the csv module handle line endings itself; without it,
    # newlines embedded in quoted fields are not read back correctly.
    with open(filepath, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row if your CSV has one
        next(reader, None)
        return list(reader)




base_directory = 'project/data'

# Step 2: Update JSON files with filtered outlinks and categories
json_files = [os.path.join(base_directory, subcategory, json_file)
              for subcategory in os.listdir(base_directory)
              for json_file in os.listdir(os.path.join(base_directory, subcategory))]

# Each pass is drained by list(...) before the next one starts, so a later pass
# reads the fields written by the earlier ones (countries need 'categories').
update_passes = [
    ("Filtering Outlinks", (filter_outlinks, all_page_names)),
    ("Adding Categories", (add_categories,)),
    ("Adding cantry of development", (update_json_with_country_of_development,)),
]

with ThreadPoolExecutor() as executor:
    for description, call in update_passes:
        # call=call binds the current pass to the lambda
        results = executor.map(lambda x, call=call: update_json(x, *call), json_files)
        list(tqdm(results, total=len(json_files), desc=description, unit="files"))

# # Run integration test
integration_test_on_one_file(base_directory, all_page_names)


### extracting subsection

In [None]:
import os
import json
import mwparserfromhell

def parse_wiki_text_to_sections(wiki_text):
    """Split wikitext into plain-text sections keyed by lower-cased heading title.

    Text before the first heading is stored under ``'introduction'``. Every
    heading node starts a new section. When the same heading title occurs more
    than once (or a heading is itself titled "introduction"), the texts are
    concatenated under that key; previously a repeated title reset the section
    and discarded the text collected so far.

    :param wiki_text: Raw wikitext of one page.
    :return: Dict mapping section title to the section text with markup removed
        by ``strip_code()``.
    """
    # Parse the text with mwparserfromhell
    wikicode = mwparserfromhell.parse(wiki_text, skip_style_tags=True)

    current_section = 'introduction'
    raw_sections = {current_section: ''}

    for node in wikicode.nodes:
        if isinstance(node, mwparserfromhell.nodes.heading.Heading):
            current_section = str(node.title).strip().lower()
            # setdefault keeps text already gathered under the same title
            raw_sections.setdefault(current_section, '')
        else:
            raw_sections[current_section] += str(node)

    # Re-parse each section on its own so the markup can be stripped
    return {section: mwparserfromhell.parse(text).strip_code()
            for section, text in raw_sections.items()}

def load_json_data_to_dict(base_directory):
    """Load every ``.json`` file below *base_directory* into one dict keyed by title.

    Each record's ``"text"`` entry is replaced by the section dictionary that
    ``parse_wiki_text_to_sections`` builds from it. Records sharing a title
    overwrite one another (the last one read wins).

    :param base_directory: Root directory, walked recursively.
    :return: Dict mapping ``record['title']`` to the record.
    """
    records_by_title = {}
    for subdir, _dirs, files in os.walk(base_directory):
        for name in files:
            if not name.endswith('.json'):
                continue
            with open(os.path.join(subdir, name), 'r', encoding='utf-8') as handle:
                record = json.load(handle)
                record["text"] = parse_wiki_text_to_sections(record["text"])
                records_by_title[record['title']] = record
    return records_by_title

# Replace with the actual path to your data directory
base_directory = './data'  # Update with the actual path
data_file = 'game_data.json'

# Reuse the cached, already-parsed data when present; otherwise build it from
# the per-page JSON files and cache it for the next run.
if os.path.exists(data_file):
    # The cache is written as UTF-8 with ensure_ascii=False, so it must be read
    # as UTF-8 too instead of relying on the platform default encoding.
    with open(data_file, encoding='utf-8') as f:
        json_data_dict = json.load(f)
else:
    json_data_dict = load_json_data_to_dict(base_directory)
    # Write to data_file (not a repeated literal) so the existence check above
    # and the write always refer to the same path.
    with open(data_file, 'w', encoding='utf-8') as json_file:
        json.dump(json_data_dict, json_file, ensure_ascii=False, indent=4)

# Sentiment analysis

## Helper functions

In [None]:
import json

# Path of the aggregated game data file, relative to the working directory
data_file = '../game_data.json'


# Load the whole file into memory; game_data is used by the cells below
with open(data_file, encoding="utf-8") as f:
    game_data = json.load(f)

In [None]:
import matplotlib.pyplot as plt

def create_histogram(data, title, xlabel, ylabel, color='tab:blue', figuresize=(10, 6)):
    """Show a bar chart of a ``{numeric label: value}`` mapping.

    The bar width is derived from the label range so that bars do not overlap
    regardless of how many labels there are.

    :param data: Mapping from numeric x positions to bar heights.
    :param title: Figure title.
    :param xlabel: Label of the x axis.
    :param ylabel: Label of the y axis.
    :param color: Bar fill colour.
    :param figuresize: Figure size passed to ``plt.figure``.
    :raises ValueError: If *data* is empty.
    """
    if not data:
        raise ValueError("create_histogram() needs at least one data point")

    labels = list(data.keys())
    values = list(data.values())

    label_span = max(labels) - min(labels)
    # A single label (or identical labels) gives a zero span, which would draw
    # zero-width, invisible bars; fall back to matplotlib's default width.
    bar_width = label_span / (1.5 * len(labels)) if label_span else 0.8

    plt.figure(figsize=figuresize)
    plt.bar(labels, values, color=color, edgecolor='black', width=bar_width)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
import json


# Single analyzer instance, used by get_sentiment_scores below
analyzer = SentimentIntensityAnalyzer()
# Weights that get_sentiment_scores passes to get_violence_score:
# weight_neg scales the negative score, weight_compound the positive score
weight_neg = 0.7
weight_compound = 1-  weight_neg 

def normalize_score(score, old_min, old_max, new_min, new_max):
    """Linearly map *score* from the range [old_min, old_max] onto [new_min, new_max].

    Values outside the old range are extrapolated, not clipped.
    """
    fraction = (score - old_min) / (old_max - old_min)
    return fraction * (new_max - new_min) + new_min

def get_violence_score(neg_score, pos_score, weight_neg, weight_compound):
    """Combine a negative and a positive sentiment score into one violence score.

    The negative score is rescaled so that 0.35 maps to 1 and the positive
    score so that 0.3 maps to 1 (larger inputs map above 1). The result is
    ``scaled_neg * weight_neg - scaled_pos * weight_compound``.

    :param neg_score: Negative sentiment score.
    :param pos_score: Positive sentiment score.
    :param weight_neg: Weight of the rescaled negative score.
    :param weight_compound: Weight of the rescaled positive score.
    :return: The weighted difference described above.
    """
    scaled_neg = normalize_score(neg_score, 0, 0.35, 0, 1)
    scaled_pos = normalize_score(pos_score, 0, 0.3, 0, 1)
    return scaled_neg * weight_neg - scaled_pos * weight_compound


def get_sentiment_scores(data, subsections = ["gameplay"]):
    """Score the selected sections of every game with the module-level analyzer.

    For each game the texts of the requested sections that exist are joined
    with newlines and passed to ``analyzer.polarity_scores``; a ``"violence"``
    entry computed by ``get_violence_score`` is added to the result.

    :param data: Dict mapping title to a record whose ``'text'`` is a dict of sections.
    :param subsections: Section names to include.
    :return: Dict mapping title to its score dictionary.
    """
    def score_one(content):
        sections = content['text']
        text = "\n".join([sections[name] for name in subsections if name in sections])
        scores = analyzer.polarity_scores(text)
        scores["violence"] = get_violence_score(scores["neg"], scores["pos"], weight_neg, weight_compound)
        return scores

    return {title: score_one(content) for title, content in data.items()}

# Function to create a histogram of sentiment scores
def create_sentiment_histograms(sentiment_data, sentiments_to_plot=None):
    """Show one histogram per requested sentiment key, laid out in two columns.

    :param sentiment_data: Dict mapping game title to a dict of scores.
    :param sentiments_to_plot: Score keys to plot; defaults to
        ``['pos', 'neu', 'neg', 'compound']``. An empty list plots nothing.
    """
    # Default to all sentiment types if none are specified
    if sentiments_to_plot is None:
        sentiments_to_plot = ['pos', 'neu', 'neg', 'compound']
    if not sentiments_to_plot:
        return  # nothing requested; a 0-row grid cannot be created

    num_plots = len(sentiments_to_plot)
    cols = 2  # We prefer a 2-column layout
    rows = (num_plots + cols - 1) // cols  # Ceiling division

    # squeeze=False always returns a 2-D array of axes, so a single-row grid is
    # handled like any other. Wrapping the 1-D array of a single row in a list
    # made axes[0] an array instead of an Axes.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.flatten()

    fig.suptitle('Sentiment Analysis Histograms')

    # Plotting each requested sentiment
    for ax, sentiment in zip(axes, sentiments_to_plot):
        scores = [details[sentiment] for details in sentiment_data.values()]
        ax.hist(scores, bins=200, color='tab:blue', edgecolor='black')
        ax.set_title(f'{sentiment.capitalize()} Sentiment Score')
        ax.set_xlabel('Sentiment Score')
        ax.set_ylabel('Number of Games')

    # Turn off any unused subplots (the last cell when the count is odd)
    for unused_ax in axes[num_plots:]:
        unused_ax.axis('off')

    # Adjust layout for better spacing, leaving room for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

def print_top_bottom_sentiment_games(sentiment_scores, key='compound', n = 10):
    """Print the *n* lowest- and *n* highest-scoring games for one sentiment key.

    Both lists are printed in ascending score order.

    :param sentiment_scores: Dict mapping game title to a dict of scores.
    :param key: Score to rank by.
    :param n: Number of games per list; values below 0 are treated as 0.
    """
    # Sorting the games based on the selected sentiment score (ascending)
    sorted_games = sorted(sentiment_scores.items(), key=lambda x: x[1][key])
    n = max(n, 0)

    # Printing the n most negative games
    print(f"{n} Most Negative Games in terms of {key} sentiment:")
    for game, score in sorted_games[:n]:
        print(f"{game}: {score}")

    print("\n")  # Adding a newline for better readability

    # Printing the n most positive games. An explicit start index is used
    # because sorted_games[-0:] is the whole list, not an empty one.
    start = max(len(sorted_games) - n, 0)
    print(f"{n} Most Positive Games in terms of {key} sentiment:")
    for game, score in sorted_games[start:]:
        print(f"{game}: {score}")

def is_subsection_length_valid(data, subsections, min_length, max_length):
    """
    Check if the combined word count of the given subsections is within a range.

    :param data: The data dictionary of a game (sections live under 'text')
    :param subsections: The subsection names to combine; a single name may also
        be given as a plain string
    :param min_length: The minimum number of words (inclusive)
    :param max_length: The maximum number of words (inclusive)
    :return: True if the word count is within range, False otherwise
    """
    # A bare string would otherwise be iterated character by character, so no
    # section would ever be found.
    if isinstance(subsections, str):
        subsections = [subsections]
    sections = data.get('text', {})
    combined_text = "\n".join(sections.get(name, "") for name in subsections)
    word_count = len(combined_text.split())
    return min_length <= word_count <= max_length

def count_long_subsections(game_data, subsection='gameplay', min_length=0, max_length=float('inf')):
    """
    Counts the entries of game_data whose given subsection has a word count
    within the given range.

    :param game_data: Dictionary containing game information
    :param subsection: The subsection name to look for (default is 'gameplay');
        a list of names is also accepted, in which case their texts are combined
    :param min_length: The minimum number of words to count (default is 0)
    :param max_length: The maximum number of words to count (default is infinity)
    :return: The number of entries whose word count is within the range
    """
    # The helper iterates over section names; hand it a list so a single name
    # is not split into its characters.
    subsections = [subsection] if isinstance(subsection, str) else subsection
    return sum(is_subsection_length_valid(data, subsections, min_length, max_length)
               for data in game_data.values())

def filter_entries_by_length(game_data, subsections=['gameplay'], min_length=0, max_length=float('inf')):
    """
    Build a dictionary with only those entries of game_data for which
    is_subsection_length_valid accepts the given subsections and range.

    :param game_data: Dictionary containing game information
    :param subsections: The subsection names to check (default is ['gameplay'])
    :param min_length: Lower bound passed to the check (default is 0)
    :param max_length: Upper bound passed to the check (default is infinity)
    :return: A new dictionary with the accepted entries (values are not copied)
    """
    kept = {}
    for game, data in game_data.items():
        if is_subsection_length_valid(data, subsections, min_length, max_length):
            kept[game] = data
    return kept


def get_or_create_value(function_to_apply, file_path):
    """Return the JSON value cached at *file_path*, creating it on first use.

    :param function_to_apply: Zero-argument callable producing a
        JSON-serialisable value; only called when the file does not exist.
    :param file_path: Location of the JSON cache file.
    :return: The loaded value if the file exists, else the freshly computed one.
    """
    if not os.path.exists(file_path):
        # Cache miss: compute, persist, and hand back the computed object
        value = function_to_apply()
        with open(file_path, 'w') as cache:
            json.dump(value, cache)
        return value

    with open(file_path, 'r') as cache:
        return json.load(cache)


## Creating the sentiment dictionary

In [None]:
# Word-count bounds (inclusive) for the combined text of the selected sections
constrains = {
     "min_length": 100,
    "max_length": 10000
}
# Sections whose text is scored. Each name is listed once: a repeated name
# makes that section's text be counted and scored twice.
# NOTE(review): 'plot and gameplay' was listed twice; possibly a second variant
# such as 'gameplay and plot' was intended - confirm.
subsections = ['gameplay','plot','story','synopsis','plot and gameplay']

filtered_by_length = filter_entries_by_length(game_data, subsections=subsections, **constrains)
# Scores are cached on disk; delete the file to force a recomputation
sentiment_filtered_by_length = get_or_create_value( lambda: get_sentiment_scores(filtered_by_length, subsections), file_path= "filtered_gameplay_sentiment.json")

In [None]:
# Rescale every violence score to [0, 1] over the whole data set
violence = [sentiment["violence"] for sentiment in sentiment_filtered_by_length.values()]
if violence:
    # The bounds come from the scores collected above, so compute them once
    # instead of rescanning the whole list for every game.
    min_violence, max_violence = min(violence), max(violence)
    for game, sentiment in sentiment_filtered_by_length.items():
        sentiment["violence"] = normalize_score(sentiment["violence"], min_violence, max_violence, 0, 1)

In [None]:
# Plot the distribution of the four analyzer scores plus the derived violence score
create_sentiment_histograms(sentiment_data=sentiment_filtered_by_length, sentiments_to_plot= ['pos', 'neu', 'neg', 'compound', "violence"])

In [None]:
# Attach each game's sentiment scores to its record under the 'sentiment' key.
# The result is cached in game_data_with_vader_sentiment.json; when that file
# already exists it is loaded and the lambda is not evaluated.
game_data_with_vader_sentiment_on_gameplay = get_or_create_value(lambda: {
    game: {**data, 'sentiment': sentiment_filtered_by_length[game]}
    for game, data in filtered_by_length.items() }, "game_data_with_vader_sentiment.json")

## Testing different scores

In [None]:
import pandas as pd

def compare_violence_scores(data, most_violent, least_violent):
    """Tabulate the mean scores of two reference groups of games.

    Games that are not present in *data* are ignored. A group with no known
    game gets an average of 0 for every score.

    :param data: Dict mapping game title to a dict with the keys
        'neg', 'neu', 'pos', 'compound' and 'violence'.
    :param most_violent: Titles of the games regarded as most violent.
    :param least_violent: Titles of the games regarded as least violent.
    :return: DataFrame with one row per score and the columns 'Score',
        'Average Most Violent', 'Average Least Violent' and
        'Difference (Most - Least)'.
    """
    score_names = ['neg', 'neu', 'pos', 'compound', "violence"]

    def group_means(titles):
        # Score dictionaries of the titles that exist in the data set
        known = [data[title] for title in titles if title in data]
        means = []
        for name in score_names:
            values = [scores[name] for scores in known]
            means.append(sum(values) / len(values) if values else 0)
        return means

    table = pd.DataFrame({
        'Score': score_names,
        'Average Most Violent': group_means(most_violent),
        'Average Least Violent': group_means(least_violent),
    })
    table['Difference (Most - Least)'] = table['Average Most Violent'] - table['Average Least Violent']
    return table

# Hand-picked reference titles used to sanity-check the violence score; a title
# only contributes if it is a key of the scored data set.
most_violent = ["Doom (2016 video game)", "Grand Theft Auto V", "Mortal Kombat (1992 video game)", "God of War (2005 video game)", "Manhunt (video game)", "Gears of War (video game)", "Call of Duty 4: Modern Warfare", "Dead Space (2008 video game)", "Resident Evil (1996 video game)", "Hotline Miami"]
least_violent = ["Animal Crossing: New Horizons", "Stardew Valley", "The Sims 4", "Minecraft", "Tetris", "Monument Valley (video game)", "Super Mario Odyssey", "Journey (2012 video game)", "Katamari Damacy", "Fez (video game)"]
# The returned table is the cell's output
compare_violence_scores(sentiment_filtered_by_length, most_violent, least_violent)

## Analyzing sentiment

In [None]:
import os
import json
import matplotlib.pyplot as plt
import scienceplots

# Use the 'science' plot style for all following figures
# (presumably made available by the scienceplots import above - confirm)
plt.style.use('science')

In [None]:
# Load the cached game data with sentiment scores. If the file is missing,
# get_or_create_value writes an empty dict to it and returns that instead.
game_data = get_or_create_value(lambda: {}, "game_data_with_vader_sentiment.json")

### extracting production year from category

In [None]:
import re
# Replace each record's 'categorie' string by an integer 'year' taken from the
# digits at the start of that string.
for game, data in game_data.items():
    # Already converted by an earlier run of this cell: 'categorie' is gone, so
    # skip instead of failing with a KeyError.
    if "categorie" not in data and "year" in data:
        continue
    match = re.match(r'^\d+', data["categorie"])
    if match is None:
        # Say which entry is malformed instead of raising a bare RuntimeError
        raise RuntimeError(
            f"Cannot extract a production year for {game!r} from category {data['categorie']!r}"
        )
    data["year"] = int(match.group())
    data.pop("categorie")

## Sentiment per Year

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

from collections import defaultdict
import numpy as np

def calculate_sentiment_statistics(game_data: dict, sentiment_key: str):
    """
    Calculate the average and standard deviation of one sentiment score per year.

    Entries lacking 'year', 'sentiment' or the requested score are reported on
    stdout and left out of both statistics. The scores are grouped by year in a
    single pass, so the standard deviation is computed from exactly the entries
    that went into the average.

    :param game_data: Dictionary of dictionaries with 'year' and nested 'sentiment' dictionary.
    :param sentiment_key: Key for the sentiment score to be calculated (e.g., 'violence', 'compound').
    :return: Two dictionaries with years as keys: average scores, and
        (population) standard deviations as computed by ``np.std``.
    """
    scores_per_year = defaultdict(list)

    # Group the scores by year, skipping incomplete entries
    for game in game_data.values():
        try:
            year = game['year']
            sentiment_score = game["sentiment"][sentiment_key]
        except KeyError as e:
            print(f"Missing key in data: {e}")
            continue
        scores_per_year[year].append(sentiment_score)

    average_sentiment_per_year = {year: sum(scores) / len(scores)
                                  for year, scores in scores_per_year.items()}
    std_dev_per_year = {year: np.std(scores)
                        for year, scores in scores_per_year.items()}

    return average_sentiment_per_year, std_dev_per_year



def plot_violence_statistics(average_violence, std_deviation):
    """
    Show the yearly average violence score with error bars and a linear trend.

    :param average_violence: Dictionary of average violence scores per year.
    :param std_deviation: Dictionary of standard deviations per year; must
        contain every year present in average_violence.
    """
    years = sorted(average_violence.keys())
    means = [average_violence[year] for year in years]
    spreads = [std_deviation[year] for year in years]

    plt.figure(figsize=(12, 6))

    # Yearly averages, with the standard deviation drawn as error bars
    plt.errorbar(years, means, yerr=spreads, fmt='-o',
                 label='Average with Std Dev', color='blue')

    # Degree-1 least-squares fit through the yearly averages
    trend = np.poly1d(np.polyfit(years, means, 1))
    plt.plot(years, trend(years), "r--", label='Trend Line')

    plt.title('Violence Scores Over the Years with Trend and Variability')
    plt.xlabel('Year')
    plt.ylabel('Violence Score')
    plt.legend()
    plt.grid(True)

    plt.show()


# Per-year mean and standard deviation of the 'violence' score, then the plot
average_violence, std_deviation = calculate_sentiment_statistics(game_data, "violence")
plot_violence_statistics(average_violence, std_deviation)

## Sentiment per country

In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

# Assuming game_data is a dictionary where the key is the game name and the value is another dictionary
# that includes 'country of development' and 'violence' score within a 'sentiment' sub-dictionary.

# Step 1: Collect the violence scores per country. A game developed in several
# countries contributes its score to each of them.
country_violence_data = defaultdict(list)
for game, details in game_data.items():
    violence_score = details["sentiment"]["violence"]
    for country in details["country of development"]:
        country_violence_data[country].append(violence_score)

# Step 2: Count the number of games per country
game_counts = Counter({country: len(scores) for country, scores in country_violence_data.items()})

# Steps 3 and 4: Countries with at least 100 games (and Denmark) get their own
# bar; every other country, including the "other" placeholder, is pooled into
# 'Others'. The previous pooling test ended in `or "other"`, which is always
# true, so every country's scores were added to 'Others' as well.
average_violence_scores = {}
other_violence_scores = []
for country, violence_scores in country_violence_data.items():
    has_own_bar = (game_counts[country] >= 100 or country == "Denmark") and country != "other"
    if has_own_bar:
        average_violence_scores[country] = sum(violence_scores) / len(violence_scores)
    else:
        other_violence_scores.extend(violence_scores)
if other_violence_scores:
    average_violence_scores["Others"] = sum(other_violence_scores) / len(other_violence_scores)


# Calculate overall average violence score (over all country/score pairs)
all_scores = [score for scores in country_violence_data.values() for score in scores]
overall_average_violence = sum(all_scores) / len(all_scores)

# Step 5: Plot the average violence scores, lowest average first
countries = sorted(average_violence_scores, key=lambda x: average_violence_scores[x])
averages = [average_violence_scores[country] for country in countries]

plt.figure(figsize=(10, 5))
plt.bar(countries, averages)
plt.xlabel('Country of Development')
plt.ylabel('Average Violence Score')
plt.title('Average Violence Score by Country of Development')
plt.xticks(rotation=90)  # Rotate country names for better readability
plt.tight_layout()  # Adjust layout to prevent clipping of tick-labels

# Add a line for the overall average violence level
plt.axhline(y=overall_average_violence, color='r', linestyle='-', label=f'Overall Average ({overall_average_violence:.2f})')
plt.legend()
plt.show()