# Part 1: Data Collection (Ubisoft - External)
- Collecting posts that mention Ubisoft across the subreddits we are scraping
- Adding prerequisites for the data being collected
    - Post must be more than 20 words
    - Account must be more than 1 week old
    - Account must have more than 10 karma
    - Posts will be collected from the past 1 year


## (1) Import necessary libraries

In [7]:
%pip install praw

Note: you may need to restart the kernel to use updated packages.


In [1]:
import csv
import os
import re
import time
from datetime import datetime, timezone
from typing import Dict, List, Tuple

import pandas as pd
import praw
from dotenv import load_dotenv


In [2]:
# Load variables through python-dotenv, then read the Reddit API credentials
# from the process environment.
load_dotenv()

# os.getenv returns None for any variable that is not set, so a missing
# credential is not reported here.
# NOTE(review): USERNAME is also a standard OS variable on Windows; confirm the
# value picked up is the Reddit username and not the logged-in OS account.
CLIENT_ID, CLIENT_SECRET, USER_AGENT, USERNAME, PASSWORD = (
    os.getenv(variable_name)
    for variable_name in (
        "CLIENT_ID",
        "CLIENT_SECRET",
        "USER_AGENT",
        "USERNAME",
        "PASSWORD",
    )
)

print("Env variables loaded")

Env variables loaded


In [3]:
# initialise connection with reddit
reddit = praw.Reddit(client_id=CLIENT_ID, 
                     client_secret=CLIENT_SECRET, 
                     user_agent=USER_AGENT, 
                     username=USERNAME, 
                     password=PASSWORD)

Version 7.7.1 of praw is outdated. Version 7.8.1 was released Friday October 25, 2024.


## (2) Data Scraping
In this subsection, we focus mainly on the <strong>EXTERNAL</strong> factors behind why Ubisoft might have failed as a company. We begin with a targeted, unbiased search for everything related to Ubisoft and its games within specific subreddits, using neutral keywords. The subreddits we focus on are:

- r/Ubisoft 
- r/AssassinsCreed
- r/Rainbow6
- r/TheDivision
- r/GhostRecon
- r/ForHonorGame
- r/WatchDogs
- r/Gaming
- r/Games
- r/PCGaming
- r/VideoGameNews
- r/Steam
- r/CrackWatch (for checking how well Ubisoft monitors piracy + bug fixing issues)

The keywords used to target this search are:

- (Ubisoft) Ubisoft, Ubi Soft, ubisoft, ubi, ubisofts, Ubisofts, ubis
- (Locations) Ubisoft [geographic location]
- (Platforms) Uplay, UConnect, Ubisoft Connect, Ubisoft Store, Login, PC, Playstation, Xbox, Nintendo Switch, Luna
- (Events) Ubisoft Forward, E3, Showcase, EA + Ubisoft
- (Games/Acronyms) AC, Assassin's Creed, Far Cry, FC5, FC6, Tom Clancy, R6S, R6 Siege, R6 Extraction, Tom Clancy's Rainbow Six, Tom Clancy's The Division, Watch Dogs, Ghost Recon, Just Dance, Prince of Persia, Splinter Cell (this will mainly be used in the r/gaming and more general subreddits), Skull and Bones, Beyond Good and Evil, Beyond Good and Evil 2, Riders Republic, Immortals Fenyx Rising, AC Valhalla, AC Origins
- (Game Features) Updates, DLC, Expansion, Patch, season pass, Open World, RPG, multiplayer, speed



In [5]:
import re
import time
from datetime import datetime
from typing import List, Dict
import pandas as pd

# Subreddits dedicated to Ubisoft or to one of its franchises
ubisoft_subreddits = [
    'Ubisoft', 'assassinscreed', 'Rainbow6', 'GhostRecon', 'thedivision',
    'farcry', 'farcry5', 'farcry6', 'watch_dogs', 'forhonor',
    'Splintercell', 'PrinceOfPersia', 'JustDance', 'Steep', 'TrialsGames',
    'anno', 'FenyxRising', 'SkullAndBones', 'ACValhalla',
    'AssassinsCreedOdyssey', 'AssassinsCreedOrigins'
]

# General gaming / platform subreddits
gaming_subreddits = [
    'gaming', 'pcgaming', 'PS4', 'XboxOne', 'NintendoSwitch',
    'Steam', 'PS5', 'XboxSeriesX'
]

subreddits = ubisoft_subreddits + gaming_subreddits

# Case-insensitive whole-word match for the company name and its variants:
# "ubi", "ubis", "ubisoft", "ubisofts", "ubi soft" and "ubisf".
# The optional trailing "s" covers the plural/possessive-without-apostrophe
# spellings listed as target keywords, which the previous pattern rejected.
# The word boundaries keep words such as "ubiquitous" from matching.
ubisoft_pattern = re.compile(r'\bubi(?:\s?soft|sf)?s?\b', re.IGNORECASE)

# Cut-off for the collection window: Unix time one year (365 days) before now
now = int(time.time())
one_year_ago = now - 365 * 24 * 60 * 60

def collect_posts_multiple_methods(subreddits: List[str], after_timestamp: int) -> List[Dict]:
    """
    Collects Ubisoft-related posts from the given subreddits using several listings.

    Every subreddit is read through each listing in ``sorting_methods``. A post
    is kept when it was created at or after ``after_timestamp``, its title or
    body matches the module-level ``ubisoft_pattern``, and title plus body
    contain more than 20 words.

    Parameters:
    - subreddits: list of subreddit names to collect posts from.
    - after_timestamp: Unix time; posts created before it are ignored.

    Returns:
    - List of dictionaries, one per kept post, with the keys id, author,
      date_created (formatted in UTC), title, score, num_of_comments, body,
      url and subreddit.

    Notes:
    - Uses the module-level ``reddit`` client and ``ubisoft_pattern``.
    - A post seen through more than one listing of a subreddit is recorded once.
    - An exception raised while reading a listing is printed and that listing
      is abandoned; posts gathered before the error are kept.
    - NOTE(review): the account-age and karma prerequisites listed at the top
      of the notebook are not enforced here.
    """
    posts = []
    sorting_methods = ['new', 'hot', 'top', 'rising', 'controversial']  # Sorting methods for broader search
    total_collected = 0

    for subreddit_name in subreddits:
        print(f"\nCollecting submissions from r/{subreddit_name}")
        subreddit = reddit.subreddit(subreddit_name)
        subreddit_post_ids = set()  # Ids already examined for this subreddit
        for method in sorting_methods:
            print(f"Using sorting method: {method}")
            method_post_count = 0
            try:
                # Ask for as many posts as the listing will return.
                # NOTE(review): Reddit listings are presumably capped near 1000 items - confirm.
                submissions = getattr(subreddit, method)(limit=5000)
                for submission in submissions:
                    if submission.created_utc < after_timestamp:
                        # Only 'new' is ordered newest-first, so only there does
                        # one old post mean everything after it is old too.
                        # The other listings mix ages (e.g. stickied posts at
                        # the top of 'hot'), so just skip the old post.
                        if method == 'new':
                            break
                        continue
                    if submission.id in subreddit_post_ids:
                        continue
                    subreddit_post_ids.add(submission.id)

                    combined_text = f"{submission.title} {submission.selftext}".lower()
                    if not ubisoft_pattern.search(combined_text):  # Keep only posts that mention Ubisoft
                        continue
                    if len(combined_text.split()) <= 20:  # Post must have more than 20 words
                        continue

                    posts.append({
                        'id': submission.id,
                        'author': submission.author.name if submission.author else 'deleted',
                        # created_utc is a UTC epoch; format it in UTC so the
                        # output does not depend on the machine's time zone.
                        'date_created': datetime.fromtimestamp(
                            submission.created_utc, tz=timezone.utc
                        ).strftime('%Y-%m-%d %H:%M:%S'),
                        'title': submission.title,
                        'score': submission.score,
                        'num_of_comments': submission.num_comments,
                        'body': submission.selftext,
                        'url': submission.url,
                        'subreddit': subreddit_name
                    })
                    method_post_count += 1
                    total_collected += 1
                    print(f"Added submission {submission.id} from r/{subreddit_name} using {method}")

                print(f"Collected {method_post_count} posts from r/{subreddit_name} using {method}")
                time.sleep(2)  # Pause between listings to go easy on the API
            except Exception as e:
                # Best-effort collection: report the failure and move on to the next listing.
                print(f"An error occurred in r/{subreddit_name} using {method}: {e}")
                continue
    print(f"\nTotal posts collected: {total_collected}")
    return posts

# Run the collection, then export one row per post as CSV and Excel files
posts = collect_posts_multiple_methods(subreddits, one_year_ago)

# Naming the columns up front keeps the 'id' column present even when nothing
# was collected; on a column-less empty frame drop_duplicates(subset='id')
# raises KeyError.
post_columns = [
    'id', 'author', 'date_created', 'title', 'score',
    'num_of_comments', 'body', 'url', 'subreddit',
]
df = pd.DataFrame(posts, columns=post_columns)
df = df.drop_duplicates(subset='id')  # Keep a single row per submission id

csv_filename = 'reddit_ubisoft_external_posts.csv'
df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
print(f"\nData saved to {csv_filename}")

excel_filename = 'reddit_ubisoft_external_posts.xlsx'
df.to_excel(excel_filename, index=False)
print(f"Data saved to {excel_filename}")



Collecting submissions from r/Ubisoft
Using sorting method: new
Added submission 1gnwld2 from r/Ubisoft using new
Added submission 1gnqa8a from r/Ubisoft using new
Added submission 1gn3kzt from r/Ubisoft using new
Added submission 1gmw2hw from r/Ubisoft using new
Added submission 1gm63g0 from r/Ubisoft using new
Added submission 1glxzad from r/Ubisoft using new
Added submission 1gln3uz from r/Ubisoft using new
Added submission 1gl9pb5 from r/Ubisoft using new
Added submission 1gkzvse from r/Ubisoft using new
Added submission 1gkeskd from r/Ubisoft using new
Added submission 1gkn99u from r/Ubisoft using new
Added submission 1gkit2p from r/Ubisoft using new
Added submission 1gkczjt from r/Ubisoft using new
Added submission 1gkbc5j from r/Ubisoft using new
Added submission 1gk4my0 from r/Ubisoft using new
Added submission 1gj8x9n from r/Ubisoft using new
Added submission 1gji7pu from r/Ubisoft using new
Added submission 1gip013 from r/Ubisoft using new
Added submission 1gibrqz from r/Ubi

## Data scraping specific for network analytics

In [6]:
import re
import time
from datetime import datetime
from typing import List, Dict
import pandas as pd

# Subreddits dedicated to Ubisoft or to one of its franchises
ubisoft_subreddits = [
    'Ubisoft', 'assassinscreed', 'Rainbow6', 'GhostRecon', 'thedivision',
    'farcry', 'farcry5', 'farcry6', 'watch_dogs', 'forhonor',
    'Splintercell', 'PrinceOfPersia', 'JustDance', 'Steep', 'TrialsGames',
    'anno', 'FenyxRising', 'SkullAndBones', 'ACValhalla',
    'AssassinsCreedOdyssey', 'AssassinsCreedOrigins'
]

# General gaming / platform subreddits
gaming_subreddits = [
    'gaming', 'pcgaming', 'PS4', 'XboxOne', 'NintendoSwitch',
    'Steam', 'PS5', 'XboxSeriesX'
]

subreddits = ubisoft_subreddits + gaming_subreddits

# Case-insensitive whole-word match for the company name and its variants:
# "ubi", "ubis", "ubisoft", "ubisofts", "ubi soft" and "ubisf".
# The optional trailing "s" covers the "ubisofts" / "ubis" spellings from the
# keyword list, which the previous pattern rejected; the word boundaries keep
# unrelated words such as "ubiquitous" out.
ubisoft_pattern = re.compile(r'\bubi(?:\s?soft|sf)?s?\b', re.IGNORECASE)

# Cut-off for the collection window: Unix time one year (365 days) before now
now = int(time.time())
one_year_ago = now - 365 * 24 * 60 * 60

def collect_posts_multiple_methods(subreddits: List[str], after_timestamp: int) -> Tuple[List[Dict], List[Dict]]:
    """
    Collects Ubisoft-related posts plus author-to-commenter interactions.

    Every subreddit is read through each listing in ``sorting_methods``. A post
    is kept when it was created at or after ``after_timestamp``, its title or
    body matches the module-level ``ubisoft_pattern``, and title plus body
    contain more than 20 words. For every kept post, one interaction record is
    added per entry of ``submission.comments`` that has an author.

    Parameters:
    - subreddits: list of subreddit names to collect posts from.
    - after_timestamp: Unix time; posts created before it are ignored.

    Returns:
    - Tuple ``(posts, interactions)``:
        - posts: list of dictionaries with the keys id, author, date_created
          (formatted in UTC), title, score, num_of_comments, body, url and
          subreddit.
        - interactions: list of edge dictionaries with the keys Source (post
          author), Target (comment author), Weight (always 1), Subreddit and
          Date (comment creation time, formatted in UTC).

    Notes:
    - Uses the module-level ``reddit`` client and ``ubisoft_pattern``.
    - A post seen through more than one listing of a subreddit is recorded once.
    - An exception raised while reading a listing is printed and that listing
      is abandoned; data gathered before the error is kept.
    - NOTE(review): the account-age and karma prerequisites listed at the top
      of the notebook are not enforced here.
    """
    posts = []
    interactions = []  # Edge list for network visualization
    sorting_methods = ['new', 'hot', 'top', 'rising', 'controversial']  # Sorting methods for broader search
    total_collected = 0
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    for subreddit_name in subreddits:
        print(f"\nCollecting submissions from r/{subreddit_name}")
        subreddit = reddit.subreddit(subreddit_name)
        subreddit_post_ids = set()  # Ids already examined for this subreddit
        for method in sorting_methods:
            print(f"Using sorting method: {method}")
            method_post_count = 0
            try:
                # Ask for as many posts as the listing will return.
                # NOTE(review): Reddit listings are presumably capped near 1000 items - confirm.
                submissions = getattr(subreddit, method)(limit=5000)
                for submission in submissions:
                    if submission.created_utc < after_timestamp:
                        # Only 'new' is ordered newest-first, so only there does
                        # one old post mean everything after it is old too.
                        # The other listings mix ages (e.g. stickied posts at
                        # the top of 'hot'), so just skip the old post.
                        if method == 'new':
                            break
                        continue
                    if submission.id in subreddit_post_ids:
                        continue
                    subreddit_post_ids.add(submission.id)

                    combined_text = f"{submission.title} {submission.selftext}".lower()
                    if not ubisoft_pattern.search(combined_text):  # Keep only posts that mention Ubisoft
                        continue
                    if len(combined_text.split()) <= 20:  # Post must have more than 20 words
                        continue

                    post_author = submission.author.name if submission.author else 'deleted'
                    posts.append({
                        'id': submission.id,
                        'author': post_author,
                        # created_utc is a UTC epoch; format it in UTC so the
                        # output does not depend on the machine's time zone.
                        'date_created': datetime.fromtimestamp(
                            submission.created_utc, tz=timezone.utc
                        ).strftime(timestamp_format),
                        'title': submission.title,
                        'score': submission.score,
                        'num_of_comments': submission.num_comments,
                        'body': submission.selftext,
                        'url': submission.url,
                        'subreddit': subreddit_name
                    })

                    # One edge per authored entry in submission.comments
                    # (simplified: replies to comments are not followed).
                    for comment in submission.comments:
                        # Skips entries with no author - deleted accounts and,
                        # presumably, "load more comments" placeholders (TODO confirm).
                        if hasattr(comment, 'author') and comment.author:
                            interactions.append({
                                'Source': post_author,
                                'Target': comment.author.name,
                                'Weight': 1,  # Default weight of 1 for each interaction
                                'Subreddit': subreddit_name,
                                'Date': datetime.fromtimestamp(
                                    comment.created_utc, tz=timezone.utc
                                ).strftime(timestamp_format)
                            })

                    method_post_count += 1
                    total_collected += 1
                    print(f"Added submission {submission.id} from r/{subreddit_name} using {method}")

                print(f"Collected {method_post_count} posts from r/{subreddit_name} using {method}")
                time.sleep(2)  # Pause between listings to go easy on the API
            except Exception as e:
                # Best-effort collection: report the failure and move on to the next listing.
                print(f"An error occurred in r/{subreddit_name} using {method}: {e}")
                continue
    print(f"\nTotal posts collected: {total_collected}")
    return posts, interactions

# Run the collection, then export posts and interactions as CSV files for network visualization
posts, interactions = collect_posts_multiple_methods(subreddits, one_year_ago)

# Naming the columns up front keeps the 'id' column present even when nothing
# was collected; on a column-less empty frame drop_duplicates(subset='id')
# raises KeyError.
post_columns = [
    'id', 'author', 'date_created', 'title', 'score',
    'num_of_comments', 'body', 'url', 'subreddit',
]
posts_df = pd.DataFrame(posts, columns=post_columns)
posts_df = posts_df.drop_duplicates(subset='id')  # Keep a single row per submission id

csv_filename = 'reddit_ubisoft_network_posts.csv'
posts_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
print(f"\nData saved to {csv_filename}")

# The interactions file is written only when at least one edge was collected
interactions_df = pd.DataFrame(interactions)
if not interactions_df.empty:
    interactions_filename = 'reddit_ubisoft_network_interactions.csv'
    interactions_df.to_csv(interactions_filename, index=False, encoding='utf-8-sig')
    print(f"Interaction data saved to {interactions_filename}")



Collecting submissions from r/Ubisoft
Using sorting method: new
Added submission 1gnwld2 from r/Ubisoft using new
Added submission 1gnqa8a from r/Ubisoft using new
Added submission 1gn3kzt from r/Ubisoft using new
Added submission 1gmw2hw from r/Ubisoft using new
Added submission 1gm63g0 from r/Ubisoft using new
Added submission 1glxzad from r/Ubisoft using new
Added submission 1gln3uz from r/Ubisoft using new
Added submission 1gl9pb5 from r/Ubisoft using new
Added submission 1gkzvse from r/Ubisoft using new
Added submission 1gkeskd from r/Ubisoft using new
Added submission 1gkn99u from r/Ubisoft using new
Added submission 1gkit2p from r/Ubisoft using new
Added submission 1gkczjt from r/Ubisoft using new
Added submission 1gkbc5j from r/Ubisoft using new
Added submission 1gk4my0 from r/Ubisoft using new
Added submission 1gj8x9n from r/Ubisoft using new
Added submission 1gji7pu from r/Ubisoft using new
Added submission 1gip013 from r/Ubisoft using new
Added submission 1gibrqz from r/Ubi