In [4]:
import pandas as pd
import requests
import re
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm

In [5]:
# === Step 1: Load dataset ===
csv_path = "D:/hp/Documents/CDS513/Assignment/steam-200k.csv"
df = pd.read_csv(csv_path, header = None, 
                 names=["Player_ID", "Name", "Behavior", "Hours", "Ignore"])

game_names = df['Name'].astype(str).tolist()
df['Name'] = df['Name'].astype(str)

In [6]:
# === Step 2: Get all Steam apps ===
print("Downloading Steam app list...")
applist_url = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"
response = requests.get(applist_url)
apps = response.json()['applist']['apps']
app_df = pd.DataFrame(apps)

Downloading Steam app list...


In [7]:
# === Normalize function ===
def normalize(text):
    text = str(text).lower()
    return re.sub(r'[^\w\s]', '', text).strip()

df['normalized_name'] = df['Name'].apply(normalize)
app_df['normalized_name'] = app_df['name'].apply(normalize)
steam_names = app_df['normalized_name'].tolist()

In [8]:
# === Step 3: Parallel fuzzy matching ===
print("Matching game names in parallel...")

def match_game(name):
    match, score, idx = process.extractOne(name, steam_names, scorer=fuzz.token_sort_ratio)
    steam_row = app_df.iloc[idx]
    return steam_row['name'], steam_row['appid'], score

Matching game names in parallel...


In [None]:
from concurrent.futures import ProcessPoolExecutor

results = []
with ThreadPoolExecutor(max_workers=16) as executor:
    
    print("Loop 1")
    
    futures = [executor.submit(match_game, name) for name in tqdm(df['normalized_name'], desc="Submitting tasks")]
    
    print("Loop 2")
    
    for f in tqdm(as_completed(futures), total=len(futures), desc="Fuzzy Matching"):
        
        print("Loop 3")
        
        results.append(f.result())

Loop 1


Submitting tasks:   0%|                                                         | 92/200000 [01:23<51:21:59,  1.08it/s]

In [None]:
matched_names, appids, scores = zip(*results)
df['matched_name'] = matched_names
df['appid'] = appids
df['match_score'] = scores

In [None]:
# === Step 4: Parallel genre + release year fetch ===
print("Fetching genre and release year info...")

def fetch_genre_and_year(appid):
    url = f"https://store.steampowered.com/api/appdetails?appids={appid}"
    try:
        res = requests.get(url, timeout=5)
        data = res.json().get(str(appid), {}).get('data', {})
        release = data.get('release_date', {}).get('date', '')
        match = re.search(r'\b(\d{4})\b', release)
        year = int(match.group(1)) if match else None
        genres = [g['description'] for g in data.get('genres', [])] if year and year <= 2017 else []
        return ", ".join(genres), year, year is None or year > 2017
    except:
        return None, None, True

genre_results=[]
with ThreadPoolExecutor(max_workers=16) as executor:
    genre_futures = [executor.submit(fetch_genre_and_year, aid) for aid in tqdm(df['appid'], desc="📤 Submitting genre tasks")]
    for f in tqdm(as_completed(futures), total=len(futures), desc="📚 Fetching genre/release"):
        genre_results.append(f.result())

In [None]:
# Unpack results
genre_list, release_years, filtered_flags = zip(*genre_results)
df['genres'] = genre_list
df['release_year'] = release_years
df['filtered_out'] = filtered_flags

In [None]:
# === Step 5: Save result ===
output_path = "dataset_with_steam_info_filtered.csv"
df.to_csv(output_path, index=False)
print(f"✅ Done! Saved to {output_path}")