In [None]:
import pandas as pd
import requests
from rapidfuzz import process, fuzz

# === Step 1: Load your dataset ===
csv_path = "D:/hp/Documents/CDS513/Assignment/steam-200k.csv"  
df = pd.read_csv(csv_path, header = None, 
                        names=["Player_ID", "Name", "Behavior", "Hours", "Ignore"])

In [7]:
df.head()

Unnamed: 0,Player_ID,Name,Behavior,Hours,Ignore
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [8]:
game_names = df['Name'].astype(str).tolist()

# === Step 2: Get all Steam apps ===
print("Downloading Steam app list...")
applist_url = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"
response = requests.get(applist_url)
all_apps = response.json()['applist']['apps']
app_df = pd.DataFrame(all_apps)

# === Step 3: Fuzzy match each game name ===
print("Matching game names...")
matched_names = []
matched_ids = []
match_scores = []

for name in game_names:
    match, score, idx = process.extractOne(
        name, app_df['name'], scorer=fuzz.token_sort_ratio
    )
    matched_names.append(match)
    matched_ids.append(app_df.iloc[idx]['appid'])
    match_scores.append(score)

# === Step 4: Add results to original DataFrame ===
df['matched_name'] = matched_names
df['appid'] = matched_ids
df['match_score'] = match_scores

# === Step 5: Save to a new CSV ===
output_path = "D:/hp/Documents/CDS513/Assignment/dataset_with_appids.csv"
df.to_csv(output_path, index=False)
print(f"✅ Done! Updated dataset saved to: {output_path}")


Downloading Steam app list...
Matching game names...


KeyboardInterrupt: 

In [9]:
game_names = df['Name'].astype(str).tolist()

In [10]:
# === Step 2: Get all Steam apps ===
print("Downloading Steam app list...")
applist_url = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"
response = requests.get(applist_url)
all_apps = response.json()['applist']['apps']
app_df = pd.DataFrame(all_apps)

Downloading Steam app list...


In [13]:
app_df.shape

(243346, 2)

In [15]:
# === Step 3: Fuzzy match ===
print("Matching game names...")
matched_names, matched_ids, match_scores = [], [], []

i=1

for name in game_names:
    match, score, idx = process.extractOne(
        name, app_df['name'], scorer=fuzz.token_sort_ratio
    )
    matched_names.append(match)
    matched_ids.append(app_df.iloc[idx]['appid'])
    match_scores.append(score)
    print("Current progress: ", i, "/", len(game_names))
    i=i+1

df['matched_name'] = matched_names
df['appid'] = matched_ids
df['match_score'] = match_scores

Matching game names...
Current progress:  1 / 200000
Current progress:  2 / 200000
Current progress:  3 / 200000
Current progress:  4 / 200000
Current progress:  5 / 200000
Current progress:  6 / 200000
Current progress:  7 / 200000
Current progress:  8 / 200000
Current progress:  9 / 200000
Current progress:  10 / 200000
Current progress:  11 / 200000
Current progress:  12 / 200000
Current progress:  13 / 200000
Current progress:  14 / 200000
Current progress:  15 / 200000
Current progress:  16 / 200000
Current progress:  17 / 200000
Current progress:  18 / 200000
Current progress:  19 / 200000
Current progress:  20 / 200000
Current progress:  21 / 200000
Current progress:  22 / 200000
Current progress:  23 / 200000
Current progress:  24 / 200000
Current progress:  25 / 200000
Current progress:  26 / 200000
Current progress:  27 / 200000
Current progress:  28 / 200000
Current progress:  29 / 200000
Current progress:  30 / 200000
Current progress:  31 / 200000
Current progress:  32 / 2

KeyboardInterrupt: 

In [None]:
import time
import re 

# === Step 4: Get genres and release year ===
print("Fetching genres & release years...")
genres = []
release_years = []
filtered = []

for appid in df['appid']:
    url = f"https://store.steampowered.com/api/appdetails?appids={appid}"
    try:
        res = requests.get(url)
        data = res.json().get(str(appid), {}).get('data', {})

        # Get release year
        release_date = data.get('release_date', {}).get('date', '')
        match = re.search(r'\b(\d{4})\b', release_date)
        year = int(match.group(1)) if match else None

        # Filter by release year
        if year and year < 2018:
            genre_list = data.get('genres', [])
            genre_names = [g['description'] for g in genre_list]
            genres.append(", ".join(genre_names))
            release_years.append(year)
            filtered.append(False)
        else:
            genres.append(None)
            release_years.append(year)
            filtered.append(True)
    except Exception as e:
        genres.append(None)
        release_years.append(None)
        filtered.append(True)
    time.sleep(0.3)

df['release_year'] = release_years
df['genres'] = genres
df['filtered_out'] = filtered  # True if released after 2019 or failed

# === Step 5: Save result ===
output_path = "dataset_with_steam_info_filtered.csv"
df.to_csv(output_path, index=False)
print(f"✅ Done! Saved to {output_path}")
