In [16]:
import glob
#glob is a library that allows you to use Unix style pathname pattern expansion
import os
#os is a library that allows you to interact with the operating system
import re
#re is a library that allows you to use regular expressions
import time
#time is a library that allows you to work with time
import unicodedata
#unicodedata is a library that gives access to the Unicode character database

import pandas as pd
#pandas is a data manipulation library


In [18]:

# Get the list of all CSV files in the raw-data directory.
# The pattern is assembled with os.path.join so it resolves on every OS (a
# hard-coded backslash path only matches on Windows), and the matches are
# sorted because glob returns them in arbitrary order, which would otherwise
# make the row order of the combined dataframe vary between runs.
csv_pattern = os.path.join('..', 'data', 'raw', '150 best games list', '*.csv')
csv_files = sorted(glob.glob(csv_pattern))

# Fail early with a clear message; an empty list would otherwise surface as an
# opaque "No objects to concatenate" error from pd.concat.
if not csv_files:
    raise FileNotFoundError(f'No CSV files matched {csv_pattern!r}')

# Load all CSV files into a single dataframe
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Preview the dataframe
df.head()

Unnamed: 0,name,firstReleaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass,publishers,developers,steamId
0,Homeworld 3,1715572800000,101336,59.99,5454092.0,8.9526,42,AA,Gearbox Publishing,Blackbird Interactive,1840080
1,Crime Scene Cleaner,1723593600000,309232,19.99,5082397.0,8.567752,98,Indie,"President Studio,PlayWay S.A.",President Studio,1040200
2,Senua’s Saga: Hellblade II,1716264000000,108108,49.99,4796864.0,5.133335,88,AAA,Xbox Game Studios,Ninja Theory,2461850
3,Pixel Gun 3D: PC Edition,1712030400000,1685776,0.0,4775621.0,11.510063,62,Indie,Cubic Games Studio,Cubic Games Studio,2524890
4,shapez 2,1723680000000,256118,24.99,4763491.0,18.428244,99,Indie,"tobspr Games,Gamera Games",tobspr Games,2162800


In [3]:
# Summarise the loaded dataframe: row count, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              150 non-null    object 
 1   firstReleaseDate  150 non-null    int64  
 2   copiesSold        150 non-null    int64  
 3   price             150 non-null    float64
 4   revenue           150 non-null    float64
 5   avgPlaytime       150 non-null    float64
 6   reviewScore       150 non-null    int64  
 7   publisherClass    150 non-null    object 
 8   publishers        150 non-null    object 
 9   developers        150 non-null    object 
 10  steamId           150 non-null    int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 13.0+ KB


In [5]:
# Function to clean game names:
# replace Roman numerals and remove unwanted characters

# Roman to Arabic numeral mapping (uppercase numerals I through XV)
roman_to_arabic = {
    'I': '1', 'II': '2', 'III': '3', 'IV': '4', 'V': '5',
    'VI': '6', 'VII': '7', 'VIII': '8', 'IX': '9', 'X': '10',
    'XI': '11', 'XII': '12', 'XIII': '13', 'XIV': '14', 'XV': '15',
}

# Single pre-compiled pattern for every numeral in the mapping.
# - Case-sensitive on purpose: lowercase words such as "x", "v" or "i" are
#   ordinary words in titles, not numerals.
# - (?<!^) skips a numeral that is the very first token of the title, where it
#   is almost always a word or letter ("I Am ...", "V Rising").
_ROMAN_NUMERAL_RE = re.compile(
    r'(?<!^)\b(?:'
    + '|'.join(sorted(roman_to_arabic, key=len, reverse=True))
    + r')\b'
)


def clean_game_name(name):
    """Return a filesystem-friendly version of a game title.

    Steps, in order:
      1. Trim surrounding whitespace.
      2. Replace standalone uppercase Roman numerals (I-XV) that are not the
         first token with their Arabic equivalents.
      3. Decompose accented letters (NFD) so the base letter is kept and only
         the accent mark is removed ("é" -> "e").
      4. Drop every character that is not an ASCII letter, digit or whitespace.
      5. Collapse runs of whitespace into a single space.

    Parameters
    ----------
    name : str
        Raw game title.

    Returns
    -------
    str
        Cleaned title; may be an empty string when the title contains no
        ASCII letters or digits.

    Notes
    -----
    An uppercase pronoun "I" in the middle of a title is still treated as the
    numeral 1; only the first token is protected.
    """
    name = name.strip()

    # Replace Roman numerals with Arabic numerals in one pass
    name = _ROMAN_NUMERAL_RE.sub(lambda match: roman_to_arabic[match.group(0)], name)

    # Split accented characters into base letter + combining mark so the
    # filter below removes only the mark, not the letter itself
    name = unicodedata.normalize('NFD', name)

    # Remove everything except ASCII letters, digits and whitespace
    name = re.sub(r'[^A-Za-z0-9\s]+', '', name)

    # Remove extra spaces: trims the ends and collapses internal runs that are
    # left behind when punctuation between words is removed
    return ' '.join(name.split())

# Derive a 'cleaned_name' column by running every title through clean_game_name
df['cleaned_name'] = df['name'].map(clean_game_name)

# Show the raw and cleaned titles side by side as a quick sanity check
df.loc[:, ['name', 'cleaned_name']].head()


Unnamed: 0,name,cleaned_name
0,Homeworld 3,Homeworld 3
1,Crime Scene Cleaner,Crime Scene Cleaner
2,Senua’s Saga: Hellblade II,Senuas Saga Hellblade 2
3,Pixel Gun 3D: PC Edition,Pixel Gun 3D PC Edition
4,shapez 2,shapez 2


In [13]:
# Preview the first five rows of the current dataframe.
# NOTE(review): the saved output below already shows cleaned names and
# formatted release dates, so this cell was apparently last executed after the
# cells further down — confirm the cell order with Restart Kernel -> Run All.
df.head()

Unnamed: 0,name,firstReleaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass,publishers,developers,steamId
0,Homeworld 3,2024-05-13,101336,59.99,5454092.0,8.9526,42,AA,Gearbox Publishing,Blackbird Interactive,1840080
1,Crime Scene Cleaner,2024-08-14,309232,19.99,5082397.0,8.567752,98,Indie,"President Studio,PlayWay S.A.",President Studio,1040200
2,Senuas Saga Hellblade 2,2024-05-21,108108,49.99,4796864.0,5.133335,88,AAA,Xbox Game Studios,Ninja Theory,2461850
3,Pixel Gun 3D PC Edition,2024-04-02,1685776,0.0,4775621.0,11.510063,62,Indie,Cubic Games Studio,Cubic Games Studio,2524890
4,shapez 2,2024-08-15,256118,24.99,4763491.0,18.428244,99,Indie,"tobspr Games,Gamera Games",tobspr Games,2162800


In [7]:
# Swap the raw 'name' column for the cleaned one and keep it as the first column.
# The work is guarded on 'cleaned_name' so re-running this cell is a no-op;
# unguarded, a second run would drop the already-cleaned 'name' column and then
# fail with a KeyError because 'cleaned_name' no longer exists.
if 'cleaned_name' in df.columns:
    # Step 1: Drop the original 'name' column and rename 'cleaned_name' to 'name'
    df = df.drop(columns=['name']).rename(columns={'cleaned_name': 'name'})

    # Step 2: Move the renamed 'name' column to the first position
    df = df[['name'] + [col for col in df.columns if col != 'name']]

# Preview the updated dataframe to ensure changes
df.head()


Unnamed: 0,name,firstReleaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass,publishers,developers,steamId
0,Homeworld 3,1715572800000,101336,59.99,5454092.0,8.9526,42,AA,Gearbox Publishing,Blackbird Interactive,1840080
1,Crime Scene Cleaner,1723593600000,309232,19.99,5082397.0,8.567752,98,Indie,"President Studio,PlayWay S.A.",President Studio,1040200
2,Senuas Saga Hellblade 2,1716264000000,108108,49.99,4796864.0,5.133335,88,AAA,Xbox Game Studios,Ninja Theory,2461850
3,Pixel Gun 3D PC Edition,1712030400000,1685776,0.0,4775621.0,11.510063,62,Indie,Cubic Games Studio,Cubic Games Studio,2524890
4,shapez 2,1723680000000,256118,24.99,4763491.0,18.428244,99,Indie,"tobspr Games,Gamera Games",tobspr Games,2162800


In [10]:
# Step 3: Sort by the newly renamed 'name' column
df_sorted = df.sort_values('name')

# Define the path where you want to create the folders
base_path = r'../data/raw/game lib'

# Step 4: Create one folder per game using the cleaned and renamed 'name' column
skipped_rows = []
for row_label, game_name in df_sorted['name'].items():
    # A title without any ASCII letters or digits is cleaned down to ''.
    # Joining '' onto base_path resolves to base_path itself, so the game would
    # silently end up without a folder; record such rows instead.
    if not isinstance(game_name, str) or not game_name.strip():
        skipped_rows.append(row_label)
        continue

    # Define the folder path
    folder_path = os.path.join(base_path, game_name)

    # Create the directory if it does not exist
    os.makedirs(folder_path, exist_ok=True)

# Report rows that could not get a folder so they can be handled by hand
if skipped_rows:
    print(f"Skipped {len(skipped_rows)} row(s) with an empty or missing name: {skipped_rows}")

print("Folders created successfully.")


Folders created successfully.


In [12]:
# Convert 'firstReleaseDate' from epoch milliseconds to a 'YYYY-MM-DD' string,
# excluding weekday and hour.
# Only convert while the column is still numeric: once it holds formatted date
# strings, running to_datetime(..., unit='ms') on it again fails, so the guard
# makes this cell safe to re-run.
if pd.api.types.is_numeric_dtype(df['firstReleaseDate']):
    df['firstReleaseDate'] = (
        pd.to_datetime(df['firstReleaseDate'], unit='ms').dt.strftime('%Y-%m-%d')
    )

# Preview the dataframe to ensure the changes
df[['name', 'firstReleaseDate']].head()


Unnamed: 0,name,firstReleaseDate
0,Homeworld 3,2024-05-13
1,Crime Scene Cleaner,2024-08-14
2,Senuas Saga Hellblade 2,2024-05-21
3,Pixel Gun 3D PC Edition,2024-04-02
4,shapez 2,2024-08-15


In [14]:
# Save the cleaned data to a CSV file.
# The target folder is created first so the write does not fail on a fresh
# checkout where data/interim does not exist yet.
output_path = '../data/interim/main_list.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)