In [2]:
# Import libraries and Dependencies to be used in the project
import pandas as pd
import numpy as np
import requests
import json
import os
from dotenv import load_dotenv

In [3]:
#get private environment varibles from .env file
load_dotenv()
client_id = os.environ['TWITCH_CLIENT_ID']
client_secret = os.environ['TWITCH_CLIENT_SECRET']

In [4]:
#Define the Twitch varibles for a request to the twitch API to get the access token

twitch_url = 'https://id.twitch.tv/oauth2/token'
params = {
    'client_id': client_id,
    'client_secret': client_secret,
    'grant_type': 'client_credentials'
}

In [5]:
#make request to twitch API to get access token and store it in a variable
twitch_request = requests.post(twitch_url, data=params)

In [6]:
#Print the response from the request from Twitch API
print(twitch_request.text)


{"access_token":"4dtsb5b4kdqq3mr8prkggvhm7myaqo","expires_in":5612742,"token_type":"bearer"}



In [7]:
# Print the response from the request from Twitch API in a more readable format using json
print(json.dumps(twitch_request.json(), indent=4))

{
    "access_token": "4dtsb5b4kdqq3mr8prkggvhm7myaqo",
    "expires_in": 5612742,
    "token_type": "bearer"
}


In [8]:
#Save the access token so it can be used in next IGDB API call
bearer_token = twitch_request.json()['access_token']

In [9]:
# Define the IGDB varibles for a request to the IGDB API to get the access token
IGDB_URL = 'https://api.igdb.com/v4/games'

In [10]:
#Function to make API request to IGDB
def make_api_request(base_url, df_name):   
   offset = 0
   all_results = []
   #loop through all the results, 500 at a time since that is the max limit
   while True:
      
      #make request to IGDB API
      IGDB_request = requests.post(base_url, headers={'Client-ID': client_id, 'Authorization': f'Bearer {bearer_token }'}, data=f'fields *; limit 500; offset {offset};')
      IGDB_data = IGDB_request.json()
      
      all_results.extend(IGDB_data)
      
      #if it pulls less than 500 results, then it has reached the end of the data, so break the loop
      if len(IGDB_data) < 500:
         break
      
      offset += len(IGDB_data)
   #Put the results into a pandas normalized dataframe
   df_name = pd.json_normalize(all_results)
   df_name.head()
   return df_name

In [11]:
#Make API request to IGDB to get the data for the video games
VG_DF = make_api_request(IGDB_URL, 'VG_DF')

In [12]:
# Print the column information for the dataframe
VG_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275217 entries, 0 to 275216
Data columns (total 58 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       275217 non-null  int64  
 1   age_ratings              61429 non-null   object 
 2   alternative_names        61439 non-null   object 
 3   category                 275217 non-null  int64  
 4   cover                    210651 non-null  float64
 5   created_at               275217 non-null  int64  
 6   external_games           256379 non-null  object 
 7   first_release_date       176914 non-null  float64
 8   game_modes               154059 non-null  object 
 9   genres                   224466 non-null  object 
 10  involved_companies       119938 non-null  object 
 11  keywords                 91131 non-null   object 
 12  name                     275217 non-null  object 
 13  platforms                188293 non-null  object 
 14  play

In [13]:
# Print and list the column names for the dataframe
VG_DF.columns

Index(['id', 'age_ratings', 'alternative_names', 'category', 'cover',
       'created_at', 'external_games', 'first_release_date', 'game_modes',
       'genres', 'involved_companies', 'keywords', 'name', 'platforms',
       'player_perspectives', 'release_dates', 'screenshots', 'similar_games',
       'slug', 'storyline', 'summary', 'tags', 'themes', 'updated_at', 'url',
       'videos', 'websites', 'checksum', 'game_localizations', 'collection',
       'multiplayer_modes', 'status', 'language_supports', 'collections',
       'hypes', 'artworks', 'version_parent', 'version_title', 'parent_game',
       'bundles', 'franchises', 'follows', 'rating', 'rating_count',
       'total_rating', 'total_rating_count', 'game_engines',
       'aggregated_rating', 'aggregated_rating_count', 'ports', 'dlcs',
       'remakes', 'franchise', 'expansions', 'standalone_expansions', 'forks',
       'remasters', 'expanded_games'],
      dtype='object')

In [14]:
#Testing to see if the data was pulled correctly
BG3 = VG_DF.loc[VG_DF['name'] == "Baldur's Gate 3"]
BG3

Unnamed: 0,id,age_ratings,alternative_names,category,cover,created_at,external_games,first_release_date,game_modes,genres,...,aggregated_rating_count,ports,dlcs,remakes,franchise,expansions,standalone_expansions,forks,remasters,expanded_games
88047,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",1601942000.0,"[1, 2, 3, 4]","[12, 15, 16, 24, 31]",...,17.0,,,,,,,,,


In [15]:
#Testing to make sure the .datetime works on this data (this took forever)
BG3['first_release_date'] = pd.to_datetime(BG3['first_release_date'], infer_datetime_format=True, unit='s')


  BG3['first_release_date'] = pd.to_datetime(BG3['first_release_date'], infer_datetime_format=True, unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BG3['first_release_date'] = pd.to_datetime(BG3['first_release_date'], infer_datetime_format=True, unit='s')


In [16]:
# List the dataframe for Baldurs Gate 3
BG3

Unnamed: 0,id,age_ratings,alternative_names,category,cover,created_at,external_games,first_release_date,game_modes,genres,...,aggregated_rating_count,ports,dlcs,remakes,franchise,expansions,standalone_expansions,forks,remasters,expanded_games
88047,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]","[12, 15, 16, 24, 31]",...,17.0,,,,,,,,,


In [17]:
#make a copy of the dataframe so that the original data is not altered
copy_df = VG_DF.copy()  

In [18]:
#drop any rows that have a NaN value in the 'first_release_date' column
copy_df = copy_df.dropna(subset=['first_release_date'], how='any')

copy_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 176914 entries, 0 to 275216
Data columns (total 58 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       176914 non-null  int64  
 1   age_ratings              54923 non-null   object 
 2   alternative_names        51198 non-null   object 
 3   category                 176914 non-null  int64  
 4   cover                    157144 non-null  float64
 5   created_at               176914 non-null  int64  
 6   external_games           160793 non-null  object 
 7   first_release_date       176914 non-null  float64
 8   game_modes               128009 non-null  object 
 9   genres                   158693 non-null  object 
 10  involved_companies       109941 non-null  object 
 11  keywords                 77192 non-null   object 
 12  name                     176914 non-null  object 
 13  platforms                176914 non-null  object 
 14  player_pe

In [19]:
#convert the 'first_release_date' column to a datetime object
copy_df['first_release_date'] = pd.to_datetime(copy_df['first_release_date'], errors = 'coerce',  unit='s')


In [20]:
# Print and generate a copy of the dataframe
copy_df.head()

Unnamed: 0,id,age_ratings,alternative_names,category,cover,created_at,external_games,first_release_date,game_modes,genres,...,aggregated_rating_count,ports,dlcs,remakes,franchise,expansions,standalone_expansions,forks,remasters,expanded_games
0,131913,[101730],"[40085, 40086, 40087]",0,267633.0,1584788069,[1977891],2016-09-21,[1],[34],...,,,,,,,,,,
1,88308,,,0,64849.0,1519237439,"[243459, 1960113]",2016-11-09,,,...,,,,,,,,,,
2,63308,[14739],,0,78658.0,1505087910,"[12596, 118080, 1185144]",2013-07-03,"[2, 3]",[5],...,,,,,,,,,,
9,231577,"[114607, 114609, 126972, 126973, 126974, 127011]",,0,280467.0,1673878007,"[2639959, 2677882, 2677897, 2677919, 2678595]",2023-02-23,"[1, 2]",[15],...,,,,,,,,,,
10,119025,,,0,,1558701524,"[1723803, 1957721]",2019-06-08,[1],[13],...,,,,,,,,,,


In [21]:
# Define a string variable for the IGDB API request to get the release dates
release_dates_url = 'https://api.igdb.com/v4/release_dates'


In [22]:
#make API request to IGDB to get the data for the release dates
RD_DF = make_api_request(release_dates_url, 'RD_DF')
RD_DF.head()

Unnamed: 0,id,category,created_at,date,game,human,m,platform,region,updated_at,y,checksum,status
0,298,2,1301923501,725846400.0,110,1993,1.0,6,8,1339423931,1993.0,09293025-b7e0-1565-8848-532914a16081,
1,165447,0,1550867412,1561421000.0,115477,"Jun 25, 2019",6.0,6,8,1550867489,2019.0,79b1fc63-2f73-5351-c0f2-6b4fe6cb33fb,
2,143501,0,1517637179,1210118000.0,37605,"May 07, 2008",5.0,5,5,1517864621,2008.0,5114450a-b7e1-c105-a3b9-08ad7b2e4321,
3,240506,0,1619125310,1622160000.0,56914,"May 28, 2021",5.0,48,2,1619163937,2021.0,d4c4a16e-f78d-ae3d-9922-04b3b5a7cdad,
4,310513,0,1631498490,631843200.0,48393,"Jan 09, 1990",1.0,99,5,1631499375,1990.0,530d65c3-cbbe-9a9a-eea0-a999c0e5c61f,


In [23]:
#convert the 'date' column to a datetime object
RD_DF['date'] = pd.to_datetime(RD_DF['date'], errors = 'coerce', unit='s')
RD_DF.head()

Unnamed: 0,id,category,created_at,date,game,human,m,platform,region,updated_at,y,checksum,status
0,298,2,1301923501,1993-01-01,110,1993,1.0,6,8,1339423931,1993.0,09293025-b7e0-1565-8848-532914a16081,
1,165447,0,1550867412,2019-06-25,115477,"Jun 25, 2019",6.0,6,8,1550867489,2019.0,79b1fc63-2f73-5351-c0f2-6b4fe6cb33fb,
2,143501,0,1517637179,2008-05-07,37605,"May 07, 2008",5.0,5,5,1517864621,2008.0,5114450a-b7e1-c105-a3b9-08ad7b2e4321,
3,240506,0,1619125310,2021-05-28,56914,"May 28, 2021",5.0,48,2,1619163937,2021.0,d4c4a16e-f78d-ae3d-9922-04b3b5a7cdad,
4,310513,0,1631498490,1990-01-09,48393,"Jan 09, 1990",1.0,99,5,1631499375,1990.0,530d65c3-cbbe-9a9a-eea0-a999c0e5c61f,


In [24]:
#remove unnecessary columns
dates_ids = RD_DF[['id', 'date']]
dates_ids.head()

Unnamed: 0,id,date
0,298,1993-01-01
1,165447,2019-06-25
2,143501,2008-05-07
3,240506,2021-05-28
4,310513,1990-01-09


In [25]:
#rename columns
dates_ids = dates_ids.rename(columns={'id': 'release_date_id', 'date':'release_date_data' })
dates_ids.head()

Unnamed: 0,release_date_id,release_date_data
0,298,1993-01-01
1,165447,2019-06-25
2,143501,2008-05-07
3,240506,2021-05-28
4,310513,1990-01-09


In [51]:
#merge the two dataframes, 'copy_df' and 'dates_ids', on the 'release_dates' column, which is a list of release date ids
merged_df = copy_df.explode('release_dates').merge(dates_ids, left_on='release_dates', right_on='release_date_id', how='left')
merged_df[['name', 'first_release_date', 'release_date_data', 'release_dates', 'release_date_id']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348221 entries, 0 to 348220
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   name                348221 non-null  object        
 1   first_release_date  348220 non-null  datetime64[ns]
 2   release_date_data   343995 non-null  datetime64[ns]
 3   release_dates       348221 non-null  object        
 4   release_date_id     348221 non-null  int64         
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 13.3+ MB


In [52]:
#drop the 'release_dates' column
merged_df['release_dates'] = merged_df['release_date_data']
merged_df = merged_df.drop(columns=['release_date_data', 'release_date_id'])


In [53]:
merged_df[['name', 'release_dates']].head()

Unnamed: 0,name,release_dates
0,Maji Kyun! Renaissance,2016-09-21
1,Hey Duggee: The Big Outdoor App,2016-11-09
2,Soldier Front 2,2013-07-03
3,Blood Bowl 3: Black Orcs Edition,2023-02-23
4,Blood Bowl 3: Black Orcs Edition,2023-02-23


In [54]:
# Keep all columns for now
all_columns = ['id', 'age_ratings', 'alternative_names', 'category', 'cover',
       'created_at', 'external_games', 'first_release_date', 'game_modes',
       'genres', 'involved_companies', 'keywords', 'name', 'platforms',
       'player_perspectives', 'release_dates', 'screenshots', 'similar_games',
       'slug', 'storyline', 'summary', 'tags', 'themes', 'updated_at', 'url',
       'videos', 'websites', 'checksum', 'game_localizations', 'collection',
       'multiplayer_modes', 'status', 'language_supports', 'collections',
       'hypes', 'artworks', 'version_parent', 'version_title', 'parent_game',
       'bundles', 'franchises', 'follows', 'rating', 'rating_count',
       'total_rating', 'total_rating_count', 'game_engines',
       'aggregated_rating', 'aggregated_rating_count', 'ports', 'dlcs',
       'remakes', 'franchise', 'expansions', 'standalone_expansions', 'forks',
       'remasters', 'expanded_games']

In [55]:
#combine rows on the name so that the 'release dates' is a list
# Group by 'name' and aggregate 'first_release_date' and 'release_dates' into lists
merged_df = merged_df[all_columns]
merged_df = merged_df.groupby('name', as_index=False).agg({
    'id': 'first',
    'age_ratings': 'first',
    'alternative_names': 'first',
    'category': 'first',
    'cover': 'first',
    'created_at': 'first',
    'external_games': 'first',
    'first_release_date': 'first',
    'game_modes': 'first',
    'genres': 'first',
    'involved_companies': 'first',
    'keywords': 'first',
    'platforms': 'first',
    'player_perspectives': 'first',
    'release_dates': list,
    'screenshots': 'first',
    'similar_games': 'first',
    'slug': 'first',
    'storyline': 'first',
    'summary': 'first',
    'tags': 'first',
    'themes': 'first',
    'updated_at': 'first',
    'url': 'first',
    'videos': 'first',
    'websites': 'first',
    'checksum': 'first',
    'game_localizations': 'first',
    'collection': 'first',
    'multiplayer_modes': 'first',
    'status': 'first',
    'language_supports': 'first',
    'collections': 'first',
    'hypes': 'first',
    'artworks': 'first',
    'version_parent': 'first',
    'version_title': 'first',
    'parent_game': 'first',
    'bundles': 'first',
    'franchises': 'first',
    'follows': 'first',
    'rating': 'first',
    'rating_count': 'first',
    'total_rating': 'first',
    'total_rating_count': 'first',
    'game_engines': 'first',
    'aggregated_rating': 'first',
    'aggregated_rating_count': 'first',
    'ports': 'first',
    'dlcs': 'first',
    'remakes': 'first',
    'franchise': 'first',
    'expansions': 'first',
    'standalone_expansions': 'first',
    'forks': 'first',
    'remasters': 'first',
    'expanded_games': 'first'
})

In [56]:
#Testing to see if the data was merged correctly It is working now
BG3_2 = merged_df.loc[merged_df['name'] == "Baldur's Gate 3"]
BG3_2[['release_dates', 'first_release_date', 'name']].head()

Unnamed: 0,release_dates,first_release_date,name
13628,"[2023-09-06 00:00:00, 2023-08-03 00:00:00, 202...",2020-10-06,Baldur's Gate 3


In [57]:
BG3_2.head()

Unnamed: 0,name,id,age_ratings,alternative_names,category,cover,created_at,external_games,first_release_date,game_modes,...,aggregated_rating_count,ports,dlcs,remakes,franchise,expansions,standalone_expansions,forks,remasters,expanded_games
13628,Baldur's Gate 3,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]",...,17.0,,,,,,,,,


In [58]:
genre_url = 'https://api.igdb.com/v4/genres'
genre_df = make_api_request(genre_url, 'genre_df')

In [59]:
genre_df.head()

Unnamed: 0,id,created_at,name,slug,updated_at,url,checksum
0,2,1297639288,Point-and-click,point-and-click,1323382086,https://www.igdb.com/genres/point-and-click,47a23b47-ad82-9d36-517a-41b23073a58f
1,4,1297639288,Fighting,fighting,1323289215,https://www.igdb.com/genres/fighting,f64573d0-92ea-1298-1f2e-5b6a4b309eb5
2,5,1297639288,Shooter,shooter,1323289215,https://www.igdb.com/genres/shooter,d085fd73-9514-e857-cb35-7cd5b6e952b1
3,7,1297639288,Music,music,1323289215,https://www.igdb.com/genres/music,bb04b3e2-66b3-1ff4-bef6-464346f4ffd9
4,8,1297639288,Platform,platform,1323289215,https://www.igdb.com/genres/platform,ff5b1711-9f9a-fec6-f391-f3b00a9682c1


In [60]:
genre_df.drop(columns=['slug','updated_at','url','checksum','created_at'], inplace=True)
genre_df.rename(columns={'id':'genre_id', 'name':'genre_name'}, inplace=True)
genre_df

Unnamed: 0,genre_id,genre_name
0,2,Point-and-click
1,4,Fighting
2,5,Shooter
3,7,Music
4,8,Platform
5,9,Puzzle
6,10,Racing
7,11,Real Time Strategy (RTS)
8,12,Role-playing (RPG)
9,13,Simulator


In [61]:
# Merge genre_df with merged_df to replace genre_id with genre_name
genre_merged_df = merged_df.explode('genres').merge(genre_df, left_on='genres', right_on='genre_id', how='left')

# Drop the old 'genres' column and rename the 'genre_name' column
genre_merged_df.drop(columns=['genres'], inplace=True)
genre_merged_df.rename(columns={'genre_name': 'genres'}, inplace=True)

genre_merged_df[['name', 'genres']].head()

Unnamed: 0,name,genres
0,!,Point-and-click
1,!,Puzzle
2,!,Adventure
3,!Dead Pixels Adventure!,Adventure
4,!Dead Pixels Adventure!,Indie


In [63]:
BG3_4 = genre_merged_df.loc[genre_merged_df['name']=="Baldur's Gate 3"]
BG3_4

Unnamed: 0,name,id,age_ratings,alternative_names,category,cover,created_at,external_games,first_release_date,game_modes,...,dlcs,remakes,franchise,expansions,standalone_expansions,forks,remasters,expanded_games,genre_id,genres
25422,Baldur's Gate 3,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]",...,,,,,,,,,12.0,Role-playing (RPG)
25423,Baldur's Gate 3,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]",...,,,,,,,,,15.0,Strategy
25424,Baldur's Gate 3,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]",...,,,,,,,,,16.0,Turn-based strategy (TBS)
25425,Baldur's Gate 3,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]",...,,,,,,,,,24.0,Tactical
25426,Baldur's Gate 3,119171,"[148088, 161963, 162025, 162026, 162027, 162028]","[50217, 78595, 78596, 78597, 139162]",0,289025.0,1559228938,"[1725312, 1775824, 1914473, 2070558, 2678929, ...",2020-10-06,"[1, 2, 3, 4]",...,,,,,,,,,31.0,Adventure


In [64]:
#combine rows on the name so that the 'genres' is a list
# Group by 'name' and aggregate 'genres' into lists

genre_merged_df = genre_merged_df.groupby('name', as_index=False).agg({
    'id': 'first',
    'age_ratings': 'first',
    'alternative_names': 'first',
    'category': 'first',
    'cover': 'first',
    'created_at': 'first',
    'external_games': 'first',
    'first_release_date': 'first',
    'game_modes': 'first',
    'genres': list,
    'involved_companies': 'first',
    'keywords': 'first',
    'platforms': 'first',
    'player_perspectives': 'first',
    'release_dates': list,
    'screenshots': 'first',
    'similar_games': 'first',
    'slug': 'first',
    'storyline': 'first',
    'summary': 'first',
    'tags': 'first',
    'themes': 'first',
    'updated_at': 'first',
    'url': 'first',
    'videos': 'first',
    'websites': 'first',
    'checksum': 'first',
    'game_localizations': 'first',
    'collection': 'first',
    'multiplayer_modes': 'first',
    'status': 'first',
    'language_supports': 'first',
    'collections': 'first',
    'hypes': 'first',
    'artworks': 'first',
    'version_parent': 'first',
    'version_title': 'first',
    'parent_game': 'first',
    'bundles': 'first',
    'franchises': 'first',
    'follows': 'first',
    'rating': 'first',
    'rating_count': 'first',
    'total_rating': 'first',
    'total_rating_count': 'first',
    'game_engines': 'first',
    'aggregated_rating': 'first',
    'aggregated_rating_count': 'first',
    'ports': 'first',
    'dlcs': 'first',
    'remakes': 'first',
    'franchise': 'first',
    'expansions': 'first',
    'standalone_expansions': 'first',
    'forks': 'first',
    'remasters': 'first',
    'expanded_games': 'first'
})

In [68]:
BG3_5 = genre_merged_df.loc[genre_merged_df['name']=="Baldur's Gate 3"]
BG3_5[['name','release_dates','genres']].head()

Unnamed: 0,name,release_dates,genres
13628,Baldur's Gate 3,"[[2023-09-06 00:00:00, 2023-08-03 00:00:00, 20...","[Role-playing (RPG), Strategy, Turn-based stra..."


In [69]:
#output the data to a csv file
genre_merged_df.to_csv('../Uncleaned Data/uncleaned_video_games.csv', index=False)

In [70]:
#read the data back in to make sure it was saved correctly
Video_games_df = pd.read_csv('../Uncleaned Data/uncleaned_video_games.csv')
Video_games_df.columns

Index(['name', 'id', 'age_ratings', 'alternative_names', 'category', 'cover',
       'created_at', 'external_games', 'first_release_date', 'game_modes',
       'genres', 'involved_companies', 'keywords', 'platforms',
       'player_perspectives', 'release_dates', 'screenshots', 'similar_games',
       'slug', 'storyline', 'summary', 'tags', 'themes', 'updated_at', 'url',
       'videos', 'websites', 'checksum', 'game_localizations', 'collection',
       'multiplayer_modes', 'status', 'language_supports', 'collections',
       'hypes', 'artworks', 'version_parent', 'version_title', 'parent_game',
       'bundles', 'franchises', 'follows', 'rating', 'rating_count',
       'total_rating', 'total_rating_count', 'game_engines',
       'aggregated_rating', 'aggregated_rating_count', 'ports', 'dlcs',
       'remakes', 'franchise', 'expansions', 'standalone_expansions', 'forks',
       'remasters', 'expanded_games'],
      dtype='object')

In [71]:
#drop unnecessary columns from the dataframe that are not needed for the analysis
cols_to_delete = ['age_ratings', 'alternative_names','cover','created_at', 'external_games','game_modes','involved_companies', 'keywords','platforms',
       'player_perspectives','screenshots', 'similar_games',
       'slug', 'storyline', 'summary', 'tags', 'themes', 'updated_at', 'url',
       'videos', 'websites', 'checksum', 'game_localizations', 'collection',
       'multiplayer_modes', 'status', 'language_supports', 'collections',
       'hypes', 'artworks', 'version_parent', 'version_title', 'parent_game',
       'bundles', 'franchises', 'follows','game_engines','ports', 'dlcs',
       'remakes', 'franchise', 'expansions', 'standalone_expansions', 'forks',
       'remasters', 'expanded_games']
cleaned_df = Video_games_df.drop(columns=cols_to_delete)
cleaned_df.head()

Unnamed: 0,name,id,category,first_release_date,genres,release_dates,rating,rating_count,total_rating,total_rating_count,aggregated_rating,aggregated_rating_count
0,!,165498,0,2009-12-31,"['Point-and-click', 'Puzzle', 'Adventure']","[[Timestamp('2009-12-31 00:00:00')], [Timestam...",,,,,,
1,!Dead Pixels Adventure!,103013,0,2018-06-12,"['Adventure', 'Indie']","[[Timestamp('2018-06-12 00:00:00')], [Timestam...",,,,,,
2,!Pattern,225575,0,2022-11-05,"['Puzzle', 'Indie']","[[Timestamp('2022-11-05 00:00:00')], [Timestam...",,,,,,
3,"""Draw a card"" Simulator",168659,0,2020-08-26,['Simulator'],[[Timestamp('2020-08-26 00:00:00')]],,,,,,
4,"""Gamer""+ Crossroad of Sympathy",186065,0,2021-01-21,"['Indie', 'Visual Novel']","[[Timestamp('2021-01-21 00:00:00'), Timestamp(...",,,,,,


In [72]:
# Print the column information for the dataframe
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168437 entries, 0 to 168436
Data columns (total 12 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   name                     168436 non-null  object 
 1   id                       168437 non-null  int64  
 2   category                 168437 non-null  int64  
 3   first_release_date       168436 non-null  object 
 4   genres                   168437 non-null  object 
 5   release_dates            168437 non-null  object 
 6   rating                   28526 non-null   float64
 7   rating_count             28526 non-null   float64
 8   total_rating             33160 non-null   float64
 9   total_rating_count       33160 non-null   float64
 10  aggregated_rating        14603 non-null   float64
 11  aggregated_rating_count  14603 non-null   float64
dtypes: float64(6), int64(2), object(4)
memory usage: 15.4+ MB


In [73]:

#drop null values from rating column
cleaned_df = cleaned_df.dropna(subset=['rating'])

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28526 entries, 12 to 168393
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     28526 non-null  object 
 1   id                       28526 non-null  int64  
 2   category                 28526 non-null  int64  
 3   first_release_date       28526 non-null  object 
 4   genres                   28526 non-null  object 
 5   release_dates            28526 non-null  object 
 6   rating                   28526 non-null  float64
 7   rating_count             28526 non-null  float64
 8   total_rating             28526 non-null  float64
 9   total_rating_count       28526 non-null  float64
 10  aggregated_rating        9969 non-null   float64
 11  aggregated_rating_count  9969 non-null   float64
dtypes: float64(6), int64(2), object(4)
memory usage: 2.8+ MB


In [74]:
# Print the column information for the dataframe
cleaned_df.head(50)

Unnamed: 0,name,id,category,first_release_date,genres,release_dates,rating,rating_count,total_rating,total_rating_count,aggregated_rating,aggregated_rating_count
12,"""Two Draw""",74340,0,2017-10-19,"['Strategy', 'Indie']","[[Timestamp('2017-10-19 00:00:00')], [Timestam...",70.0,0.0,70.0,0.0,,
23,#Archery,32298,0,2017-03-30,"['Sport', 'Indie']","[[Timestamp('2017-03-30 00:00:00')], [Timestam...",50.0,0.0,50.0,0.0,,
25,#Breakforcist Battle,96199,11,2018-04-12,"['Puzzle', 'Indie', 'Arcade']","[[Timestamp('2018-04-12 00:00:00'), Timestamp(...",63.0,0.0,63.0,0.0,,
32,#IDARB,9888,0,2015-01-30,"['Platform', 'Sport', 'Indie']","[[Timestamp('2015-01-30 00:00:00'), Timestamp(...",77.953028,8.0,78.101514,12.0,78.25,4.0
33,#Killallzombies,26369,0,2014-10-28,"['Shooter', 'Indie', 'Arcade']","[[Timestamp('2014-10-28 00:00:00'), Timestamp(...",70.0,2.0,67.5,3.0,65.0,1.0
39,#SelfieTennis,34683,0,2016-04-01,"['Sport', 'Indie']","[[Timestamp('2016-04-01 00:00:00'), NaT], [Tim...",60.0,0.0,60.0,0.0,,
42,#WarGames,76767,0,2018-03-14,"['Simulator', 'Adventure', 'Indie']","[[Timestamp('2018-12-31 00:00:00'), Timestamp(...",50.0,0.0,50.0,0.0,,
45,$1 Ride,31922,0,2016-07-28,"['Shooter', 'Indie']","[[Timestamp('2016-07-28 00:00:00')], [Timestam...",50.0,0.0,50.0,0.0,,
62,'n Verlore Verstand,19055,0,2016-04-05,"['Platform', 'Puzzle', 'Adventure', 'Indie']","[[Timestamp('2016-04-05 00:00:00'), Timestamp(...",70.0,0.0,70.0,0.0,,
85,.Detuned,7724,0,2009-09-17,"['Music', 'Indie']","[[Timestamp('2009-09-17 00:00:00'), Timestamp(...",21.584797,8.0,35.792399,9.0,50.0,1.0


In [75]:
#testing to see if the data was cleaned correctly
BG3_3 = cleaned_df.loc[cleaned_df['name'] == "Baldur's Gate 3"]
BG3_3

Unnamed: 0,name,id,category,first_release_date,genres,release_dates,rating,rating_count,total_rating,total_rating_count,aggregated_rating,aggregated_rating_count
13628,Baldur's Gate 3,119171,0,2020-10-06,"['Role-playing (RPG)', 'Strategy', 'Turn-based...","[[Timestamp('2023-09-06 00:00:00'), Timestamp(...",95.201398,389.0,94.850699,406.0,94.5,17.0


In [76]:
#output the cleaned data to a csv file
cleaned_df.to_csv('../Cleaned Data/video_games_ratings.csv', index=False)