In [72]:
# Install necessary libraries
#install.packages("tidyverse", dependencies = TRUE)
#install.packages("ggthemes", dependencies = TRUE)

In [73]:
suppressPackageStartupMessages({
  library(tidyverse)
  library(ggthemes)
})

In [74]:
# Load the sales table from disk; show_col_types = FALSE silences readr's
# column-specification message.
data <- read_csv(
  file = "vgsales.csv",
  show_col_types = FALSE
)

In [75]:
# Preview the first six rows of the imported table.
data %>%
  head(n = 6)

Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
6,Tetris,GB,1989,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26


In [76]:
# List the rows whose Year cannot be coerced to a number (coercion yields NA,
# which is what triggers the "NAs introduced by coercion" warning).
data %>%
  filter(is.na(as.numeric(Year)))

[1m[22m[36mℹ[39m In argument: `is.na(as.numeric(Year))`.
[33m![39m NAs introduced by coercion"


Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
180,Madden NFL 2004,PS2,,Sports,Electronic Arts,4.26,0.26,0.01,0.71,5.23
378,FIFA Soccer 2004,PS2,,Sports,Electronic Arts,0.59,2.36,0.04,0.51,3.49
432,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,1.86,1.02,0.00,0.29,3.17
471,wwe Smackdown vs. Raw 2006,PS2,,Fighting,,1.57,1.02,0.00,0.41,3.00
608,Space Invaders,2600,,Shooter,Atari,2.36,0.14,0.00,0.03,2.53
625,Rock Band,X360,,Misc,Electronic Arts,1.93,0.34,0.00,0.21,2.48
650,Frogger's Adventures: Temple of the Frog,GBA,,Adventure,Konami Digital Entertainment,2.15,0.18,0.00,0.07,2.39
653,LEGO Indiana Jones: The Original Adventures,Wii,,Action,LucasArts,1.54,0.63,0.00,0.22,2.39
713,Call of Duty 3,Wii,,Shooter,Activision,1.19,0.84,0.00,0.23,2.26
784,Rock Band,Wii,,Misc,MTV Games,1.35,0.56,0.00,0.20,2.11


In [77]:
# Turn the "N/A" placeholder into a real missing value, then store Year as a
# numeric column.
data <- mutate(
  data,
  Year = replace(Year, Year == "N/A", NA),
  Year = as.numeric(Year)
)

# Peek at the converted column.
head(data[["Year"]])

In [78]:
# Tally the NA entries in each column.
data %>%
  is.na() %>%
  colSums()

In [79]:
# Collapse Platform codes into broad hardware categories.
# Game Boy ("GB") is a handheld, so it is grouped with DS/PSP rather than with
# the home consoles. The lists also cover the other common console and
# handheld codes, so that e.g. PS2, X360 and GBA no longer fall through to
# "Other".
# NOTE(review): the codes are assumed to follow this dataset's naming --
# confirm against distinct(data, Platform). Anything not listed (including a
# missing Platform) stays "Other".
data <- data %>%
  mutate(
    cPlatform = case_when(
      Platform %in% c(
        "Wii", "WiiU", "NES", "SNES", "N64", "GC",
        "PS", "PS2", "PS3", "PS4",
        "XB", "X360", "XOne",
        "2600", "GEN", "SAT", "DC"
      ) ~ "Console",
      Platform %in% c("GB", "GBA", "DS", "3DS", "PSP", "PSV") ~ "Handheld",
      TRUE ~ "Other"
    )
  )

In [80]:
# Validate Global_Sales against the sum of the regional sales.
# The sum is floating-point arithmetic, so an exact `!=` would flag rows that
# differ only by representation error; near() compares within a small
# tolerance instead. Genuine discrepancies in the reported figures (e.g. a
# 0.01 rounding difference) are still flagged -- pass a larger `tol` to near()
# if those should be ignored as well.
data <- data %>%
  mutate(
    Regional_Sales_Sum = NA_Sales + EU_Sales + JP_Sales + Other_Sales,
    Sales_Mismatch = !near(Global_Sales, Regional_Sales_Sum)
  )
colnames(data)

In [81]:
# Number of games per genre, most frequent genre first.
count(data, Genre, sort = TRUE)

Genre,n
<chr>,<int>
Action,3316
Sports,2346
Misc,1739
Role-Playing,1488
Shooter,1310
Adventure,1286
Racing,1249
Platform,886
Simulation,867
Fighting,848


In [83]:
# Standardize (center and scale) sales columns for modeling.
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() strips the dimensions and attributes
# so the new columns are plain numeric vectors rather than matrix columns.
data <- data %>%
  mutate(
    NA_Sales_Scaled = as.numeric(scale(NA_Sales)),
    Global_Sales_Scaled = as.numeric(scale(Global_Sales))
  )