In [65]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [66]:
# Read in the CSV file as a Pandas DataFrame
videogame_df = pd.read_csv(
    Path("Resources/video_game_sales.csv")
)

# Review the DataFrame
videogame_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [67]:
videogame_df_renamed_columns = videogame_df.rename(columns = {'NA_Sales': 'North America', 'EU_Sales':'EU', 'JP_Sales':'Japan', 'Other_Sales':'Other', 'Global_Sales':'Global'})
videogame_df_renamed_columns

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,North America,EU,Japan,Other,Global
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [68]:
# Unpivot Sales Columns
videogame_df_unpivoted = pd.melt(videogame_df_renamed_columns, id_vars=['Rank','Name','Platform','Year','Genre','Publisher'], var_name = 'Region', value_name = 'Sales')
videogame_df_unpivoted

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Region,Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,North America,41.49
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,North America,29.08
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,North America,15.85
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,North America,15.75
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,North America,11.27
...,...,...,...,...,...,...,...,...
82985,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,Global,0.01
82986,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,Global,0.01
82987,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,Global,0.01
82988,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,Global,0.01


In [75]:
videogame_df_unpivoted.describe()

Unnamed: 0,Rank,Year,Sales
count,82990.0,81635.0,82990.0
mean,8300.605254,2006.406443,0.214921
std,4791.73845,5.828838,0.851977
min,1.0,1980.0,0.0
25%,4151.0,2003.0,0.0
50%,8300.5,2007.0,0.03
75%,12450.0,2010.0,0.15
max,16600.0,2020.0,82.74


In [78]:
grouped_videogame_df = videogame_df_unpivoted.groupby(['Year'])['Sales'].sum().reset_index()
grouped_videogame_df['Count of Games Released'] = videogame_df_unpivoted['Year'].map(videogame_df_unpivoted['Year'].value_counts())
grouped_videogame_df

Unnamed: 0,Year,Sales,Count of Games Released
0,1980.0,22.76,5040.0
1,1981.0,71.45,70.0
2,1982.0,57.74,7140.0
3,1983.0,33.59,7155.0
4,1984.0,100.71,1315.0
5,1985.0,107.89,85.0
6,1986.0,74.15,5040.0
7,1987.0,43.44,5040.0
8,1988.0,94.43,7155.0
9,1989.0,146.9,70.0


In [26]:
# # Scale Global_Sales and Age of Games
# sales_data_scaled = StandardScaler().fit_transform(
#     videogame_dropped_columns_df[["Age of Game","Global_Sales"]])

In [46]:
# # Scale price data, return, and variance values
# df_sales_data_scaled = pd.DataFrame(
#     sales_data_scaled,
#     columns=["Global_Sales","Age of Game"]
# )

# # Copy the names from the original data
# df_sales_data_scaled["Name"] = videogame_df["Name"]

# # Set the name column as index
# df_sales_data_scaled = df_sales_data_scaled.set_index("Name")

# # Display sample data
# df_sales_data_scaled.head()

In [79]:
sales_plot = grouped_videogame_df.hvplot.scatter(
    x="Sales",
    y="Count of Games Released",
    title="Expected Sales Based on Count of Games Released"
)
sales_plot

In [80]:
# Reformat data of the independent variable X as a single-column array
X = grouped_videogame_df["Count of Games Released"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[5040.],
       [  70.],
       [7140.],
       [7155.],
       [1315.]])

In [81]:
# The shape
X.shape

(39, 1)

In [82]:
# Create an array for the dependent variable y
y = grouped_videogame_df["Sales"]

In [83]:
# Create a model with scikit-learn
model = LinearRegression()

In [84]:
# Fit the data into the model
model.fit(X, y)

LinearRegression()

In [85]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [-0.02358061]


In [86]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 543.5977184201705


In [87]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 543.5977184201705 + -0.023580608147055152X


In [88]:
# Display the formula to predict the sales for a game released 7 years ago
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 7")

# Predict the sales for a game that was released 7 years ago
y_7 = model.intercept_ + model.coef_[0] * 7

# Display the prediction
print(f"Predicted sales for number of games released: ${y_7:.2f}")

Model's formula: y = 543.5977184201705 + -0.023580608147055152 * 7
Predicted sales for number of games released: $543.43


In [89]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [90]:
# Create a copy of the original data
df_sales_predicted = grouped_videogame_df.copy()

# Add a column with the predicted salary values
df_sales_predicted["sales_predicted"] = predicted_y_values

# Display sample data
df_sales_predicted.head()

Unnamed: 0,Year,Sales,Count of Games Released,sales_predicted
0,1980.0,22.76,5040.0,424.751453
1,1981.0,71.45,70.0,541.947076
2,1982.0,57.74,7140.0,375.232176
3,1983.0,33.59,7155.0,374.878467
4,1984.0,100.71,1315.0,512.589219


In [91]:
# Create a line plot of the predicted salary values
best_fit_line = df_sales_predicted.hvplot.line(
    x = "Count of Games Released",
    y = "sales_predicted",
    color = "red"
)
best_fit_line

In [92]:
# Superpose the original data and the best fit line
sales_plot * best_fit_line

In [93]:
# Import relevant metrics - score, r2, mse, rmse, std - from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [94]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.016408250172634253.
The r2 is 0.016408250172634253.
The mean squared error is 172749.29993596865.
The root mean squared error is 415.6312066435443.
The standard deviation is 419.08364194461484.
