In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import seaborn as sns
import plotly.express as px
from collections import Counter
import plotly.subplots as sp
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report, accuracy_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from plotly.subplots import make_subplots
from datetime import datetime
from dateutil import parser

In [2]:
df = pd.read_csv("SegaGames.csv")
df.head()

Unnamed: 0,meta_score,title,platform,date,user_score,link,esrb_rating,developers,genres
0,,Persona 3 Reload,XONE,"Feb 2, 2024",,/game/xbox-one/persona-3-reload,,['P-Studio'],"['Role-Playing', 'Japanese-Style']"
1,,Persona 5 Tactica,PC,"Nov 17, 2023",,/game/pc/persona-5-tactica,,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
2,,Persona 5 Tactica,XONE,"Nov 17, 2023",,/game/xbox-one/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
3,,Persona 5 Tactica,PS4,"Nov 17, 2023",,/game/playstation-4/persona-5-tactica,,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
4,,Persona 5 Tactica,Switch,"Nov 17, 2023",,/game/switch/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"


In [3]:
missing_values = df.isna().sum()
print(missing_values)

meta_score     693
title            0
platform         0
date             0
user_score     552
link             0
esrb_rating    361
developers      10
genres           2
dtype: int64


In [4]:
values_to_delete = ["TBA", "Canceled", "TBA 2011", "2000", "1998", "1996", "2001", "2005", "1999", "1984"]
df = df[~df['date'].isin(values_to_delete)]
df['date'] = df['date'].str.replace("August 2001", "Aug 2, 2001")
df['date'] = df['date'].str.replace("February 1999", "Feb 1, 1999")
df['date'] = df['date'].str.replace("April 2009", "Apr 1, 2009")
df['date'] = df['date'].str.replace("February 2010", "Feb 1, 2010")
df['date'] = df['date'].str.replace("November 1998", "Nov 1, 1998")
df['date'] = df['date'].str.replace("November 1997", "Nov 1, 1997")
df['date'] = df['date'].str.replace("September 2009", "Sep 1, 2009")

In [5]:
values_to_delete = ["2013)", "PSN)", "2011)", "Dreamcast Collection)", "Arcade)", "Live Arcade)", "2004)", "1995)"]
df = df[~df['platform'].isin(values_to_delete)]
df['platform'] = df['platform'].str.replace("Genesis)", "Genesis")
df['platform'] = df['platform'].str.replace("Master System)", "Master System")

In [6]:
df.head()

Unnamed: 0,meta_score,title,platform,date,user_score,link,esrb_rating,developers,genres
0,,Persona 3 Reload,XONE,"Feb 2, 2024",,/game/xbox-one/persona-3-reload,,['P-Studio'],"['Role-Playing', 'Japanese-Style']"
1,,Persona 5 Tactica,PC,"Nov 17, 2023",,/game/pc/persona-5-tactica,,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
2,,Persona 5 Tactica,XONE,"Nov 17, 2023",,/game/xbox-one/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
3,,Persona 5 Tactica,PS4,"Nov 17, 2023",,/game/playstation-4/persona-5-tactica,,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
4,,Persona 5 Tactica,Switch,"Nov 17, 2023",,/game/switch/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"


In [7]:
# Handling missing values
df.dropna(subset=['title', 'platform', 'date', 'esrb_rating','developers','genres'], inplace=True)
df['user_score'].fillna(df['user_score'].mean(), inplace=True)
df['meta_score'].fillna(df['meta_score'].mean(), inplace=True)

# Data type conversion
df['date'] = pd.to_datetime(df['date'])
df['release_year'] = df['date'].dt.year

platform_to_numeric = {platform: index + 1 for index, platform in enumerate(df['platform'].unique())}
df['platform_num'] = df['platform'].map(platform_to_numeric)

esrb_to_numeric = {esrb_rating: index + 1 for index, esrb_rating in enumerate(df['esrb_rating'].unique())}
df['esrb_rating_num'] = df['esrb_rating'].map(esrb_to_numeric)

In [8]:
df["rescaled_meta_score"] = df["meta_score"] / 10

In [9]:
df.head()

Unnamed: 0,meta_score,title,platform,date,user_score,link,esrb_rating,developers,genres,release_year,platform_num,esrb_rating_num,rescaled_meta_score
2,72.219512,Persona 5 Tactica,XONE,2023-11-17,7.38083,/game/xbox-one/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']",2023,1,1,7.221951
4,72.219512,Persona 5 Tactica,Switch,2023-11-17,7.38083,/game/switch/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']",2023,2,1,7.221951
8,72.219512,Endless Dungeon,PC,2023-10-17,7.38083,/game/pc/endless-dungeon,T,['Amplitude Studios'],"['Strategy', 'Real-Time', 'Tactics']",2023,3,2,7.221951
10,72.219512,Endless Dungeon,PS4,2023-10-17,7.38083,/game/playstation-4/endless-dungeon,T,['Amplitude Studios'],"['Strategy', 'Real-Time', 'Tactics']",2023,4,2,7.221951
11,72.219512,Sonic Superstars,PC,2023-10-17,7.38083,/game/pc/sonic-superstars,E,"['Sega', ' Arzest']","['Action', 'Platformer', '2D']",2023,3,3,7.221951


In [10]:
df.describe()

Unnamed: 0,meta_score,date,user_score,release_year,platform_num,esrb_rating_num,rescaled_meta_score
count,1090.0,1090,1090.0,1090.0,1090.0,1090.0,1090.0
mean,72.219512,2011-07-01 05:02:31.926605568,7.38083,2010.972477,7.846789,3.280734,7.221951
min,26.0,1996-08-26 00:00:00,2.2,1996.0,1.0,1.0,2.6
25%,71.0,2007-06-05 18:00:00,7.2,2007.0,3.0,3.0,7.1
50%,72.219512,2010-10-22 12:00:00,7.38083,2010.0,7.0,3.0,7.221951
75%,78.0,2015-08-29 18:00:00,8.1,2015.0,11.0,4.0,7.8
max,97.0,2023-11-17 00:00:00,9.6,2023.0,20.0,7.0,9.7
std,10.165545,,1.010055,6.299181,5.61526,1.050644,1.016555


## Data Cleaning

In [11]:
columns_to_handle = ['rescaled_meta_score', 'user_score']

# Create a Plotly box plot
fig = px.box(df, y=columns_to_handle, title='Box Plots of Columns with Potential Outliers')
fig.update_layout(xaxis_title='Columns', yaxis_title='Values')
fig.show()

In [12]:
df_filtered = df[(df['user_score'] > 7) & 
                 (df['rescaled_meta_score'] > 6) & 
                 (df['rescaled_meta_score'] < 9) & 
                 (df['user_score'] < 9.5)]

# Define features (X) and target variable (y)
X = df_filtered[['rescaled_meta_score', 'platform_num', 'esrb_rating_num']]
y = df_filtered['user_score']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.3971811860957472


In [13]:
scatter_data = pd.DataFrame({'Actual User Scores': y_test, 'Predicted User Scores': y_pred})
fig = px.scatter(scatter_data, x='Actual User Scores', y='Predicted User Scores', title='Actual vs. Predicted User Scores')
fig.add_shape(type='line', x0=min(y_test), x1=max(y_test), y0=min(y_test), y1=max(y_test), line=dict(color='red', dash='dash'))
fig.update_layout(xaxis_title='Actual User Scores', yaxis_title='Predicted User Scores')
fig.show()

In [14]:
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.3779869897811944


In [15]:
# Create a Plotly scatter plot to visualize predictions
scatter_data = pd.DataFrame({'Actual User Scores': y_test, 'Predicted User Scores': y_pred})
fig = px.scatter(scatter_data, x='Actual User Scores', y='Predicted User Scores', title='Actual vs. Predicted User Scores (Gradient Boosting)')
fig.add_shape(type='line', x0=min(y_test), x1=max(y_test), y0=min(y_test), y1=max(y_test), line=dict(color='red', dash='dash'))
fig.update_layout(xaxis_title='Actual User Scores', yaxis_title='Predicted User Scores')
fig.show()

## Feature Importance

In [16]:
X = df_filtered[['user_score', 'platform_num', 'esrb_rating_num']]

# Binarize the meta_score column into high/low classes
# You can define your own threshold for what constitutes a high score
threshold = 7  # For example, considering scores above 70 as high
y = (df_filtered['rescaled_meta_score'] > threshold).astype(int)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training (Logistic Regression)
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Feature importance (coefficient magnitudes in this case)
feature_importance = model.coef_[0]

# Create a DataFrame for feature names and their importances
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Create an interactive bar plot using Plotly Express
fig = px.bar(feature_importance_df, x='Importance', y='Feature', orientation='h',
             labels={'Importance': 'Feature Importance', 'Feature': 'Features'},
             title='Feature Importance Analysis using Logistic Regression',
             template='plotly')

fig.show()


In [17]:
# Perform one-hot encoding on genres column
encoded_genres = df['genres'].str.join('|').str.get_dummies()

# Concatenate encoded genres with the original DataFrame
df_encoded = pd.concat([df, encoded_genres], axis=1)

In [18]:
df_encoded.head()

Unnamed: 0,meta_score,title,platform,date,user_score,link,esrb_rating,developers,genres,release_year,...,p,r,s,t,u,v,w,x,y,z
2,72.219512,Persona 5 Tactica,XONE,2023-11-17,7.38083,/game/xbox-one/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']",2023,...,0,1,1,1,1,0,0,0,1,0
4,72.219512,Persona 5 Tactica,Switch,2023-11-17,7.38083,/game/switch/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']",2023,...,0,1,1,1,1,0,0,0,1,0
8,72.219512,Endless Dungeon,PC,2023-10-17,7.38083,/game/pc/endless-dungeon,T,['Amplitude Studios'],"['Strategy', 'Real-Time', 'Tactics']",2023,...,0,1,1,1,0,0,0,0,1,0
10,72.219512,Endless Dungeon,PS4,2023-10-17,7.38083,/game/playstation-4/endless-dungeon,T,['Amplitude Studios'],"['Strategy', 'Real-Time', 'Tactics']",2023,...,0,1,1,1,0,0,0,0,1,0
11,72.219512,Sonic Superstars,PC,2023-10-17,7.38083,/game/pc/sonic-superstars,E,"['Sega', ' Arzest']","['Action', 'Platformer', '2D']",2023,...,0,1,0,1,0,0,0,0,0,0


## Random Forrest & Decision Tree

In [19]:
# Split the dataset into features (X) and target (y)
X = df_encoded[["rescaled_meta_score","user_score","platform_num","release_year","a", "b", "c", "d", "e", "f", "g", "h", "i", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "x", "y", "z"]]
y = df_encoded['esrb_rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest (RF) Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

# Decision Tree (DT) Classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)

Random Forest Accuracy: 0.7385321100917431
Decision Tree Accuracy: 0.7018348623853211


## K-Means

In [20]:
# Select the features for clustering
features = ["rescaled_meta_score", "user_score", "platform_num", "release_year", "a", "b", "c", "d", "e", "f", "g", "h", "i", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "x", "y", "z"]

# Prepare the data for clustering
X = df_encoded[features].copy()

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform clustering using K-Means
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Get the cluster labels for each data point
labels = kmeans.labels_

# Add the cluster labels to the original dataset
df_encoded['cluster'] = labels





In [21]:
# Print the cluster labels
print("Cluster Labels:")
print(df_encoded['cluster'].value_counts())

# Print sample games from each cluster
print("\nSample Games from Each Cluster:")
for cluster_id in range(5):  # Assuming 5 clusters
    cluster_games = df_encoded[df_encoded['cluster'] == cluster_id]
    sample_games = cluster_games.sample(n=5)  # Print 5 sample games from each cluster
    print(f"\nCluster {cluster_id}:")
    for _, game in sample_games.iterrows():
        print(game['title'])

Cluster Labels:
cluster
4    342
3    331
1    282
2     78
0     57
Name: count, dtype: int64

Sample Games from Each Cluster:

Cluster 0:
Mystery Dungeon: Shiren the Wanderer
ESPN Major League Baseball
ESPN NHL 2K5
ESPN NHL Hockey
Daytona USA Deluxe

Cluster 1:
Sega Ages: Sonic the Hedgehog
Hell Yeah! Virtual Rabbit Missions
Sega Ages: Shinobi
Sonic Boom: Rise of Lyric
Golden Axe

Cluster 2:
Ghost Squad
Hatsune Miku: Project Diva Future Tone - Future Sound
The House of the Dead: Overkill
Virtua Fighter 2
Mercs

Cluster 3:
Conduit 2
Aliens: Colonial Marines - Stasis Interrupted
Aliens: Colonial Marines
Sonic Shuffle
After Burner: Black Falcon

Cluster 4:
Shining Force
Medieval II: Total War - Definitive Edition
Puyo Pop Fever
Valkyria Chronicles 4: Complete Edition
Let's TAP


## Time Series Analysis

In [22]:
# Load the dataset with date column
df['date'] = pd.to_datetime(df['date'])  # Convert date column to datetime format
df.set_index('date', inplace=True)  # Set date column as the index

# Create subplots
fig = sp.make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=('Meta Score', 'User Score'))

# Add traces for Meta Score
fig.add_trace(go.Scatter(x=df.index, y=df['meta_score'], name='Meta Score'), row=1, col=1)

# Add traces for User Score
fig.add_trace(go.Scatter(x=df.index, y=df['user_score'], name='User Score'), row=2, col=1)

# Update layout
fig.update_layout(height=600, width=800, title_text='Time Series: Meta Score vs User Score')
fig.update_xaxes(title_text='Date', row=2, col=1)
fig.update_yaxes(title_text='Meta Score', row=1, col=1)
fig.update_yaxes(title_text='User Score', row=2, col=1)

# Show the plot
fig.show()


# Calculate rolling mean and standard deviation
window = 12  # Rolling window size
df['rolling_mean'] = df['meta_score'].rolling(window=window).mean()
df['rolling_std'] = df['meta_score'].rolling(window=window).std()

# Plotting rolling statistics
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['meta_score'], name='Meta Score'))
fig.add_trace(go.Scatter(x=df.index, y=df['rolling_mean'], name='Rolling Mean'))
fig.add_trace(go.Scatter(x=df.index, y=df['rolling_std'], name='Rolling Std'))
fig.update_layout(title='Rolling Mean and Standard Deviation',
                  xaxis_title='Date', yaxis_title='Score')
fig.show()

# Linear regression for trend analysis
X = pd.to_numeric(df.index).values.reshape(-1, 1)
y = df['meta_score'].values
regressor = LinearRegression()
regressor.fit(X, y)
trend = regressor.predict(X)

# Plotting trend line
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['meta_score'], name='Meta Score'))
fig.add_trace(go.Scatter(x=df.index, y=trend, name='Trend'))
fig.update_layout(title='Trend Analysis', xaxis_title='Date', yaxis_title='Score')
fig.show()

## Stacked

In [23]:
# Filter rows based on your criteria
df_filtered = df[(df['user_score'] > 7) & 
                 (df['rescaled_meta_score'] > 6) & 
                 (df['rescaled_meta_score'] < 9) & 
                 (df['user_score'] < 9.5)]

# Define features (X) and target variable (y)
X = df_filtered[['rescaled_meta_score', 'platform_num', 'esrb_rating_num']]
y = df_filtered['user_score']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base models
linear_model = LinearRegression()
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train base models
linear_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)

# Generate predictions from base models
linear_preds = linear_model.predict(X_test)
rf_preds = random_forest_model.predict(X_test)
gb_preds = gradient_boosting_model.predict(X_test)

# Combine predictions using a simple average (you can experiment with different combining strategies)
stacked_preds = (linear_preds + rf_preds + gb_preds) / 3

# Calculate RMSE for stacked predictions
rmse = mean_squared_error(y_test, stacked_preds, squared=False)
print(f"Root Mean Squared Error (Stacked): {rmse}")


Root Mean Squared Error (Stacked): 0.3767772490656119
