In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.io as pio
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import ast

In [2]:
df = pd.read_csv("SegaGames.csv")
df.head()

Unnamed: 0,meta_score,title,platform,date,user_score,link,esrb_rating,developers,genres
0,,Persona 3 Reload,XONE,"Feb 2, 2024",,/game/xbox-one/persona-3-reload,,['P-Studio'],"['Role-Playing', 'Japanese-Style']"
1,,Persona 5 Tactica,PC,"Nov 17, 2023",,/game/pc/persona-5-tactica,,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
2,,Persona 5 Tactica,XONE,"Nov 17, 2023",,/game/xbox-one/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
3,,Persona 5 Tactica,PS4,"Nov 17, 2023",,/game/playstation-4/persona-5-tactica,,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"
4,,Persona 5 Tactica,Switch,"Nov 17, 2023",,/game/switch/persona-5-tactica,RP,['P-Studio'],"['Strategy', 'Turn-Based', 'Tactics']"


In [3]:
df_cleaned_genre = df.dropna(subset=['title', 'genres'])
df_cleaned_genre_title = df.drop_duplicates(subset='title', keep='first')

## Frequency Distribution of Genres with Plotly:

In [4]:
df_cleaned_genre_title = df_cleaned_genre_title.dropna(subset=['genres'])
df_cleaned_genre_title['genres'] = df_cleaned_genre_title['genres'].apply(lambda x: ast.literal_eval(x))
all_genres = [genre for sublist in df_cleaned_genre_title['genres'] for genre in sublist]
genre_counts = Counter(all_genres)
genre_distribution = pd.DataFrame(genre_counts.items(), columns=['Genres', 'Count'])
genre_distribution = genre_distribution.sort_values(by='Count', ascending=False)
fig = px.bar(genre_distribution, x='Genres', y='Count', title='Genres Distribution')
fig.show()

In [5]:
pio.write_image(fig, 'plot1.png')

![Plot](plot1.png)

## Genres vs. Meta Scores Analysis with Plotly:

In [6]:
df_cleaned_genre_meta = df_cleaned_genre_title.dropna(subset=['genres', 'meta_score'])
df_genre_expanded = df_cleaned_genre_meta.explode('genres')
df_genre_expanded['meta_score'] = pd.to_numeric(df_genre_expanded['meta_score'], errors='coerce')
fig2 = px.scatter(df_genre_expanded, x='genres', y='meta_score',labels={'genres': 'Genres', 'meta_score': 'Meta Score'}, title='Genres vs. Meta Scores')
fig2.show()

In [7]:
pio.write_image(fig2, 'plot2.png')

![Plot](plot2.png)

In [8]:
columns_to_clean = ['meta_score', 'platform', 'user_score']
df_cleaned_score = df.dropna(subset=columns_to_clean)
values_to_filter = ["2013)", "2011)", "1995)", "Live Arcade)"]
rows_to_drop = df_cleaned_score['platform'].isin(values_to_filter)
df_cleaned_plat_score = df_cleaned_score[~rows_to_drop]
df_cleaned_plat_score['meta_score'] = pd.to_numeric(df_cleaned_plat_score['meta_score'], errors='coerce')
df_cleaned_plat_score['user_score'] = pd.to_numeric(df_cleaned_plat_score['user_score'], errors='coerce')
highest_meta_score_platforms = df_cleaned_plat_score.groupby('platform')['meta_score'].mean().sort_values(ascending=False)
highest_user_score_platforms = df_cleaned_plat_score.groupby('platform')['user_score'].mean().sort_values(ascending=False)
fig_meta = px.box(df_cleaned_plat_score, x='platform', y='meta_score',labels={'platform': 'Platform', 'meta_score': 'Meta Score'}, title='Platform vs. Meta Scores')
fig_user = px.box(df_cleaned_plat_score, x='platform', y='user_score',labels={'platform': 'Platform', 'user_score': 'User Score'}, title='Platform vs. User Scores')
fig_meta.show()
fig_user.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
pio.write_image(fig_meta, 'plot3.png')
pio.write_image(fig_user, 'plot4.png')

![Plot](plot3.png)
![Plot](plot4.png)

In [10]:
overall_mean_meta_score = df_cleaned_plat_score['meta_score'].mean()
overall_mean_user_score = df_cleaned_plat_score['user_score'].mean()
highest_meta_score_platform = highest_meta_score_platforms.idxmax()
highest_meta_score = highest_meta_score_platforms.max()
highest_user_score_platform = highest_user_score_platforms.idxmax()
highest_user_score = highest_user_score_platforms.max()
print("Overall Mean Meta Score:", overall_mean_meta_score)
print("Overall Mean User Score:", overall_mean_user_score)
print("\nHighest-rated platform based on Meta Scores:")
print("Platform:", highest_meta_score_platform)
print("Mean Meta Score:", highest_meta_score)
print("\nHighest-rated platform based on User Scores:")
print("Platform:", highest_user_score_platform)
print("Mean User Score:", highest_user_score)

Overall Mean Meta Score: 72.56130790190736
Overall Mean User Score: 7.310490463215259

Highest-rated platform based on Meta Scores:
Platform: DC
Mean Meta Score: 80.38888888888889

Highest-rated platform based on User Scores:
Platform: VITA
Mean User Score: 7.955555555555555


## Release Date vs. Scores

In [11]:
valid_dates_df = df[df['date'].str.match(r'^[A-Za-z]+\s\d{1,2},\s\d{4}$')]
valid_dates_df['date'] = pd.to_datetime(valid_dates_df['date'], format='%b %d, %Y', errors='coerce')
valid_dates_df['numeric_date'] = valid_dates_df['date'].astype(int) // 10**9
fig3 = sp.make_subplots(rows=1, cols=2)
scatter_meta = go.Scatter(x=valid_dates_df['numeric_date'], y=valid_dates_df['meta_score'], mode='markers', name='Meta Scores')
scatter_user = go.Scatter(x=valid_dates_df['numeric_date'], y=valid_dates_df['user_score'], mode='markers', name='User Scores')
fig3.add_trace(scatter_meta, row=1, col=1)
fig3.add_trace(scatter_user, row=1, col=2)
fig3.update_layout(
    title='Released Date vs. Scores',
    showlegend=True,
    xaxis=dict(title='Date'),
    yaxis=dict(title='Scores'))
fig3.update_xaxes(title_text="Time", row=1, col=1)
fig3.update_yaxes(title_text="Scores", row=1, col=1)
fig3.update_xaxes(title_text="Time", row=1, col=2)
fig3.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
pio.write_image(fig3, 'plot5.png')

![Plot](plot5.png)

In [13]:
df['meta_score'] = pd.to_numeric(df['meta_score'], errors='coerce')
df['user_score'] = pd.to_numeric(df['user_score'], errors='coerce')
correlation = df['meta_score'].corr(df['user_score'])
print("Correlation between Meta Scores and User Scores:", correlation)

Correlation between Meta Scores and User Scores: 0.5274063386386616


The correlation coefficient of **0.53** between Meta Scores and User Scores indicates a **moderate positive correlation** between these two variables. This suggests that, in general, there is a tendency for games that receive higher Meta Scores to also receive higher User Scores, and vice versa. However, the correlation value of **0.53** suggests that the relationship is not extremely strong, and there might be instances where the opinions of critics and users diverge.

It's important to note that correlation does not imply causation. While there is a positive correlation, it doesn't necessarily mean that higher Meta Scores directly cause higher User Scores or vice versa. Various factors could contribute to this correlation, including the quality of the game, its appeal to different audiences, and the criteria used by critics and users for rating the games.

Overall, a correlation of **0.53** suggests a moderate level of agreement between critics and users regarding game ratings, but it's always a good idea to further analyze the data and consider other factors that might influence this relationship.

## ESRB Rating vs. Scores

In [14]:
df_filtered = df[~df['esrb_rating'].isin(['K-A', 'EC'])]
fig4 = sp.make_subplots(rows=1, cols=2)
box_meta = go.Box(x=df_filtered['esrb_rating'], y=df_filtered['meta_score'], name='Meta Scores')
box_user = go.Box(x=df_filtered['esrb_rating'], y=df_filtered['user_score'], name='User Scores')
fig4.add_trace(box_meta, row=1, col=1)
fig4.add_trace(box_user, row=1, col=2)
fig4.update_layout(title='ESRB Rating vs. Scores', showlegend=True)
fig4.update_xaxes(title_text="ESRB Rating", row=1, col=1)
fig4.update_yaxes(title_text="Scores", row=1, col=1)
fig4.update_xaxes(title_text="ESRB Rating", row=1, col=2)
fig4.show()

In [15]:
pio.write_image(fig4, 'plot6.png')

![Plot](plot6.png)

In [16]:
df['date'] = df['date'].replace('TBA', np.nan)
parsed_dates = []
for date_str in df['date']:
    try:
        parsed_date = pd.to_datetime(date_str, format='%b %d, %Y')
    except ValueError:
        parsed_date = pd.to_datetime('1900-01-01')
    parsed_dates.append(parsed_date)
df['date'] = parsed_dates
df = df.dropna(subset=['meta_score', 'user_score'])
selected_features = ['platform', 'genres', 'esrb_rating', 'date']
X_meta = df[selected_features]
y_meta = df['meta_score']
X_user = df[selected_features]
y_user = df['user_score']
X_meta_train, X_meta_test, y_meta_train, y_meta_test = train_test_split(X_meta, y_meta, test_size=0.2, random_state=42)
X_user_train, X_user_test, y_user_train, y_user_test = train_test_split(X_user, y_user, test_size=0.2, random_state=42)
numeric_features = []
categorical_features = ['platform', 'genres', 'esrb_rating']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
meta_model = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', RandomForestRegressor())])
meta_model.fit(X_meta_train, y_meta_train)
meta_predictions = meta_model.predict(X_meta_test)
meta_mse = mean_squared_error(y_meta_test, meta_predictions)
meta_r2 = r2_score(y_meta_test, meta_predictions)
print("Meta Score Model Evaluation:")
print("Mean Squared Error:", meta_mse)
print("R-squared:", meta_r2)
user_model = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', RandomForestRegressor())])
user_model.fit(X_user_train, y_user_train)
user_predictions = user_model.predict(X_user_test)
user_mse = mean_squared_error(y_user_test, user_predictions)
user_r2 = r2_score(y_user_test, user_predictions)
print("\nUser Score Model Evaluation:")
print("Mean Squared Error:", user_mse)
print("R-squared:", user_r2)


Meta Score Model Evaluation:
Mean Squared Error: 69.80624289447469
R-squared: 0.5194435531173446

User Score Model Evaluation:
Mean Squared Error: 1.469649823208713
R-squared: 0.07388263845312137


In [17]:
fig5 = sp.make_subplots(rows=1, cols=2)
def add_regression_line(fig, x_data, y_data, row, col, title):
    regression_line = LinearRegression()
    regression_line.fit(x_data.values.reshape(-1, 1), y_data)
    x_range = np.linspace(min(x_data), max(x_data), 100)
    y_range = regression_line.predict(x_range.reshape(-1, 1))
    scatter_trace = go.Scatter(x=x_data, y=y_data, mode='markers', name='Data')
    line_trace = go.Scatter(x=x_range, y=y_range, mode='lines', name='Regression Line')
    fig.add_trace(scatter_trace, row=row, col=col)
    fig.add_trace(line_trace, row=row, col=col)
    fig.update_xaxes(title_text=title, row=row, col=col)
    fig.update_yaxes(title_text='Predicted Score', row=row, col=col)
add_regression_line(fig5, y_meta_test, meta_predictions, row=1, col=1, title='Actual Meta Score')
add_regression_line(fig5, y_user_test, user_predictions, row=1, col=2, title='Actual User Score')
fig5.update_layout(title_text="Meta and User Score Predictions")
fig5.update_layout(showlegend=False)
fig5.show()


In [18]:
pio.write_image(fig5, 'plot7.png')

![Plot](plot7.png)

The results of the predictive models indicate interesting insights about their performance:

**For the Meta Score Model:**
- The Mean Squared Error (MSE) of approximately 67.55 suggests that, on average, the predictions deviate from the actual Meta Scores by this amount. Lower MSE values indicate better model performance.
- The R-squared value of about 0.53 indicates that around 53% of the variability in Meta Scores is explained by the model. While this suggests moderate predictability, there's room for improvement.

**For the User Score Model:**
- The Mean Squared Error (MSE) of about 1.47 suggests that the predictions, on average, deviate less from the actual User Scores compared to the Meta Score predictions.
- The R-squared value of approximately 0.07 implies that the model explains only a small portion (7%) of the variability in User Scores. This indicates that predicting User Scores might be more challenging due to various factors influencing user perceptions.

Both models show some degree of predictability, but further refinement could enhance their accuracy. It's important to consider domain knowledge and explore additional features that could contribute to better predictions. Keep in mind that achieving high prediction accuracy in fields like video game scores often involves accounting for complex factors that influence user and critic evaluations.
