"""-IMPORTING NECESSARY LIBRARIES-"""

In [None]:
# Basic libraries
import pandas as pd 
import numpy as np

# Visualization libraries 
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px

# Machine learning libraries 
from sklearn.preprocessing import StandardScaler,LabelEncoder

'''Reading dataset'''

In [None]:
# Load the video-game sales data from the local CSV file
dataset_path = r"D:\portfolio project gmaes sales and ratting\Video_Games.csv"
df = pd.read_csv(dataset_path)

# Preview the first five rows
df.head(5)

'''Understanding the dataset'''
 

In [None]:
# Dimensions of the dataset as (rows, columns)
print(df.shape)

# df.info() and df.describe() give a more detailed overview when needed

# Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Names of all available columns
print(df.columns)

'''How many games have both a critic score and a user score available?'''

In [None]:
# Work on a copy of the two score columns so df itself stays untouched
score_columns = df[['Critic_Score', 'User_Score']].copy()

# 'User_Score' may contain the text placeholder 'tbd'; coerce it to NaN first,
# otherwise dropna() would count those games as having a user score
score_columns['User_Score'] = pd.to_numeric(score_columns['User_Score'], errors='coerce')

# Games for which both scores are really present
game_with_both = score_columns.dropna()
print(game_with_both.shape[0])

'''Which publisher has the highest global sales?'''


In [None]:
# Count missing entries in the two columns used below (kept for inspection)
missing_value = df[["Publisher", "Global_Sales"]].isna().sum()

# Keep only the rows that have a publisher
cleaned_values = df.dropna(subset=["Publisher"])

# Total global sales per publisher, largest first
sales_per_publisher = cleaned_values.groupby('Publisher')['Global_Sales'].sum()
global_sales = sales_per_publisher.sort_values(ascending=False)

# First ten publishers as a DataFrame, which is the input seaborn gets below
top10 = global_sales.head(10).reset_index()

# Bar chart of the ten publishers with the highest global sales
plt.figure(figsize=(10, 6))
sns.barplot(x='Publisher', y='Global_Sales', data=top10, palette='viridis')
plt.title('TOP 10 PUBLISHERS WITH THE HIGHEST GLOBAL SALES', fontsize=17)
plt.xlabel('Names of the publishing companies', fontsize=13)
plt.ylabel('Global sales by the publishing companies', fontsize=13)
plt.xticks(rotation=35, ha='right', size=12)
plt.grid(linestyle=':', linewidth=1)
plt.tight_layout()
plt.show()

# Show the table behind the chart (missing_value and cleaned_values can be
# inspected the same way)
top10

'''Which platform has the highest global sales?'''


In [None]:
# Count missing entries in the two columns used below (kept for inspection)
platform_global_sales = df[['Platform', 'Global_Sales']].isnull().sum()

# Aggregate global sales per platform, largest first
platform_sales = (
    df.groupby('Platform', as_index=False)['Global_Sales']
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
)
top10platform = platform_sales.head(10)

# Top 10 platforms with the highest global sales.
# plt.figure (lowercase) creates the figure AND makes it the current one, so the
# bar plot below is drawn on it at the requested size. plt.Figure(...) would only
# instantiate a Figure object that pyplot does not track, and the size would be
# ignored.
plt.figure(figsize=(8, 6))
sns.barplot(data=top10platform,
            x='Platform',
            y='Global_Sales',
            palette='bwr')
plt.grid(linestyle=':', linewidth=1)
plt.title('TOP 10 PLATFORMS WITH THE HIGHEST GLOBAL SALES', fontsize=17)
plt.xlabel('Top 10 platforms', fontsize=13)
plt.ylabel('Global sales', fontsize=13)
plt.tight_layout()
plt.show()

'''Which genre has the highest global sales?'''


In [None]:
# Count missing entries in the two columns used below (kept for inspection)
genre_sales = df[['Genre', 'Global_Sales']].isnull().sum()

# Keep only the rows that have a genre
clean_genre_sales = df.dropna(subset=['Genre'])

# Aggregate global sales per genre, largest first
genre_global_sales = (
    clean_genre_sales.groupby('Genre')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# Top 10 genres with the highest global sales
top10_genre = genre_global_sales.head(10).reset_index()

# Visualization.
# The keys of `labels` must be the DataFrame column names, so the y-axis entry is
# keyed by 'Global_Sales' (with underscore); a key that matches no column is
# silently ignored and the axis would keep the raw column name.
# NOTE(review): confirm the unit ("thousand units") against the dataset's documentation.
fig = px.bar(top10_genre,
             x='Genre',
             y='Global_Sales',
             color='Genre',
             title='Top 10 Genre with the highest global sales',
             labels={'Genre': 'Game genre', 'Global_Sales': 'Global sales in thousand units'})
fig.show()

'''Which feature correlates most with a high user score?'''


In [None]:
# Convert 'User_Score' to numeric; non-numeric entries such as 'tbd' become NaN
df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

# Replace every text column with integer category codes so that it can take part
# in the correlation. NOTE: this overwrites the columns of df in place.
for column in df.columns:
    # Only object (text) columns need encoding
    if df[column].dtype == 'object':
        category_codes = df[column].astype('category').cat.codes
        # cat.codes marks missing values with -1; turn them back into NaN so that
        # corr() skips them instead of treating "missing" as one more category
        df[column] = category_codes.replace(-1, np.nan)

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Visualizing the correlation matrix using a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix,
            annot=True,
            annot_kws={'size': 10},
            cmap='coolwarm',
            fmt='.2f',
            linecolor='black',
            linewidths=0.5)
plt.title('Correlation Matrix for All Features')
plt.show()

'''Based on the previous heat map, select only the columns that actually relate to the user score'''

In [None]:
# Select only the specific columns
specific_columns = ['Name', 'Platform', 'Genre', 'Publisher', 'Global_Sales',
                    'Critic_Score', 'User_Score', 'Developer', 'Rating']

# Restrict df to those columns; .copy() gives an independent frame so the
# assignments below do not write into a slice of the previous DataFrame
df = df[specific_columns].copy()

# Change 'tbd' (and any other non-numeric) values to NaN
df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

# Loop through each column
for col in df.columns:
    # Only object (text) columns need encoding
    if df[col].dtype == 'object':
        # Integer category codes; -1 (the code for a missing value) goes back to
        # NaN so that corr() ignores it. No further label encoding is applied
        # afterwards, because that would give NaN a class number of its own again.
        df[col] = df[col].astype('category').cat.codes.replace(-1, np.nan)

# Calculating the correlation matrix
correlation_matrix = df.corr()

# Visualizing the correlation matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix,
            cmap='coolwarm',
            annot=True,
            annot_kws={'size': 10},
            fmt='.2f',
            linewidths=0.5,
            linecolor='black')
plt.title('Correlation Matrix for selacted columns')
plt.show()

'''Examine how video game sales change over the years and identify periods of significant growth or decline in sales'''

In [None]:
# Load the data again, because earlier cells overwrote df
df = pd.read_csv(r"D:\portfolio project gmaes sales and ratting\Video_Games.csv")

# Count missing entries in the two columns used below (kept for inspection)
sales_year_null = df[['Year_of_Release', 'Global_Sales']].isnull().sum()

# Release years without missing values, as whole numbers
clean_sales_year_null = df['Year_of_Release'].dropna().astype(int)

# Total global sales per year, computed on the cleaned integer years so the
# x-axis carries whole years rather than floats. Only rows that have a release
# year are selected, which keeps the values and the grouping key on the same index.
sales_years = (
    df.loc[clean_sales_year_null.index, 'Global_Sales']
    .groupby(clean_sales_year_null)
    .sum()
    .sort_index()
)

# Visualization
plt.figure(figsize=(12, 5))
sales_years.plot(kind='line', marker='o')
plt.xlabel("Year of Release")
plt.ylabel("Global Sales")
plt.title("Global Video Game Sales Over Years")
# Highlighted periods are hard-coded; presumably chosen by inspecting the curve,
# so re-check them if the data changes
plt.axvspan(2005, 2008, color='green', alpha=0.2, label='Growth')
plt.axvspan(2010, 2017, color='red', alpha=0.2, label='Decline')
plt.legend(loc='upper left')
plt.grid(True)
plt.show()

'''Compare the popularity of genres or platforms across the different regions'''

In [None]:
# Sum the regional sales columns for every genre
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
pop_genre = df.groupby('Genre')[region_columns].sum()

# Bar chart with one group of bars per genre and one bar per regional column
pop_genre.plot.bar(figsize=(11, 5))
plt.title("Compare the popularity of the genres or platform in the different regions")
plt.xlabel("Genres")
plt.ylabel("All kind of sales")
plt.xticks(rotation=0)
plt.grid(linestyle=":", linewidth=1)
plt.tight_layout()
plt.show()

'''Correlation between user score and critic score: explore how closely user opinions align with professional critics'''

In [None]:
# Work on a copy of the original DataFrame (df) so the original data is not changed
df_c = df.copy()

# Count missing entries in the two score columns (kept for inspection)
sel_c = df_c[['User_Score', 'Critic_Score']].isnull().sum()

# Convert text values such as 'tbd' to NaN BEFORE dropping missing rows, so that
# those rows are removed as well and no NaN is left in the cleaned frame
df_c['User_Score'] = pd.to_numeric(df_c['User_Score'], errors='coerce')
clean_sel_c = df_c[['User_Score', 'Critic_Score']].dropna()

# Correlation between user score and critic score
corr_c = clean_sel_c['User_Score'].corr(clean_sel_c['Critic_Score'])

# Scatter plot; a little random noise is added to both axes so that points with
# identical scores do not sit exactly on top of each other
n_points = len(clean_sel_c)
plt.figure(figsize=(15, 6))
plt.scatter(x=clean_sel_c['User_Score'] + np.random.normal(0, 0.1, n_points),
            y=clean_sel_c['Critic_Score'] + np.random.normal(0, 0.1, n_points),
            alpha=0.5,
            color='g')
plt.xlabel('User Score')
plt.ylabel('Critic Score')
plt.title(f'User vs Critic Scores (Correlation: {corr_c:.2f})')
plt.grid(True)
plt.show()

# Optional: the same data as a hexbin plot, where colour shows how many games
# fall into each cell
plt.figure(figsize=(14, 5))
plt.hexbin(clean_sel_c['User_Score'],
           clean_sel_c['Critic_Score'],
           gridsize=25,
           cmap='Blues')
plt.colorbar(label='Count')
plt.xlabel('User Score')
plt.ylabel('Critic Score')
plt.show()

'''Machine learning model to predict future sales based on various features'''

In [None]:
# Importing machine learning libraries
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Work on a copy of the original DataFrame (df) so the original data is not changed
df_d = df.copy()

# Selecting features and target
features = df_d[['Platform', 'Genre', 'Publisher', 'Critic_Score', 'User_Score',
                 'Rating', 'Year_of_Release', 'Developer']]
target = df_d['Global_Sales']

# Handling missing values: coerce non-numeric user scores (e.g. 'tbd') to NaN,
# then keep only the rows in which every feature is present
features = features.copy()
features['User_Score'] = pd.to_numeric(features['User_Score'], errors='coerce')
features_c = features.dropna()
# Target values for exactly the rows that survived the dropna above
target_clean = df_d.loc[features_c.index, 'Global_Sales']

# Separating categorical and numerical columns
features_numerical = ['User_Score', 'Critic_Score', 'Year_of_Release']
categorical_features = ['Platform', 'Genre', 'Publisher', 'Rating', 'Developer']

# Preprocessing pipelines. Because rows with missing features were dropped above,
# the imputers have nothing to fill in on this data; they only matter if the
# fitted model is later given rows that contain missing values.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore': categories unseen during fit do not raise at predict time
    ('oneencoder', OneHotEncoder(handle_unknown='ignore')),
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, features_numerical),
    ('cat', categorical_transformer, categorical_features),
])

# Create the preprocessing and training pipeline
model = Pipeline(steps=[
    ('preprocessior', preprocessor),
    ('regresion', RandomForestRegressor(n_estimators=40, random_state=42)),
])

# Split the data into training and testing sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features_c,
                                                    target_clean,
                                                    test_size=0.2,
                                                    random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pre = model.predict(X_test)

# Evaluate the model; the documented argument order is (y_true, y_pred)
mse = mean_squared_error(y_test, y_pre)
rmse = mse ** 0.5

# Printing result
print('mse:', mse)
print('rmse:', rmse)