In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [6]:
# Read the raw sales table from the CSV that sits next to the notebook.
games_df = pd.read_csv('vgsales.csv')

# Emit a heading followed by summary statistics of the numeric columns;
# sep="\n" puts the heading and the table on separate lines.
print("Games Statistics:", games_df.describe(), sep="\n")

Games Statistics:
               Rank          Year      NA_Sales      EU_Sales      JP_Sales  \
count  16598.000000  16327.000000  16598.000000  16598.000000  16598.000000   
mean    8300.605254   2006.406443      0.264667      0.146652      0.077782   
std     4791.853933      5.828981      0.816683      0.505351      0.309291   
min        1.000000   1980.000000      0.000000      0.000000      0.000000   
25%     4151.250000   2003.000000      0.000000      0.000000      0.000000   
50%     8300.500000   2007.000000      0.080000      0.020000      0.000000   
75%    12449.750000   2010.000000      0.240000      0.110000      0.040000   
max    16600.000000   2020.000000     41.490000     29.020000     10.220000   

        Other_Sales  Global_Sales  
count  16598.000000  16598.000000  
mean       0.048063      0.537441  
std        0.188588      1.555028  
min        0.000000      0.010000  
25%        0.000000      0.060000  
50%        0.010000      0.170000  
75%        0.04000

In [9]:
# Fill missing values.
# 'Year' is numeric, so gaps are imputed with the column median; the
# categorical columns get an explicit 'Unknown' placeholder category.
# The result is rebound to `games_df` instead of using `inplace=True`, so the
# frame is replaced by a new object rather than mutated behind any other
# reference to it.
games_df = games_df.fillna({
    'Year': games_df['Year'].median(),
    'Platform': 'Unknown',
    'Genre': 'Unknown',
    'Publisher': 'Unknown'
})

In [10]:
# Columns routed to each branch of the preprocessing step.
categorical_features = ['Platform', 'Genre', 'Publisher']
numeric_features = ['Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

# One transformer that standard-scales the numeric columns and one-hot
# encodes the categorical ones.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features),
])

# Fit on the whole table and build the combined feature matrix in one call.
transformed_features = preprocessor.fit_transform(games_df)

In [11]:
# Pairwise cosine similarity between every pair of rows of the feature matrix.
# NOTE(review): the result is square in the number of games; with the 16,598
# rows reported by describe() above, a dense float64 result would be roughly
# 2.2 GB - confirm there is enough memory before running on the full table.
similarity_matrix = cosine_similarity(transformed_features)

In [12]:
def get_top_3_similar_games(similarity_matrix, games_df):
    """Return, for every game, its three most similar *other* games.

    Parameters
    ----------
    similarity_matrix : numpy.ndarray
        Square array of pairwise similarity scores. Row/column ``i`` refers
        to the ``i``-th row of ``games_df`` *by position* (not by index
        label).
    games_df : pandas.DataFrame
        Table with a ``'Name'`` column, in the same row order that was used
        to build ``similarity_matrix``.

    Returns
    -------
    list of dict
        One record per game with the keys ``'Game'``, ``'Similar1'``,
        ``'Score1'``, ``'Similar2'``, ``'Score2'``, ``'Similar3'``,
        ``'Score3'``. Neighbours are ordered from most to least similar;
        ties are broken by lower row position. If fewer than three other
        games exist, the unused ``SimilarN`` slots hold ``None`` and the
        matching ``ScoreN`` slots hold ``NaN``.
    """
    top_k = 3
    # Work with positions throughout: the matrix is positional, so the
    # DataFrame's index labels must not be used to address it.
    names = games_df['Name'].tolist()

    similar_games = []
    for pos, name in enumerate(names):
        row = similarity_matrix[pos]

        # astype() returns a copy, so masking the self-similarity does not
        # touch the caller's matrix. Masking explicitly (instead of assuming
        # the game itself sorts last) keeps a game from being listed as its
        # own neighbour when other rows tie with it at the maximum score.
        ranking = row.astype(float)
        ranking[pos] = float('-inf')

        # Stable sort on the negated scores: descending similarity, with
        # deterministic tie-breaking; the masked self entry ends up last.
        order = (-ranking).argsort(kind='stable')
        neighbours = [j for j in order[:top_k] if j != pos]

        record = {'Game': name}
        for rank in range(top_k):
            if rank < len(neighbours):
                j = neighbours[rank]
                record[f'Similar{rank + 1}'] = names[j]
                record[f'Score{rank + 1}'] = row[j]
            else:
                # Not enough other games to fill this slot.
                record[f'Similar{rank + 1}'] = None
                record[f'Score{rank + 1}'] = float('nan')
        similar_games.append(record)
    return similar_games

In [13]:
# Build one record per game holding the names and similarity scores of its
# three closest matches.
similar_games = get_top_3_similar_games(similarity_matrix, games_df)

In [14]:
# Turn the list of per-game records into a table (one row per game).
similar_games_df = pd.DataFrame(similar_games)
# Write the table to disk; index=False leaves the row index out of the file.
similar_games_df.to_csv('similar_games.csv', index=False)