In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
import os
print(os.getcwd())

/aiffel/aiffel/0002


In [25]:
# 데이터 로드
df = pd.read_csv('/aiffel/aiffel/0002/modified_data_03.csv')

# 결측값 처리
df.loc[df['id'] == 119609, 'slug'] = 'nan'
df.loc[df['id'] == 119609, 'name'] = 'NaN'
df.loc[df['id'] == 100122, 'slug'] = 'null'
df.loc[df['id'] == 100122, 'name'] = 'NULL'
df.loc[df['id'] == 468408, 'slug'] = 'none'
df.loc[df['id'] == 468408, 'name'] = 'None'

In [26]:
# 필요한 열만 선택
games_df = df[['id', 'name', 'genre', 'theme', 'tags', 'description']]

# 장르, 테마, 태그를 하나의 문자열로 결합
games_df['combined_features'] = (
     games_df['description'] + ' ' + 
    (games_df['genre'].str.replace(',', ' ') + ' ' ) * 6 + 
    (games_df['theme'].str.replace(',', ' ') + ' ' ) * 4 +  
     games_df['tags'].str.replace(',', ' ')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df['combined_features'] = (


In [27]:
games_df.head()

Unnamed: 0,id,name,genre,theme,tags,description,combined_features
0,741344,Peace Angel,Shooter,,,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...
1,374441,Brawl Planet,Shooter,,"Singleplayer,Space,shotter",Eres un comandante al mando de la nave inteles...,Eres un comandante al mando de la nave inteles...
2,97470,OBELUS - Arcade Boss Rush,"Action,Platformer,Shooter",Sci-Fi,"2D,Sci-fi,Pixel Graphics,Destruction,Monsters,...","In OBELUS, a bold robot battles three gargantu...","In OBELUS, a bold robot battles three gargantu..."
3,306287,Pimple Popper Lite,"Action,Arcade,Casual",,"friends,fun,scratch","Hello, you! We know you're itching for some fi...","Hello, you! We know you're itching for some fi..."
4,176964,SQUARE SQUARE,Action,,"2D,Score Attack,Black and White",Left/right arrows - moveUp - restartClick on t...,Left/right arrows - moveUp - restartClick on t...


In [28]:
# TF-IDF 벡터화
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8)
tfidf_matrix = tfidf.fit_transform(games_df['combined_features'])

# NearestNeighbors 모델 사용
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# 게임 이름과 인덱스 매핑
indices = pd.Series(games_df.index, index=games_df['name']).drop_duplicates()

In [36]:
tfidf_matrix.shape

(550564, 1021555)

In [50]:
# 추천 함수
def recommend_games(title, nn_model, tfidf_matrix, indices, n_recommendations=10):
    if title not in indices:
        return "Game not found in dataset."
    
    idx = indices[title]
    query_vector = tfidf_matrix[idx]
    distances, indices = nn_model.kneighbors(query_vector, n_neighbors=n_recommendations + 1)
    
    sim_scores = list(zip(indices.flatten(), distances.flatten()))
    sim_scores = sorted(sim_scores, key=lambda x: x[1])
    game_indices = [i[0] for i in sim_scores[1:]]
    
    recommendations = games_df[['name', 'genre', 'theme']].iloc[game_indices]
    recommendations['distance'] = [score[1] for score in sim_scores[1:]]
    
    return recommendations

In [51]:
df[df['name'] == 'Pimple Popper Lite'][['name', 'genre', 'theme']]

Unnamed: 0,name,genre,theme
3,Pimple Popper Lite,"Action,Arcade,Casual",


In [52]:
recommend_games('Pimple Popper Lite', nn, tfidf_matrix, indices)

Unnamed: 0,name,genre,theme,distance
444634,Pimple Popper,"Action,Arcade,Casual,Simulation",,0.054911
134952,Pimple Popper Seasons,"Action,Arcade,Casual",,0.078371
527387,Pimple Popper Pixel,"Arcade,Casual,Simulation",,0.11491
515558,MEGA Pimple Popper,"Arcade,Casual,Party,Simulation",,0.203498
430898,Pimple Popper 2,"Arcade,Casual,Party,Simulation",,0.205545
371555,Infinite Zombie,"Action,Arcade,Casual",,0.345106
59883,Riot Runners,"Action,Arcade",,0.374503
492179,Piano City,Arcade,,0.389605
404249,Slime Flight,"Arcade,Casual,Racing",,0.413272
210719,Candy Flight,"Action,Arcade",,0.425032


In [53]:
# 평가 함수
def calculate_average_similarity(num_samples=100, num_recommendations=10):
    sample_games = np.random.choice(games_df['name'], num_samples, replace=False)
    total_distance = 0
    total_recommendations = 0
    
    for game in sample_games:
        recommendations = recommend_games(game, nn, tfidf_matrix, indices, num_recommendations)
        if isinstance(recommendations, pd.DataFrame):
            total_distance += recommendations['distance'].sum()
            total_recommendations += len(recommendations)
    
    average_distance = total_distance / total_recommendations
    return average_distance

In [54]:
average_similarity_distance = calculate_average_similarity(num_samples=100, num_recommendations=10)
print(f'Average Similarity Distance: {average_similarity_distance}')

Average Similarity Distance: 0.462356796475423


### 칼럼 추가

In [37]:
# 필요한 열만 선택
test_df = df[['id', 'name', 'genre', 'rating', 'released', 'theme', 'platforms', 'tags', 'developers', 'publishers']]
test_df = test_df.fillna('')

# 장르, 테마, 태그를 하나의 문자열로 결합
test_df['combined_features'] = (
    (test_df['genre'].str.replace(',', ' ') + ' ' ) * 6 + 
    (test_df['theme'].str.replace(',', ' ') + ' ' ) * 2 +  
     test_df['tags'].str.replace(',', ' ') +
     test_df['released'] + 
    (test_df['platforms'].str.replace(',', ' ') + ' ') * 4 +
    (test_df['developers'].str.replace(',', ' ') + ' ') * 3 +
    (test_df['publishers'].str.replace(',', ' ') + ' ') * 2 
)

In [49]:
test_df.head()

Unnamed: 0,id,name,genre,rating,released,theme,platforms,tags,developers,publishers,combined_features
0,741344,Peace Angel,Shooter,0.0,2022-02-14,,PC,,神戸電子ゲームソフト分野,,Shooter Shooter Shooter Shooter Shooter Shoote...
1,374441,Brawl Planet,Shooter,0.0,2019-09-09,,Web,"Singleplayer,Space,shotter",AlexisBot,,Shooter Shooter Shooter Shooter Shooter Shoote...
2,97470,OBELUS - Arcade Boss Rush,"Action,Platformer,Shooter",0.0,2018-05-22,Sci-Fi,PC,"2D,Sci-fi,Pixel Graphics,Destruction,Monsters,...","3xBlast,BlauwPrint",,Action Platformer Shooter Action Platformer Sh...
3,306287,Pimple Popper Lite,"Action,Arcade,Casual",0.0,2009-10-12,,iOS,"friends,fun,scratch",Room Candy Games,Room Candy Games,Action Arcade Casual Action Arcade Casual Acti...
4,176964,SQUARE SQUARE,Action,0.0,2016-04-07,,"PC,MacOS,Linux,Web","2D,Score Attack,Black and White",Dmitry Degtyarev,,Action Action Action Action Action Action ...


In [38]:
# TF-IDF 벡터화
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(test_df['combined_features'])

# NearestNeighbors 모델 사용
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# 게임 이름과 인덱스 매핑
indices = pd.Series(test_df.index, index=test_df['name']).drop_duplicates()

In [55]:
# 추천 함수
def recommend_games(title, nn_model, tfidf_matrix, indices, n_recommendations=10):
    if title not in indices:
        return "Game not found in dataset."
    
    idx = indices[title]
    query_vector = tfidf_matrix[idx]
    distances, indices = nn_model.kneighbors(query_vector, n_neighbors=n_recommendations + 1)
    
    sim_scores = list(zip(indices.flatten(), distances.flatten()))
    sim_scores = sorted(sim_scores, key=lambda x: x[1])
    game_indices = [i[0] for i in sim_scores[1:]]
    
    recommendations = test_df[['name', 'released', 'genre', 'theme', 'platforms', 'developers', 'publishers']].iloc[game_indices]
    recommendations['distance'] = [score[1] for score in sim_scores[1:]]
    
    return recommendations

#### 테스트

In [40]:
df[df['name'] == 'Pimple Popper Lite'][['name', 'released', 'genre', 'theme', 'platforms', 'developers', 'publishers']]

Unnamed: 0,name,released,genre,theme,platforms,developers,publishers
3,Pimple Popper Lite,2009-10-12,"Action,Arcade,Casual",,iOS,Room Candy Games,Room Candy Games


In [41]:
recommend_games('Pimple Popper Lite', nn, tfidf_matrix, indices)

Unnamed: 0,name,released,genre,theme,platforms,developers,publishers,distance
444634,Pimple Popper,2009-04-27,"Action,Arcade,Casual,Simulation",,iOS,Room Candy Games,Room Candy Games,0.054911
134952,Pimple Popper Seasons,2012-01-28,"Action,Arcade,Casual",,iOS,Room Candy Games,Room Candy Games,0.078371
527387,Pimple Popper Pixel,2013-11-14,"Arcade,Casual,Simulation",,iOS,Room Candy Games,Room Candy Games,0.11491
515558,MEGA Pimple Popper,2019-02-24,"Arcade,Casual,Party,Simulation",,iOS,Room Candy Games,Room Candy Games,0.203498
430898,Pimple Popper 2,2019-03-16,"Arcade,Casual,Party,Simulation",,iOS,Room Candy Games,Room Candy Games,0.205545
371555,Infinite Zombie,2018-02-24,"Action,Arcade,Casual",,iOS,Candy Soft,CANDY SOFT,0.345106
59883,Riot Runners,2014-02-13,"Action,Arcade",,iOS,Room 8,Room 8,0.374503
492179,Piano City,2014-07-17,Arcade,,iOS,Room 8,Room 8,0.389605
404249,Slime Flight,2017-09-16,"Arcade,Casual,Racing",,iOS,Candy Soft,CANDY SOFT,0.413272
210719,Candy Flight,2014-03-21,"Action,Arcade",,iOS,Candy Soft,CANDY SOFT,0.425032


In [42]:
df[df['name'] == 'League of Legends'][['name', 'released', 'genre', 'theme', 'platforms', 'developers', 'publishers']]

Unnamed: 0,name,released,genre,theme,platforms,developers,publishers
550345,League of Legends,2009-10-27,"Action,MOBA,RPG,Strategy",,"MacOS,PC",Riot Games,Riot Games


In [43]:
recommend_games('League of Legends', nn, tfidf_matrix, indices)

Unnamed: 0,name,released,genre,theme,platforms,developers,publishers,distance
334629,League of Legends: Wild Rift,2020-10-27,"Action,MOBA,Strategy",,"iOS,Android",Riot Games,Riot Games,0.106655
192997,Project F: Riot’s Action RPG,,"Action,RPG",,PC,Riot Games,,0.295102
258130,Teamfight Tactics,2019-06-18,Strategy,,"PC,MacOS,iOS,Android",Riot Games,Riot Games,0.355238
229250,Niloc,2020-03-09,"Action,MOBA,Strategy",,PC,Sollara Games,,0.408423
304250,FODA,2017-10-31,"MOBA,RPG,Strategy",Nature,PC,racascou,,0.423008
278507,Endless League,2018-10-23,"Action,MOBA,Shooter,Strategy",,Web,Aden Games,,0.423434
141352,World Of Wizards,,"MOBA,RPG",,Android,GG Games,,0.42353
509215,MEDLand Mayhem Windows + macOS,2023-01-25,"MOBA,RPG,Strategy",,"MacOS,PC",WulfshieldDesignStudios,,0.429313
76910,Legends of Runeterra,2020-04-28,"Card,Strategy",,"PC,iOS,Android",Riot Games,Riot Games,0.430512
412982,Spellheart,2016-04-29,"Action,Action,RPG,MOBA",,PC,spellheartgame,,0.448214


#### 평가

In [56]:
# 평가 함수
def calculate_average_similarity(num_samples=1000, num_recommendations=10):
    sample_games = np.random.choice(games_df['name'], num_samples, replace=False)
    total_distance = 0
    total_recommendations = 0
    
    for game in sample_games:
        recommendations = recommend_games(game, nn, tfidf_matrix, indices, num_recommendations)
        if isinstance(recommendations, pd.DataFrame):
            total_distance += recommendations['distance'].sum()
            total_recommendations += len(recommendations)
    
    average_distance = total_distance / total_recommendations
    return average_distance

In [58]:
average_similarity_distance = calculate_average_similarity(num_samples=1000, num_recommendations=10)
print(f'Average Similarity Distance: {average_similarity_distance}')

Average Similarity Distance: 0.4660632730738703
