# 내용 기반 필터링(Content-based Filtering)
- description, genre, tags, theme 등을 사용하여 유사한 게임을 추천한다.
- TF-IDF, Word2Vec, BERT와 같은 텍스트 임베딩을 사용하여 게임 설명을 벡터화하고 유사도를 계산한다.

## 데이터 준비 및 전처리

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD
from annoy import AnnoyIndex
import numpy as np
import os

print(os.getcwd())

/aiffel/aiffel/0001


### 데이터 로드

In [3]:
data = pd.read_csv('/aiffel/aiffel/0001/modified_data_02.csv')
data.head()

Unnamed: 0,id,slug,name,description,released,status,tba,background_image,website,rating,...,tags,mode,developers,requirements,added_status_yet,added_status_owned,added_status_beaten,added_status_toplay,added_status_dropped,added_status_playing
0,741344,peace-angel,Peace Angel,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,2022-02-14,Released,False,https://media.rawg.io/media/screenshots/415/41...,No Website,0.0,...,No Tag,Mode-less game,神戸電子ゲームソフト分野,{},0,0,0,0,0,0
1,374441,brawl-planet,Brawl Planet,Eres un comandante al mando de la nave inteles...,2019-09-09,Released,False,https://media.rawg.io/media/screenshots/bd6/bd...,No Website,0.0,...,Singleplayer||Space||shotter,Singleplayer,AlexisBot,{},0,0,0,0,0,0
2,97470,obelus-arcade-boss-rush,OBELUS - Arcade Boss Rush,"In OBELUS, a bold robot battles three gargantu...",2018-05-22,Released,False,https://media.rawg.io/media/screenshots/736/73...,No Website,0.0,...,2D||Sci-fi||Pixel Graphics||Destruction||Monst...,Boss Rush,3xBlast||BlauwPrint,{},0,0,0,0,0,0
3,306287,pimple-popper-lite,Pimple Popper Lite,"Hello, you! We know you're itching for some fi...",2009-10-12,Released,False,https://media.rawg.io/media/screenshots/be3/be...,http://www.roomcandygames.com,0.0,...,friends||fun||scratch,Mode-less game,Room Candy Games,"{'minimum': 'iPad 2 Wifi, iPad 2 3G, iPhone 4S...",0,0,0,0,0,0
4,176964,square-square,SQUARE SQUARE,Left/right arrows - moveUp - restartClick on t...,2016-04-07,Released,False,https://media.rawg.io/media/screenshots/f26/f2...,No Website,0.0,...,2D||Score Attack||Black and White,Mode-less game,Dmitry Degtyarev,{},0,0,0,0,0,0


### 필요한 컬럼 선택

In [4]:
df = data[['id', 'name', 'description', 'genre', 'tags', 'theme']]
df.head()

Unnamed: 0,id,name,description,genre,tags,theme
0,741344,Peace Angel,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,Shooter,No Tag,Themeless
1,374441,Brawl Planet,Eres un comandante al mando de la nave inteles...,Shooter,Singleplayer||Space||shotter,Themeless
2,97470,OBELUS - Arcade Boss Rush,"In OBELUS, a bold robot battles three gargantu...",Action||Platformer||Shooter,2D||Sci-fi||Pixel Graphics||Destruction||Monst...,Sci-Fi
3,306287,Pimple Popper Lite,"Hello, you! We know you're itching for some fi...",Action||Arcade||Casual,friends||fun||scratch,Themeless
4,176964,SQUARE SQUARE,Left/right arrows - moveUp - restartClick on t...,Action,2D||Score Attack||Black and White,Themeless


### 내용 기반 필터링을 위한 텍스트 컬럼 생성

In [5]:
df['content'] = (df['description'] + ' ' + 
                (df['genre'].str.replace('||', ' ', regex=True) + ' ') * 6 + 
                 df['tags'].str.replace('||', ' ', regex=True) + ' ' + 
                (df['theme'].str.replace('||', ' ', regex=True) + ' ') * 4)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = (df['description'] + ' ' +


Unnamed: 0,id,name,description,genre,tags,theme,content
0,741344,Peace Angel,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,Shooter,No Tag,Themeless,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...
1,374441,Brawl Planet,Eres un comandante al mando de la nave inteles...,Shooter,Singleplayer||Space||shotter,Themeless,Eres un comandante al mando de la nave inteles...
2,97470,OBELUS - Arcade Boss Rush,"In OBELUS, a bold robot battles three gargantu...",Action||Platformer||Shooter,2D||Sci-fi||Pixel Graphics||Destruction||Monst...,Sci-Fi,"In OBELUS, a bold robot battles three gargantu..."
3,306287,Pimple Popper Lite,"Hello, you! We know you're itching for some fi...",Action||Arcade||Casual,friends||fun||scratch,Themeless,"Hello, you! We know you're itching for some fi..."
4,176964,SQUARE SQUARE,Left/right arrows - moveUp - restartClick on t...,Action,2D||Score Attack||Black and White,Themeless,Left/right arrows - moveUp - restartClick on t...


## 텍스트 벡터화

### TF-IDF 벡터라이저 초기화

In [6]:
# tfidf = TfidfVectorizer(stop_words='english', max_features=10000)  # 최대 피처 수 제한
tfidf = TfidfVectorizer(stop_words='english')
tfidf

TfidfVectorizer(stop_words='english')

### content 컬럼을 TF-IDF 행렬로 변환

In [8]:
tfidf_matrix = tfidf.fit_transform(df['content'])
tfidf_matrix

<529715x985788 sparse matrix of type '<class 'numpy.float64'>'
	with 23053848 stored elements in Compressed Sparse Row format>

In [18]:
tfidf_matrix.shape

(529715, 985788)

### 차원 축소 기법

In [9]:
# Truncated SVD를 사용하여 차원 축소
svd = TruncatedSVD(n_components=100)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# 차원 축소된 TF-IDF 행렬의 크기 확인
print(tfidf_matrix_reduced.shape)

(529715, 100)


## 유사도 계산

### Annoy를 사용한 유사도 검색

In [10]:
# Annoy 인덱스 생성
annoy_index = AnnoyIndex(100, 'angular')

# Annoy 인덱스에 아이템 추가
for i in range(tfidf_matrix_reduced.shape[0]):
    annoy_index.add_item(i, tfidf_matrix_reduced[i])

# Annoy 인덱스 빌드
annoy_index.build(400)  # 트리 개수

True

## 특정 게임의 인덱스 가져오기

In [11]:
def get_similar_games(game_title, num_recommendations=10):
    if game_title not in df['name'].values:
        return "Game not found in dataset."
    
    # 특정 게임의 인덱스 가져오기
    idx = df.index[df['name'] == game_title].tolist()[0]
    
    # 해당 게임과 유사한 게임 인덱스 및 거리 계산
    similar_games, distances = annoy_index.get_nns_by_item(idx, num_recommendations + 1, include_distances=True)
    
    # 유사한 게임 인덱스 추출 (자기 자신 제외)
    similar_games = similar_games[1:]
    distances = distances[1:]
    
    # 유사한 게임 정보 추출
    similar_games_info = df[['name', 'genre', 'theme']].iloc[similar_games]
    similar_games_info['distance'] = distances
    
    return similar_games_info

In [12]:
df[df['name'] == 'Pimple Popper Lite'][['name', 'genre', 'theme']]

Unnamed: 0,name,genre,theme
3,Pimple Popper Lite,Action||Arcade||Casual,Themeless


In [13]:
get_similar_games('Pimple Popper Lite', 10)

Unnamed: 0,name,genre,theme,distance
429775,Pimple Popper,Action||Arcade||Casual||Simulation,Themeless,0.144138
268374,Ragdoll Blaster 3,Physics||Puzzle||Simulation,Themeless,0.698746
51439,iQuarters,Arcade||Physics||Sports,Themeless,0.711226
241514,Sweet Candies 2 - Huge Match 3,Board||Puzzle,Themeless,0.733473
189524,Sweet Candies 2: Match 3 Games,Board||Puzzle,Action||Comedy,0.733474
219,Zero Fun - Number Action,Arcade||Casual||Puzzle,Themeless,0.736202
394602,SIKE! Bank Shot Basketball,Arcade||Casual||Sports,Themeless,0.73761
20929,Angry Birds Match,Party||Puzzle,Themeless,0.739151
408818,"Kids & Play Cars, Trucks, Emergency & Construc...",Family,Themeless,0.741633
493158,Ski Girl Superstar,Adventure||Party||RPG,Music||Sci-Fi,0.743374


In [14]:
df[df['name'] == 'OBELUS - Arcade Boss Rush'][['name', 'genre', 'theme']]

Unnamed: 0,name,genre,theme
2,OBELUS - Arcade Boss Rush,Action||Platformer||Shooter,Sci-Fi


In [15]:
get_similar_games('OBELUS - Arcade Boss Rush', 10)

Unnamed: 0,name,genre,theme,distance
63169,Metro Boy in Development,Action||Adventure||Horror,Themeless,0.782686
71418,Bombjack,Platformer,Themeless,0.790276
406625,DeepMine,Metroidvania||Platformer||Shooter,Themeless,0.810871
513526,Back to Nature (Rajcsányi László (WLS)),Action,Themeless,0.837814
423469,Core & Crust,Platformer,Horror,0.840908
298844,Tiki Quest,Action||Platformer,Sandbox,0.846832
520995,Flappy Beer (L_C_S),Platformer,Themeless,0.846871
274239,Johnnie's Space Adventure,Platformer,Themeless,0.846998
150244,The Fantastic Gobling,Misc,Themeless,0.847871
273410,Fake Lethal League,Fighting,Themeless,0.84828


In [16]:
df[df['name'] == 'Monster Hunter 4'][['name', 'genre', 'theme']]

Unnamed: 0,name,genre,theme
529711,Monster Hunter 4,Action||RPG,Themeless


In [17]:
get_similar_games('Monster Hunter 4', 10)

Unnamed: 0,name,genre,theme,distance
90,Doom Souls,Misc,Themeless,0.0
338,The Power of Northeser Ocean,Misc,Themeless,0.0
347,Pferd & Pony: Lass uns reiten 2,Sports,Themeless,0.0
517,Universal Combat: A World Apart,Action||Shooter||Simulation,Themeless,0.0
663,World War II Panzer Claws 2,Strategy,Themeless,0.0
683,Sheech,Misc,Themeless,0.0
744,Elemental Robotics - WebGL,Platformer,Themeless,0.0
762,l'Abbaye des Morts,Action||Arcade,Themeless,0.0
977,AFROMAN wonder brother bounce,Misc,Themeless,0.0
1021,Accelerate (Emilio01),Misc,Themeless,0.0


In [None]:
# 전체 데이터셋의 평균 유사도 거리 계산 함수
def calculate_average_similarity(num_samples=100, num_recommendations=10):
    sample_games = np.random.choice(df['name'], num_samples, replace=False)
    total_distance = 0
    total_recommendations = 0
    
    for game in sample_games:
        recommendations = get_similar_games(game, num_recommendations)
        if isinstance(recommendations, pd.DataFrame):
            total_distance += recommendations['distance'].sum()
            total_recommendations += len(recommendations)
    
    average_distance = total_distance / total_recommendations
    return average_distance

# 평균 유사도 거리 계산 및 출력
average_similarity_distance = calculate_average_similarity(num_samples=100, num_recommendations=10)
print(f'Average Similarity Distance: {average_similarity_distance}')

### 해당 게임과 유사한 게임 인덱스 및 거리 계산

In [16]:
# 예: 'Brawl Planet', 'Pimple Popper Lite'
game_title = 'Monster Hunter 4'
idx = df.index[df['name'] == game_title].tolist()[0]

In [17]:
similar_games, distances = annoy_index.get_nns_by_item(idx, 10, include_distances=True)

# 유사한 게임 인덱스 추출 (자기 자신 제외)
similar_games = similar_games[1:]

# 유사한 게임 이름 반환
similar_game_names = df['name'].iloc[similar_games]
print(similar_game_names)

142            Extinguish the dinosaurs
205              Sudoku Ball: Detective
289              Sisyphus: Salmon Run !
548              Corel Wild Board Games
579                 Bicycle Board Games
652     Rock Paper Scissors Tic Tac Toe
787      Aliens: A Comic Book Adventure
909              High effort vexon pack
1309                         PlayerPush
Name: name, dtype: object
