# 내용 기반 필터링(Content-based Filtering)
- description, genre, tags, theme 등을 사용하여 유사한 게임을 추천한다.
- TF-IDF, Word2Vec, BERT와 같은 텍스트 임베딩을 사용하여 게임 설명을 벡터화하고 유사도를 계산한다.

## 데이터 준비 및 전처리

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD

### 데이터 로드

In [2]:
# Load the game metadata CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/Users/AIFFELthon/final/data/modified_data_02.csv',
)
# Last expression of the cell: renders the first five rows as a preview.
data.head()

Unnamed: 0,id,slug,name,description,released,status,tba,background_image,website,rating,...,tags,mode,developers,requirements,added_status_yet,added_status_owned,added_status_beaten,added_status_toplay,added_status_dropped,added_status_playing
0,741344,peace-angel,Peace Angel,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,2022-02-14,Released,False,https://media.rawg.io/media/screenshots/415/41...,No Website,0.0,...,No Tag,Mode-less game,神戸電子ゲームソフト分野,{},0,0,0,0,0,0
1,374441,brawl-planet,Brawl Planet,Eres un comandante al mando de la nave inteles...,2019-09-09,Released,False,https://media.rawg.io/media/screenshots/bd6/bd...,No Website,0.0,...,Singleplayer||Space||shotter,Singleplayer,AlexisBot,{},0,0,0,0,0,0
2,97470,obelus-arcade-boss-rush,OBELUS - Arcade Boss Rush,"In OBELUS, a bold robot battles three gargantu...",2018-05-22,Released,False,https://media.rawg.io/media/screenshots/736/73...,No Website,0.0,...,2D||Sci-fi||Pixel Graphics||Destruction||Monst...,Boss Rush,3xBlast||BlauwPrint,{},0,0,0,0,0,0
3,306287,pimple-popper-lite,Pimple Popper Lite,"Hello, you! We know you're itching for some fi...",2009-10-12,Released,False,https://media.rawg.io/media/screenshots/be3/be...,http://www.roomcandygames.com,0.0,...,friends||fun||scratch,Mode-less game,Room Candy Games,"{'minimum': 'iPad 2 Wifi, iPad 2 3G, iPhone 4S...",0,0,0,0,0,0
4,176964,square-square,SQUARE SQUARE,Left/right arrows - moveUp - restartClick on t...,2016-04-07,Released,False,https://media.rawg.io/media/screenshots/f26/f2...,No Website,0.0,...,2D||Score Attack||Black and White,Mode-less game,Dmitry Degtyarev,{},0,0,0,0,0,0


### 필요한 컬럼 선택

In [4]:
# Keep only the identifier, the title and the text fields used to describe a game.
# .copy() makes df an independent DataFrame instead of a slice of `data`, so
# columns can be added to it later without pandas' SettingWithCopyWarning.
df = data[['id', 'name', 'description', 'genre', 'tags', 'theme']].copy()
# Last expression of the cell: renders the first five rows as a preview.
df.head()

Unnamed: 0,id,name,description,genre,tags,theme
0,741344,Peace Angel,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,Shooter,No Tag,Themeless
1,374441,Brawl Planet,Eres un comandante al mando de la nave inteles...,Shooter,Singleplayer||Space||shotter,Themeless
2,97470,OBELUS - Arcade Boss Rush,"In OBELUS, a bold robot battles three gargantu...",Action||Platformer||Shooter,2D||Sci-fi||Pixel Graphics||Destruction||Monst...,Sci-Fi
3,306287,Pimple Popper Lite,"Hello, you! We know you're itching for some fi...",Action||Arcade||Casual,friends||fun||scratch,Themeless
4,176964,SQUARE SQUARE,Left/right arrows - moveUp - restartClick on t...,Action,2D||Score Attack||Black and White,Themeless


### 내용 기반 필터링을 위한 텍스트 컬럼 생성

In [5]:
# Build one free-text document per game: the description followed by the
# genre, tags and theme labels.
#
# * regex=False: the '||' separator must be matched literally. Interpreted as
#   a regular expression it is an empty alternation that matches between every
#   character, and the default of `regex` differs between pandas versions.
# * fillna(''): a missing value in any field would otherwise turn the whole
#   concatenation into NaN for that row.
# * df.assign(...) returns a new DataFrame, so the column is never written
#   onto a slice of another frame (no SettingWithCopyWarning).
df = df.assign(
    content=(
        df['description'].fillna('')
        + ' ' + df['genre'].fillna('').str.replace('||', ' ', regex=False)
        + ' ' + df['tags'].fillna('').str.replace('||', ' ', regex=False)
        + ' ' + df['theme'].fillna('').str.replace('||', ' ', regex=False)
    )
)
# Last expression of the cell: renders the first five rows as a preview.
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = df['description'] + ' ' + df['genre'].str.replace('||', ' ') + ' ' + df['tags'].str.replace('||', ' ') + ' ' + df['theme'].str.replace('||', ' ')


Unnamed: 0,id,name,description,genre,tags,theme,content
0,741344,Peace Angel,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...,Shooter,No Tag,Themeless,２０２０年度１年生特進クラス 中村 桃香さんの作品です。天使を操作し、悪魔から死者を守りつつ...
1,374441,Brawl Planet,Eres un comandante al mando de la nave inteles...,Shooter,Singleplayer||Space||shotter,Themeless,Eres un comandante al mando de la nave inteles...
2,97470,OBELUS - Arcade Boss Rush,"In OBELUS, a bold robot battles three gargantu...",Action||Platformer||Shooter,2D||Sci-fi||Pixel Graphics||Destruction||Monst...,Sci-Fi,"In OBELUS, a bold robot battles three gargantu..."
3,306287,Pimple Popper Lite,"Hello, you! We know you're itching for some fi...",Action||Arcade||Casual,friends||fun||scratch,Themeless,"Hello, you! We know you're itching for some fi..."
4,176964,SQUARE SQUARE,Left/right arrows - moveUp - restartClick on t...,Action,2D||Score Attack||Black and White,Themeless,Left/right arrows - moveUp - restartClick on t...


## 텍스트 벡터화

### TF-IDF 벡터라이저 초기화

In [6]:
# Create the TF-IDF vectorizer; scikit-learn's built-in English stop-word
# list is filtered out of the vocabulary.
tfidf = TfidfVectorizer(
    stop_words='english',
)
# Last expression of the cell: displays the configured estimator.
tfidf

### content 컬럼을 TF-IDF 행렬로 변환

In [7]:
# Fit the vectorizer on the combined text column and encode every game as a
# row of TF-IDF weights in a single pass.
tfidf_matrix = tfidf.fit_transform(raw_documents=df['content'])

# Check the size of the TF-IDF matrix: (number of games, vocabulary size).
print(tfidf_matrix.shape)

(529715, 996297)


### 차원 축소 기법

In [8]:
# Reduce dimensionality with truncated SVD: project the TF-IDF features onto
# 100 components.
svd = TruncatedSVD(n_components=100)
tfidf_matrix_reduced = svd.fit_transform(X=tfidf_matrix)

# Check the size of the reduced matrix: (number of games, n_components).
print(tfidf_matrix_reduced.shape)

(529715, 100)


## 유사도 계산

### 코사인 유사도 계산

In [9]:
class _LazyCosineSimilarity:
    """Row-on-demand stand-in for the full pairwise similarity matrix.

    ``linear_kernel(tfidf_matrix, tfidf_matrix)`` would allocate a dense
    n_games x n_games array; with the ~530k rows printed above that is several
    terabytes and cannot be held in memory. This wrapper keeps the same access
    pattern (``cosine_sim[idx]`` and ``cosine_sim.shape``) but computes a
    single row only when it is requested.

    Only integer row indexing is supported.
    """

    def __init__(self, matrix):
        # Feature matrix with one row per game (sparse or dense, 2-D).
        self._matrix = matrix
        n_rows = matrix.shape[0]
        # Shape the full similarity matrix would have had.
        self.shape = (n_rows, n_rows)

    def __len__(self):
        return self.shape[0]

    def __getitem__(self, row):
        """Return the 1-D array of similarities between ``row`` and every game."""
        # Indexing with a one-element list keeps the query two-dimensional
        # (1 x n_features) for both NumPy arrays and SciPy sparse matrices.
        query = self._matrix[[row]]
        # linear_kernel returns a dense (1, n_games) array; flatten it to 1-D.
        return linear_kernel(query, self._matrix).ravel()


# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
cosine_sim = _LazyCosineSimilarity(tfidf_matrix)

# Check the (logical) size of the similarity matrix.
print(cosine_sim.shape)

: 

## 추천 시스템 구축

### 게임 이름과 인덱스를 매핑하는 Series 생성

In [None]:
# Lookup table: game name -> row label in df.
indices = pd.Series(df.index, index=df['name'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it would not remove repeated names. De-duplicate on the
# index instead, keeping the first row for each name, so that indices[name]
# always yields a single scalar.
indices = indices[~indices.index.duplicated(keep='first')]

### 추천 함수 정의

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the names of the 10 games most similar to ``title``.

    Args:
        title: Game name to look up in the module-level ``indices`` Series.
        cosine_sim: Object indexable by row; ``cosine_sim[i]`` must yield the
            1-D sequence of similarity scores between game ``i`` and every
            game. Defaults to the module-level ``cosine_sim`` as it existed
            when this function was defined.

    Returns:
        A pandas Series with the names of up to 10 games, ordered from most
        to least similar. The queried game itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Row of the queried game. The value stored in `indices` is a df.index
    # label and is used below as a row position, so this assumes df carries
    # the default 0..n-1 index.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several games share this name; use the first occurrence.
        idx = idx.iloc[0]

    # Similarity of the queried game to every game, keyed by row position.
    scores = pd.Series(cosine_sim[idx])

    # Drop the queried game explicitly instead of assuming it sorts first:
    # another game with identical content ties with it at the top score.
    scores = scores.drop(idx)

    # Top-10 selection without sorting the whole score list; ties keep their
    # original row order.
    top_matches = scores.nlargest(10)

    # Translate the selected row positions back into game names.
    return df['name'].iloc[top_matches.index.tolist()]

### 특정 게임과 유사한 게임 추천

In [None]:
# Example query: print the games most similar to 'Brawl Planet'.
print(get_recommendations(title='Brawl Planet'))