In [11]:
import pandas as pd
import re
from tqdm import tqdm

import warnings

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
csv_path = '../data/final_igdb_pc.csv'
data = pd.read_csv(csv_path, index_col=0)
data

Unnamed: 0,id,name,genres,themes,keywords,similar_games
0,10553,Monster Truck Destruction,racing,action,,"[57296, 80382, 82090, 87622, 100800, 109292, 1..."
1,9323,The Guild 2: Pirates of the European Seas,role-playing-rpg simulator strategy,historical sandbox,pirates medieval management gamersgate naval-w...,"[18623, 23345, 26574, 31515, 35371, 36346, 365..."
2,30417,Steel Invaders,indie,action,,"[32902, 37419, 40524, 76263, 96217, 105233, 10..."
3,9236,Top Gun: Combat Zones,simulator,warfare,combat-flight-simulator flight-simulator fligh...,"[14758, 22621, 26431, 28562, 31480, 37289, 467..."
4,17790,The Treasures of Montezuma 3,puzzle adventure indie arcade,,steam steam-trading-cards digital-distribution...,"[25222, 25646, 26223, 27266, 55173, 55190, 560..."
...,...,...,...,...,...,...
18932,32495,Fantasy Kingdom Simulator,simulator strategy indie,,,"[17130, 17519, 31515, 33603, 33646, 36269, 515..."
18933,3165,Prince of Persia 3D,adventure,action stealth,parkour death mythology action-adventure egypt...,"[836, 10776, 26950, 28168, 28309, 30245, 36198..."
18934,100600,Nepenthe,role-playing-rpg adventure indie,action horror comedy,weird surreal hand-drawn,"[25311, 25646, 28309, 35994, 80916, 96217, 105..."
18935,48133,Mario's Time Machine,puzzle strategy,educational,retroachievements,"[236, 27792, 87056, 95776, 109129, 206719, 242..."


In [13]:
# similar_games 리스트로 바꾸기
from ast import literal_eval

# 문자열을 리스트로 변환
def str_to_list(x):
    try:
        if type(x) == str:
            return literal_eval(x)
        elif type(x) == list:
            return x
    except: #해당 값이 null값이거나 오류가 있을 때, None을 return 하기
        return None

data['similar_games'] = data['similar_games'].apply(lambda x: str_to_list(x))

In [14]:
# 결측치 공백으로 채우기
data['genres'] = data['genres'].fillna('')
data['themes'] = data['themes'].fillna('')
data['keywords'] = data['keywords'].fillna('')
data.head()

Unnamed: 0,id,name,genres,themes,keywords,similar_games
0,10553,Monster Truck Destruction,racing,action,,"[57296, 80382, 82090, 87622, 100800, 109292, 1..."
1,9323,The Guild 2: Pirates of the European Seas,role-playing-rpg simulator strategy,historical sandbox,pirates medieval management gamersgate naval-w...,"[18623, 23345, 26574, 31515, 35371, 36346, 365..."
2,30417,Steel Invaders,indie,action,,"[32902, 37419, 40524, 76263, 96217, 105233, 10..."
3,9236,Top Gun: Combat Zones,simulator,warfare,combat-flight-simulator flight-simulator fligh...,"[14758, 22621, 26431, 28562, 31480, 37289, 467..."
4,17790,The Treasures of Montezuma 3,puzzle adventure indie arcade,,steam steam-trading-cards digital-distribution...,"[25222, 25646, 26223, 27266, 55173, 55190, 560..."


### 새로운 컬럼 생성하기 - 병합

In [15]:
# 1. genres + themes 
data['genres_themes'] = data['genres'] + " " + data['themes']

# 2. genres + keywords
data['genres_keywords'] = data['genres'] + " " + data['keywords']

# 3. themes + keywords
data['themes_keywords'] = data['themes'] + " " + data['keywords']

# 4. themes + keywords
data['genres_themes_keywords'] = data['genres'] + " " + data['themes'] + " " + data['keywords']

data.head()

Unnamed: 0,id,name,genres,themes,keywords,similar_games,genres_themes,genres_keywords,themes_keywords,genres_themes_keywords
0,10553,Monster Truck Destruction,racing,action,,"[57296, 80382, 82090, 87622, 100800, 109292, 1...",racing action,racing,action,racing action
1,9323,The Guild 2: Pirates of the European Seas,role-playing-rpg simulator strategy,historical sandbox,pirates medieval management gamersgate naval-w...,"[18623, 23345, 26574, 31515, 35371, 36346, 365...",role-playing-rpg simulator strategy historical...,role-playing-rpg simulator strategy pirates me...,historical sandbox pirates medieval management...,role-playing-rpg simulator strategy historical...
2,30417,Steel Invaders,indie,action,,"[32902, 37419, 40524, 76263, 96217, 105233, 10...",indie action,indie,action,indie action
3,9236,Top Gun: Combat Zones,simulator,warfare,combat-flight-simulator flight-simulator fligh...,"[14758, 22621, 26431, 28562, 31480, 37289, 467...",simulator warfare,simulator combat-flight-simulator flight-simul...,warfare combat-flight-simulator flight-simulat...,simulator warfare combat-flight-simulator flig...
4,17790,The Treasures of Montezuma 3,puzzle adventure indie arcade,,steam steam-trading-cards digital-distribution...,"[25222, 25646, 26223, 27266, 55173, 55190, 560...",puzzle adventure indie arcade,puzzle adventure indie arcade steam steam-trad...,steam steam-trading-cards digital-distributio...,puzzle adventure indie arcade steam steam-tra...


### 벡터화

In [16]:
cv = CountVectorizer()
cv_matrix_gt = cv.fit_transform(data['genres_themes'])
cv_matrix_gk = cv.fit_transform(data['genres_keywords'])
cv_matrix_tk = cv.fit_transform(data['themes_keywords'])
cv_matrix_gtk = cv.fit_transform(data['genres_themes_keywords'])

cv_matrix_gt.shape, cv_matrix_gk.shape, cv_matrix_tk.shape, cv_matrix_gtk.shape,

((18937, 67), (18937, 2952), (18937, 2955), (18937, 2962))

In [17]:
tv = TfidfVectorizer()
tv_matrix_gt = tv.fit_transform(data['genres_themes'])
tv_matrix_gk = tv.fit_transform(data['genres_keywords'])
tv_matrix_tk = tv.fit_transform(data['themes_keywords'])
tv_matrix_gtk = tv.fit_transform(data['genres_themes_keywords'])

tv_matrix_gt.shape,tv_matrix_gk.shape, tv_matrix_tk.shape, tv_matrix_gtk.shape,

((18937, 67), (18937, 2952), (18937, 2955), (18937, 2962))

### 추천 성능 확인하기

In [18]:
# 점수 저장 데이터 프레임 생성
score_df = data[['name','id']]
score_df

Unnamed: 0,name,id
0,Monster Truck Destruction,10553
1,The Guild 2: Pirates of the European Seas,9323
2,Steel Invaders,30417
3,Top Gun: Combat Zones,9236
4,The Treasures of Montezuma 3,17790
...,...,...
18932,Fantasy Kingdom Simulator,32495
18933,Prince of Persia 3D,3165
18934,Nepenthe,100600
18935,Mario's Time Machine,48133


In [19]:
# 게임별 코사인 유사도 계산 함수
def game_cos_sim(matrix, game_id):
  input_vector = matrix[game_id]
  cos_sim = cosine_similarity(input_vector, matrix)
  return cos_sim

# 게임별 추천 게임 리스트 생성 함수
def reommendation(cos_sim):
  pred_sim_games = list(enumerate(cos_sim[0]))
  sorted_pred_sim_games = sorted(pred_sim_games,key=lambda x:x[1],reverse=True)[1:]

  i=0
  reommendation_list = []

  for item in sorted_pred_sim_games:
    recommend_game_id = data[data.index==item[0]]['id'].values[0]
    reommendation_list.append(recommend_game_id)
    i=i+1
    if i==10:
        break

  return reommendation_list

# precision@10 계산 함수
def count_precision(recommendation_list):
    matchging_games = set(recommendation_list) & set(og_sim_games)
    count = len(matchging_games)
    return count

In [20]:
cv_gt_score_list = []
cv_gk_score_list = []
cv_tk_score_list = []
cv_gtk_score_list = []

tv_gt_score_list = []
tv_gk_score_list = []
tv_tk_score_list = []
tv_gtk_score_list = []


for i in tqdm(range(len(data))):
    og_sim_games = data[data.index == i].similar_games.values[0]
    
    #CountVectorizer
    # genres + themes 
    cv_gt_cos_sim = game_cos_sim(cv_matrix_gt, i)
    cv_gt_recommendation = reommendation(cv_gt_cos_sim)
    num_cv_gt = count_precision(cv_gt_recommendation)
    cv_gt_score_list.append(num_cv_gt)
    
    # genres + keywords
    cv_gk_cos_sim = game_cos_sim(cv_matrix_gk, i)
    cv_gk_recommendation = reommendation(cv_gk_cos_sim)
    num_cv_gk = count_precision(cv_gk_recommendation)
    cv_gk_score_list.append(num_cv_gk)
    
    # themes + keywords
    cv_tk_cos_sim = game_cos_sim(cv_matrix_tk, i)
    cv_tk_recommendation = reommendation(cv_tk_cos_sim)
    num_cv_tk = count_precision(cv_tk_recommendation)
    cv_tk_score_list.append(num_cv_tk)
    
    # genres + themes + keywords
    cv_gtk_cos_sim = game_cos_sim(cv_matrix_gtk, i)
    cv_gtk_recommendation = reommendation(cv_gtk_cos_sim)
    num_cv_gtk = count_precision(cv_gtk_recommendation)
    cv_gtk_score_list.append(num_cv_gtk)
    
    #TfidfVectorizer
    # genres + themes 
    tv_gt_cos_sim = game_cos_sim(tv_matrix_gt, i)
    tv_gt_recommendation = reommendation(tv_gt_cos_sim)
    num_tv_gt = count_precision(tv_gt_recommendation)
    tv_gt_score_list.append(num_tv_gt)
    
    # genres + keywords
    tv_gk_cos_sim = game_cos_sim(tv_matrix_gk, i)
    tv_gk_recommendation = reommendation(tv_gk_cos_sim)
    num_tv_gk = count_precision(tv_gk_recommendation)
    tv_gk_score_list.append(num_tv_gk)
    
    # themes + keywords
    tv_tk_cos_sim = game_cos_sim(cv_matrix_tk, i)
    tv_tk_recommendation = reommendation(cv_tk_cos_sim)
    num_tv_tk = count_precision(cv_tk_recommendation)
    tv_tk_score_list.append(num_cv_tk)
    
    # genres + themes + keywords
    tv_gtk_cos_sim = game_cos_sim(tv_matrix_gtk, i)
    tv_gtk_recommendation = reommendation(tv_gtk_cos_sim)
    num_tv_gtk = count_precision(tv_gtk_recommendation)
    tv_gtk_score_list.append(num_tv_gtk)

100%|██████████| 18937/18937 [46:32<00:00,  6.78it/s]


In [21]:
score_df['cv_gt'] = cv_gt_score_list
score_df['cv_gk'] = cv_gk_score_list
score_df['cv_tk'] = cv_tk_score_list
score_df['cv_gtk'] = cv_gtk_score_list

score_df['tv_gt'] = tv_gt_score_list
score_df['tv_gk'] = tv_gk_score_list
score_df['tv_tk'] = tv_tk_score_list
score_df['tv_gtk'] = tv_gtk_score_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['cv_gt'] = cv_gt_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['cv_gk'] = cv_gk_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['cv_tk'] = cv_tk_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

In [22]:
score_df.describe()

Unnamed: 0,id,cv_gt,cv_gk,cv_tk,cv_gtk,tv_gt,tv_gk,tv_tk,tv_gtk
count,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0
mean,48960.159318,0.206263,0.362412,0.328933,0.429371,0.194751,0.356656,0.328933,0.407879
std,46605.463664,0.49737,0.81991,0.795533,0.866019,0.485954,0.821434,0.795533,0.852862
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31692.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,81180.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,294661.0,6.0,8.0,7.0,7.0,5.0,9.0,7.0,9.0


### keywords 데이터 정제하기

In [14]:
# 1. keywords에서 숫자 없애기
data['keywords_no_num'] = data['keywords'].apply(lambda x: re.sub(r'\d+', '', x))

In [15]:
# 2. keywords에서 연도 없애기
def remove_years_except_decades(keywords):
  """
  Removes years from the keywords string, except for those in the decades format (e.g., 1990s).

  Args:
    keywords: A string containing keywords.

  Returns:
    A string with years removed, except for those in the decades format.
  """

  # Define the regular expression to match years
  year_pattern = r"\b(1899|19\d{2}|20\d{2})\b"

  # Define the regular expression to match decades
  decade_pattern = r"\b(\d{3})s\b"

  # Remove years that are not in the decades format
  keywords_without_years = re.sub(year_pattern, "", keywords)

  # Keep years in the decades format
  keywords_with_decades = re.sub(decade_pattern, r"\1", keywords_without_years)

  return keywords_with_decades

data["keywords_no_year"] = data["keywords"].apply(remove_years_except_decades)

In [16]:
# 3. keywords에서 연도, 연대 없애기
def remove_decades(keywords):
  """
  Removes the specified keywords from the keywords column.

  Args:
    keywords: A string containing the keywords to remove.

  Returns:
    A string with the specified keywords removed.
  """

  for keyword in ['1990s', '1910s', '1920s', '1930s', '1940s', '1950s', '1960s', '1970s',
                   '1980s', '1990s', '2000s']:
    keywords = keywords.replace(keyword, '')

  return keywords

data['keywords_no_decade'] = data['keywords_no_year'].apply(remove_decades)

In [17]:
data

Unnamed: 0,id,name,genres,themes,keywords,similar_games,genres_themes,genres_keywords,themes_keywords,genres_themes_keywords,keywords_no_num,keywords_no_year,keywords_no_decade
0,10553,Monster Truck Destruction,racing,action,,"[57296, 80382, 82090, 87622, 100800, 109292, 1...",racing action,racing,action,racing action,,,
1,9323,The Guild 2: Pirates of the European Seas,role-playing-rpg simulator strategy,historical sandbox,pirates medieval management gamersgate naval-w...,"[18623, 23345, 26574, 31515, 35371, 36346, 365...",role-playing-rpg simulator strategy historical...,role-playing-rpg simulator strategy pirates me...,historical sandbox pirates medieval management...,role-playing-rpg simulator strategy historical...,pirates medieval management gamersgate naval-w...,pirates medieval management gamersgate naval-w...,pirates medieval management gamersgate naval-w...
2,30417,Steel Invaders,indie,action,,"[32902, 37419, 40524, 76263, 96217, 105233, 10...",indie action,indie,action,indie action,,,
3,9236,Top Gun: Combat Zones,simulator,warfare,combat-flight-simulator flight-simulator fligh...,"[14758, 22621, 26431, 28562, 31480, 37289, 467...",simulator warfare,simulator combat-flight-simulator flight-simul...,warfare combat-flight-simulator flight-simulat...,simulator warfare combat-flight-simulator flig...,combat-flight-simulator flight-simulator fligh...,combat-flight-simulator flight-simulator fligh...,combat-flight-simulator flight-simulator fligh...
4,17790,The Treasures of Montezuma 3,puzzle adventure indie arcade,,steam steam-trading-cards digital-distribution...,"[25222, 25646, 26223, 27266, 55173, 55190, 560...",puzzle adventure indie arcade,puzzle adventure indie arcade steam steam-trad...,steam steam-trading-cards digital-distributio...,puzzle adventure indie arcade steam steam-tra...,steam steam-trading-cards digital-distribution...,steam steam-trading-cards digital-distribution...,steam steam-trading-cards digital-distribution...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18932,32495,Fantasy Kingdom Simulator,simulator strategy indie,,,"[17130, 17519, 31515, 33603, 33646, 36269, 515...",simulator strategy indie,simulator strategy indie,,simulator strategy indie,,,
18933,3165,Prince of Persia 3D,adventure,action stealth,parkour death mythology action-adventure egypt...,"[836, 10776, 26950, 28168, 28309, 30245, 36198...",adventure action stealth,adventure parkour death mythology action-adven...,action stealth parkour death mythology action-...,adventure action stealth parkour death mytholo...,parkour death mythology action-adventure egypt...,parkour death mythology action-adventure egypt...,parkour death mythology action-adventure egypt...
18934,100600,Nepenthe,role-playing-rpg adventure indie,action horror comedy,weird surreal hand-drawn,"[25311, 25646, 28309, 35994, 80916, 96217, 105...",role-playing-rpg adventure indie action horror...,role-playing-rpg adventure indie weird surreal...,action horror comedy weird surreal hand-drawn,role-playing-rpg adventure indie action horror...,weird surreal hand-drawn,weird surreal hand-drawn,weird surreal hand-drawn
18935,48133,Mario's Time Machine,puzzle strategy,educational,retroachievements,"[236, 27792, 87056, 95776, 109129, 206719, 242...",puzzle strategy educational,puzzle strategy retroachievements,educational retroachievements,puzzle strategy educational retroachievements,retroachievements,retroachievements,retroachievements


In [18]:
# 1. genres + themes + keywords_no_num
data['genres_themes_keywords_no_num'] = data['genres'] + " " + data['themes'] + " " + data['keywords_no_num']

# 2. genres + themes + keywords_no_year
data['genres_themes_keywords_no_year'] = data['genres'] + " " + data['themes'] + " " + data['keywords_no_year']

# 3. genres + themes + keywords_no_decade
data['genres_themes_keywords_no_decade'] = data['genres'] + " " + data['themes'] + " " + data['keywords_no_decade']

data.head()

Unnamed: 0,id,name,genres,themes,keywords,similar_games,genres_themes,genres_keywords,themes_keywords,genres_themes_keywords,keywords_no_num,keywords_no_year,keywords_no_decade,genres_themes_keywords_no_num,genres_themes_keywords_no_year,genres_themes_keywords_no_decade
0,10553,Monster Truck Destruction,racing,action,,"[57296, 80382, 82090, 87622, 100800, 109292, 1...",racing action,racing,action,racing action,,,,racing action,racing action,racing action
1,9323,The Guild 2: Pirates of the European Seas,role-playing-rpg simulator strategy,historical sandbox,pirates medieval management gamersgate naval-w...,"[18623, 23345, 26574, 31515, 35371, 36346, 365...",role-playing-rpg simulator strategy historical...,role-playing-rpg simulator strategy pirates me...,historical sandbox pirates medieval management...,role-playing-rpg simulator strategy historical...,pirates medieval management gamersgate naval-w...,pirates medieval management gamersgate naval-w...,pirates medieval management gamersgate naval-w...,role-playing-rpg simulator strategy historical...,role-playing-rpg simulator strategy historical...,role-playing-rpg simulator strategy historical...
2,30417,Steel Invaders,indie,action,,"[32902, 37419, 40524, 76263, 96217, 105233, 10...",indie action,indie,action,indie action,,,,indie action,indie action,indie action
3,9236,Top Gun: Combat Zones,simulator,warfare,combat-flight-simulator flight-simulator fligh...,"[14758, 22621, 26431, 28562, 31480, 37289, 467...",simulator warfare,simulator combat-flight-simulator flight-simul...,warfare combat-flight-simulator flight-simulat...,simulator warfare combat-flight-simulator flig...,combat-flight-simulator flight-simulator fligh...,combat-flight-simulator flight-simulator fligh...,combat-flight-simulator flight-simulator fligh...,simulator warfare combat-flight-simulator flig...,simulator warfare combat-flight-simulator flig...,simulator warfare combat-flight-simulator flig...
4,17790,The Treasures of Montezuma 3,puzzle adventure indie arcade,,steam steam-trading-cards digital-distribution...,"[25222, 25646, 26223, 27266, 55173, 55190, 560...",puzzle adventure indie arcade,puzzle adventure indie arcade steam steam-trad...,steam steam-trading-cards digital-distributio...,puzzle adventure indie arcade steam steam-tra...,steam steam-trading-cards digital-distribution...,steam steam-trading-cards digital-distribution...,steam steam-trading-cards digital-distribution...,puzzle adventure indie arcade steam steam-tra...,puzzle adventure indie arcade steam steam-tra...,puzzle adventure indie arcade steam steam-tra...


### 벡터화

In [19]:
cv_matrix_gtkn = cv.fit_transform(data['genres_themes_keywords_no_num'])
cv_matrix_gtky = cv.fit_transform(data['genres_themes_keywords_no_year'])
cv_matrix_gtkd = cv.fit_transform(data['genres_themes_keywords_no_decade'])

cv_matrix_gtkn.shape, cv_matrix_gtky.shape, cv_matrix_gtkd.shape

((18937, 2892), (18937, 2933), (18937, 2923))

In [20]:
cv_gtkn_score_list = []
cv_gtky_score_list = []
cv_gtkd_score_list = []

for i in tqdm(range(len(data))):
    og_sim_games = data[data.index == i].similar_games.values[0]
    
    # genres + themes + keywords_no_num
    cv_gtkn_cos_sim = game_cos_sim(cv_matrix_gtkn, i)
    cv_gtkn_recommendation = reommendation(cv_gtkn_cos_sim)
    num_cv_gtkn = count_precision(cv_gtkn_recommendation)
    cv_gtkn_score_list.append(num_cv_gtkn)

    # genres + themes + keywords_no_year
    cv_gtky_cos_sim = game_cos_sim(cv_matrix_gtky, i)
    cv_gtky_recommendation = reommendation(cv_gtky_cos_sim)
    num_cv_gtky = count_precision(cv_gtky_recommendation)
    cv_gtky_score_list.append(num_cv_gtky)
    
    # genres + themes + keywords_no_decade
    cv_gtkd_cos_sim = game_cos_sim(cv_matrix_gtkd, i)
    cv_gtkd_recommendation = reommendation(cv_gtkd_cos_sim)
    num_cv_gtkd = count_precision(cv_gtkd_recommendation)
    cv_gtkd_score_list.append(num_cv_gtkd)

100%|██████████| 18937/18937 [22:16<00:00, 14.17it/s]


In [21]:
score_df['cv_gtkn'] = cv_gtkn_score_list
score_df['cv_gtky'] = cv_gtky_score_list
score_df['cv_gtkd'] = cv_gtkd_score_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['cv_gtkn'] = cv_gtkn_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['cv_gtky'] = cv_gtky_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['cv_gtkd'] = cv_gtkd_score_list


In [27]:
score_df

Unnamed: 0,name,id,cv_gt,cv_gk,cv_tk,cv_gtk,tv_gt,tv_gk,tv_tk,tv_gtk,cv_gtkn,cv_gtky,cv_gtkd
0,Monster Truck Destruction,10553,1,1,0,1,1,1,0,1,1,1,1
1,The Guild 2: Pirates of the European Seas,9323,0,0,0,0,0,0,0,0,0,0,0
2,Steel Invaders,30417,0,0,0,0,0,0,0,0,0,0,0
3,Top Gun: Combat Zones,9236,0,0,0,0,0,0,0,0,0,0,0
4,The Treasures of Montezuma 3,17790,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18932,Fantasy Kingdom Simulator,32495,1,1,0,1,1,1,0,1,1,1,1
18933,Prince of Persia 3D,3165,0,0,0,0,0,1,0,1,0,0,0
18934,Nepenthe,100600,0,0,0,0,0,0,0,0,0,0,0
18935,Mario's Time Machine,48133,0,0,0,0,0,0,0,0,0,0,0


In [26]:
score_df.describe()

Unnamed: 0,id,cv_gt,cv_gk,cv_tk,cv_gtk,tv_gt,tv_gk,tv_tk,tv_gtk,cv_gtkn,cv_gtky,cv_gtkd
count,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0,18937.0
mean,48960.159318,0.206263,0.362412,0.328933,0.429371,0.194751,0.356656,0.328933,0.407879,0.427312,0.430427,0.430058
std,46605.463664,0.49737,0.81991,0.795533,0.866019,0.485954,0.821434,0.795533,0.852862,0.865726,0.869634,0.869604
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31692.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,81180.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
max,294661.0,6.0,8.0,7.0,7.0,5.0,9.0,7.0,9.0,8.0,8.0,8.0


In [28]:
score_df.to_csv("../data/igdb_pc_score.csv")