In [1]:
# importação das bibliotecas necessárias
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the dataset from the CSV file into the working DataFrame
df = pd.read_csv(filepath_or_buffer="vgsales.csv")

In [3]:
# Data analysis and preparation: preview the first five rows
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# Print the column labels of the loaded DataFrame
print(df.columns)

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='object')


In [5]:
# 2. Create the 'Category' column from 'Year':
#    years strictly before 1990 are 'Retro', everything else is 'Moderno'.
#    A missing year compares as False and therefore falls under 'Moderno'.
df['Category'] = df['Year'].lt(1990).map({True: 'Retro', False: 'Moderno'})

In [6]:
# 3. Convert 'Year' to a yearly period (year only).
#    errors='coerce' turns values that cannot be parsed with '%Y' into NaT.
df['Year'] = (
    pd.to_datetime(df['Year'], errors='coerce', format='%Y')
    .dt.to_period(freq='Y')
)

In [7]:
def categorize_sales(sales, q25=None, q75=None, top_sales=80):
    """Label one Global_Sales value as 'Otimo', 'Boa', 'Ruim' or 'Pessima'.

    The rules are checked in this order:
      1. sales <= 0          -> 'Pessima' (zero or negative sales)
      2. sales >= top_sales  -> 'Otimo'   (fixed high-sales cut-off)
      3. sales >= q75        -> 'Boa'     (top 25%)
      4. sales >= q25        -> 'Ruim'    (middle 50%)
      5. anything else       -> 'Pessima' (bottom 25%)

    A NaN value fails every comparison and therefore ends up as 'Pessima'.

    Args:
        sales: A single sales figure, in the same unit as df['Global_Sales'].
        q25: Lower quartile threshold. When None it is computed from
            df['Global_Sales'] on this call.
        q75: Upper quartile threshold. When None it is computed from
            df['Global_Sales'] on this call.
        top_sales: Minimum value that is labelled 'Otimo' (default 80).

    Returns:
        str: One of 'Otimo', 'Boa', 'Ruim', 'Pessima'.
    """
    # Only non-positive values are rejected up front. Guarding with `sales < 1`
    # would label everything below 1 as 'Pessima' before the quantile rules run,
    # which makes the 'Ruim' branch unreachable whenever q75 is at or below 1.
    if sales <= 0:
        return 'Pessima'
    if sales >= top_sales:
        return 'Otimo'

    # Fall back to the notebook-level DataFrame only when the caller did not
    # supply thresholds; this keeps one-argument calls working.
    if q75 is None:
        q75 = df['Global_Sales'].quantile(0.75)
    if sales >= q75:
        return 'Boa'

    if q25 is None:
        q25 = df['Global_Sales'].quantile(0.25)
    if sales >= q25:
        return 'Ruim'

    return 'Pessima'

# ... (other code to clean and prepare your dataframe) ...

# Compute both quartiles once for the whole column instead of once per row,
# then hand them to categorize_sales through apply's keyword arguments.
sales_q25, sales_q75 = df['Global_Sales'].quantile([0.25, 0.75])
df['Sales_Category'] = df['Global_Sales'].apply(
    categorize_sales, q25=sales_q25, q75=sales_q75
)


In [8]:
# 5. Replace zero entries in the regional sales columns with 1.
#    0, 0.0 and 0.00 are the same number, so one replacement covers all spellings.
#    'Global_Sales' is not in the list and is left unchanged.
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df[sales_cols] = df[sales_cols].replace(0, 1)

In [9]:
# 6. Drop rows that have NaN in 'Year' or 'Publisher'.
#    inplace=True modifies df itself; no new DataFrame is returned.
df.dropna(subset=['Year', 'Publisher'], inplace=True)

In [10]:
# Display the first five rows of the updated DataFrame
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Category,Sales_Category
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,Moderno,Otimo
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,Retro,Boa
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,Moderno,Boa
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,Moderno,Boa
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,Moderno,Boa
