In [45]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# file path
# Set the VGSALES_CSV environment variable to point at the dataset when running
# this notebook on another machine; when it is unset, the original local path
# below is used, so existing runs behave exactly as before.
file_path = os.environ.get(
    'VGSALES_CSV',
    r'C:\Users\allib\OneDrive\Desktop\MS Data Science\ANA680\Week4\final project\vgsales.csv',
)

# load the CSV file
df = pd.read_csv(file_path)

# preview my data
print("First 5 rows of the dataset:")
print(df.head())


First 5 rows of the dataset:
   Rank                      Name Platform    Year         Genre Publisher  \
0     1                Wii Sports      Wii  2006.0        Sports  Nintendo   
1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo   
2     3            Mario Kart Wii      Wii  2008.0        Racing  Nintendo   
3     4         Wii Sports Resort      Wii  2009.0        Sports  Nintendo   
4     5  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  
0     41.49     29.02      3.77         8.46         82.74  
1     29.08      3.58      6.81         0.77         40.24  
2     15.85     12.88      3.79         3.31         35.82  
3     15.75     11.01      3.28         2.96         33.00  
4     11.27      8.89     10.22         1.00         31.37  


In [47]:
# basic data exploration
print("\nDataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# summary statistics of the numeric columns
print("\nDescriptive statistics:")
print(df.describe())

# how many values are missing in each column
print("\nMissing values per column:")
print(df.isnull().sum())

# how many distinct values each column holds
print("\nNumber of unique values per column:")
print(df.nunique())



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB
None

Descriptive statistics:
               Rank          Year      NA_Sales      EU_Sales      JP_Sales  \
count  16598.000000  16327.000000  16598.000000  16598.000000  16598.000000   
mean    8300.605254   2006.406443      0.264667      0.146652      

In [49]:
# drop every row that has a missing value in any of the columns used below
# (the four feature columns and the three regional sales columns)
df = df.dropna(
    axis='index',
    how='any',
    subset=['Platform', 'Genre', 'Publisher', 'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales'],
)


In [51]:
# helper used to build the target variable 'Top_Region'
def get_top_region(row):
    """Return the name of the regional sales column with the largest value.

    `row` is anything indexable by the keys 'NA_Sales', 'EU_Sales' and
    'JP_Sales'. On ties the earliest column in that order wins.
    """
    best_region = 'NA_Sales'
    for region in ('EU_Sales', 'JP_Sales'):
        # strictly greater only, so an equal later value never displaces the leader
        if row[region] > row[best_region]:
            best_region = region
    return best_region

# Label each row with the regional sales column holding its largest value.
# idxmax(axis=1) is the vectorised equivalent of applying get_top_region row by
# row: it returns the column label of the row maximum and, on ties, the first
# column in the order listed here (NA, then EU, then JP).
# Rows with missing values in these columns are expected to have been dropped already.
df['Top_Region'] = df[['NA_Sales', 'EU_Sales', 'JP_Sales']].idxmax(axis=1)


In [53]:
# remove all sales figures so they cannot be used as predictors
# (the target 'Top_Region' was derived from them)
df = df.drop(
    ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
    axis='columns',
)


In [55]:
# split the frame into the predictor columns (X) and the label column (y)
X = df.loc[:, ['Platform', 'Genre', 'Publisher', 'Year']]
y = df.loc[:, 'Top_Region']


In [57]:
# expand the three categorical predictors into indicator columns;
# 'Year' is not listed and is passed through as-is
X_encoded = pd.get_dummies(
    data=X,
    columns=['Platform', 'Genre', 'Publisher'],
)

# write the resulting column names, one per line with no header or index,
# to model_columns.csv for Flask input mapping
pd.Series(X_encoded.columns).to_csv('model_columns.csv', header=False, index=False)



In [59]:
# encode target labels
# LabelEncoder maps each distinct label in y to an integer class id.
# The fitted encoder is kept in `le`: its classes_ attribute supplies the label
# names for the classification report, and the encoder is saved with the model.
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [61]:
# hold out 30% of the rows for validation; the split is stratified on the
# encoded target and uses a fixed seed so it is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded,
    y_encoded,
    stratify=y_encoded,
    random_state=42,
    test_size=0.3,
)


In [63]:
# fit a 100-tree random forest on the training split (seeded for repeatability)
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X=X_train, y=y_train)


In [65]:
# score the validation split: overall accuracy first, then the per-class
# report labelled with the original region names from the label encoder
y_val_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_true=y_val, y_pred=y_val_pred))
print(
    "\nClassification Report:",
    classification_report(y_val, y_val_pred, target_names=le.classes_),
    sep="\n",
)


Validation Accuracy: 0.7876432078559739

Classification Report:
              precision    recall  f1-score   support

    EU_Sales       0.67      0.55      0.60       705
    JP_Sales       0.76      0.72      0.74      1197
    NA_Sales       0.82      0.87      0.84      2986

    accuracy                           0.79      4888
   macro avg       0.75      0.71      0.73      4888
weighted avg       0.78      0.79      0.78      4888



In [67]:
# persist the trained classifier and the fitted label encoder to the
# current working directory
joblib.dump(value=model, filename='vgsales_model.pkl')
joblib.dump(value=le, filename='label_encoder.pkl')

['label_encoder.pkl']