In [56]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import metrics
import pickle

In [3]:
gaming_df = pd.read_csv('online_gaming_behavior_dataset.csv')
gaming_df.head()

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium


# Data Exploration and Cleaning

In [4]:
gaming_df.shape

(40034, 13)

In [5]:
gaming_df.describe()

Unnamed: 0,PlayerID,Age,PlayTimeHours,InGamePurchases,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked
count,40034.0,40034.0,40034.0,40034.0,40034.0,40034.0,40034.0,40034.0
mean,29016.5,31.992531,12.024365,0.200854,9.471774,94.792252,49.655568,24.526477
std,11556.964675,10.043227,6.914638,0.400644,5.763667,49.011375,28.588379,14.430726
min,9000.0,15.0,0.000115,0.0,0.0,10.0,1.0,0.0
25%,19008.25,23.0,6.067501,0.0,4.0,52.0,25.0,12.0
50%,29016.5,32.0,12.008002,0.0,9.0,95.0,49.0,25.0
75%,39024.75,41.0,17.963831,0.0,14.0,137.0,74.0,37.0
max,49033.0,49.0,23.999592,1.0,19.0,179.0,99.0,49.0


In [6]:
gaming_df['Location'].unique()

array(['Other', 'USA', 'Europe', 'Asia'], dtype=object)

In [7]:
gaming_df['GameGenre'].unique()

array(['Strategy', 'Sports', 'Action', 'RPG', 'Simulation'], dtype=object)

In [8]:
gaming_df['GameDifficulty'].unique()

array(['Medium', 'Easy', 'Hard'], dtype=object)

In [9]:
gaming_df['EngagementLevel'].unique()

array(['Medium', 'High', 'Low'], dtype=object)

In [10]:
gaming_df['Location'].replace({'USA': 1, 'Europe': 2, 'Asia': 3, 'Other': 4}, inplace = True)

In [11]:
gaming_df['GameGenre'].replace({'Strategy': 1, 'Sports': 2, 'Action': 3, 'RPG': 4, 'Simulation': 5}, inplace = True)

In [12]:
gaming_df['GameDifficulty'].replace({'Easy': 1, 'Medium': 2, 'Hard': 3}, inplace = True)

In [13]:
gaming_df['EngagementLevel'].replace({'Low': 1, 'Medium': 2, 'High': 3}, inplace = True)

In [14]:
gaming_df['Gender'].replace({'Male': 0, 'Female': 1}, inplace = True)

In [15]:
gaming_df.head()

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,0,4,1,16.271119,0,2,6,108,79,25,2
1,9001,29,1,1,1,5.525961,0,2,5,144,11,10,2
2,9002,22,1,1,2,8.223755,0,1,16,142,35,41,3
3,9003,35,0,1,3,5.265351,1,1,9,85,57,47,2
4,9004,33,0,2,3,15.531945,0,2,2,131,95,37,2


In [16]:
gaming_df.isnull().sum()

Unnamed: 0,0
PlayerID,0
Age,0
Gender,0
Location,0
GameGenre,0
PlayTimeHours,0
InGamePurchases,0
GameDifficulty,0
SessionsPerWeek,0
AvgSessionDurationMinutes,0


In [17]:
gaming_df = gaming_df.drop(columns = ['PlayerID'])

# Feature Selection

In [18]:
gaming_df.corr()

Unnamed: 0,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
Age,1.0,0.002075,-0.003389,0.002217,0.002462,-0.000186,-0.00276,0.008777,-0.002269,0.001353,-0.0011,0.000824
Gender,0.002075,1.0,-0.002909,0.001076,-0.006514,-0.006198,-0.001878,0.006491,0.003175,-0.006645,-0.003772,0.004978
Location,-0.003389,-0.002909,1.0,0.007208,0.007499,0.000564,-0.001683,0.000185,-0.002459,0.003697,-0.00529,-0.000662
GameGenre,0.002217,0.001076,0.007208,1.0,-0.006685,-0.00684,0.004431,-0.00721,-0.008959,-0.008736,0.000672,-0.006592
PlayTimeHours,0.002462,-0.006514,0.007499,-0.006685,1.0,-0.006067,0.001636,-0.003655,-0.001925,-0.005152,0.003913,-0.001849
InGamePurchases,-0.000186,-0.006198,0.000564,-0.00684,-0.006067,1.0,0.00141,0.005132,-0.003059,0.006524,9.8e-05,0.008209
GameDifficulty,-0.00276,-0.001878,-0.001683,0.004431,0.001636,0.00141,1.0,0.005058,0.002374,0.006059,-0.006244,0.005057
SessionsPerWeek,0.008777,0.006491,0.000185,-0.00721,-0.003655,0.005132,0.005058,1.0,-0.00062,0.003257,0.003187,0.605996
AvgSessionDurationMinutes,-0.002269,0.003175,-0.002459,-0.008959,-0.001925,-0.003059,0.002374,-0.00062,1.0,0.001368,-0.002227,0.476698
PlayerLevel,0.001353,-0.006645,0.003697,-0.008736,-0.005152,0.006524,0.006059,0.003257,0.001368,1.0,0.006343,0.059315


In [19]:
gaming_df = gaming_df.drop(columns = ['Age', 'Gender', 'Location', 'GameGenre', 'PlayTimeHours', 'InGamePurchases', 'GameDifficulty'])
gaming_df.head()

Unnamed: 0,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,6,108,79,25,2
1,5,144,11,10,2
2,16,142,35,41,3
3,9,85,57,47,2
4,2,131,95,37,2


In [20]:
X = gaming_df.drop(columns = ['EngagementLevel'])
y = gaming_df['EngagementLevel']

In [21]:
X.shape

(40034, 4)

In [22]:
y.shape

(40034,)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Model Building

In [24]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors = 5)
KNN.fit(X_train, y_train)

In [25]:
y_pred = KNN.predict(X_test)

In [26]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[1554  442   60]
 [ 234 3440  151]
 [  67  281 1778]]


In [27]:
print(classification_report(y_test, y_pred, target_names = ['low', 'medium', 'high']))

              precision    recall  f1-score   support

         low       0.84      0.76      0.79      2056
      medium       0.83      0.90      0.86      3825
        high       0.89      0.84      0.86      2126

    accuracy                           0.85      8007
   macro avg       0.85      0.83      0.84      8007
weighted avg       0.85      0.85      0.84      8007



In [33]:
score = KNN.score(X_test, y_test)
print(score)

0.8457599600349694


In [28]:
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(X_train, y_train)

In [36]:
y_pred = NB.predict(X_test)

In [37]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[1399  602   55]
 [ 101 3673   51]
 [  45  390 1691]]


In [38]:
print(classification_report(y_test, y_pred, target_names = ['low', 'medium', 'high']))

              precision    recall  f1-score   support

         low       0.91      0.68      0.78      2056
      medium       0.79      0.96      0.87      3825
        high       0.94      0.80      0.86      2126

    accuracy                           0.84      8007
   macro avg       0.88      0.81      0.83      8007
weighted avg       0.86      0.84      0.84      8007



In [39]:
score = NB.score(X_test, y_test)
print(score)

0.8446359435493943


In [40]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [41]:
y_pred = linreg.predict(X_test)

In [43]:
score = linreg.score(X_test, y_test)
print(score)

0.6056882718393483


In [44]:
from sklearn.tree import DecisionTreeClassifier

DTC = DecisionTreeClassifier(criterion = 'gini', random_state = 0)
DTC.fit(X_train, y_train)

In [45]:
y_pred = DTC.predict(X_test)

In [46]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[1662  259  135]
 [ 238 3357  230]
 [ 122  251 1753]]


In [47]:
print(classification_report(y_test, y_pred, target_names = ['low', 'medium', 'high']))

              precision    recall  f1-score   support

         low       0.82      0.81      0.82      2056
      medium       0.87      0.88      0.87      3825
        high       0.83      0.82      0.83      2126

    accuracy                           0.85      8007
   macro avg       0.84      0.84      0.84      8007
weighted avg       0.85      0.85      0.85      8007



In [48]:
score = DTC.score(X_test, y_test)
print(score)

0.8457599600349694


In [49]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier (max_depth = 10, random_state= 0, n_estimators = 5)
RFC.fit(X_train, y_train)

In [50]:
y_pred = RFC.predict(X_test)

In [51]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[1786  206   64]
 [ 104 3628   93]
 [  55  178 1893]]


In [52]:
print(classification_report(y_test, y_pred, target_names = ['low', 'medium', 'high']))

              precision    recall  f1-score   support

         low       0.92      0.87      0.89      2056
      medium       0.90      0.95      0.93      3825
        high       0.92      0.89      0.91      2126

    accuracy                           0.91      8007
   macro avg       0.92      0.90      0.91      8007
weighted avg       0.91      0.91      0.91      8007



In [53]:
score = RFC.score(X_test, y_test)
print(score)

0.9125764955663794


In [57]:
with open('RFC_model.pkl', 'wb') as file:
  pickle.dump(RFC, file)

## Train on AWS

In [2]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.model import Model

In [5]:
sagemaker_session = sagemaker.Session()

In [3]:
role = get_execution_role()

In [6]:
role

'arn:aws:iam::484907493357:role/service-role/AmazonSageMaker-ExecutionRole-20240826T174800'

In [7]:
train_input = sagemaker_session.upload_data('data')

In [8]:
train_input

's3://sagemaker-us-east-1-484907493357/data'

In [9]:
from sagemaker.sklearn.estimator import SKLearn

In [15]:
script_path = 'train.py'

sklearn = SKLearn(
    entry_point = script_path,
    instance_type = 'ml.m4.xlarge',
    framework_version = '0.20.0',
    py_version = 'py3',
    role = role,
    sagemaker_session = sagemaker_session)

In [16]:
sklearn.fit({'train': train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-09-02-02-47-11-770


2024-09-02 02:47:13 Starting - Starting the training job...
2024-09-02 02:47:27 Starting - Preparing the instances for training...
2024-09-02 02:48:12 Downloading - Downloading the training image......
2024-09-02 02:49:08 Training - Training image download completed. Training in progress.
2024-09-02 02:49:08 Uploading - Uploading generated training model[34m2024-09-02 02:49:03,571 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-09-02 02:49:03,574 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-09-02 02:49:03,583 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-09-02 02:49:03,772 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-09-02 02:49:03,784 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-09-02 02:49:03,795 sagemaker-training-toolkit INFO     No

## Deploy on AWS

In [27]:
model = Model(
    image_uri = '484907493357.dkr.ecr.us-east-1.amazonaws.com/finalproject:v1',
    role = role,
    model_data = None, 
    env = {"SAGEMAKER_PROGRAM": "serve.py"} 
)
predictor = model.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    container_startup_health_check_timeout = 600 
)

INFO:sagemaker:Creating model with name: finalproject-2024-09-02-06-58-46-048
INFO:sagemaker:Creating endpoint-config with name finalproject-2024-09-02-06-58-46-711
INFO:sagemaker:Creating endpoint with name finalproject-2024-09-02-06-58-46-711


--------------------------*

UnexpectedStatusException: Error hosting endpoint finalproject-2024-09-02-06-58-46-711: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html