In [1]:
# Load (or reload) the nb_black notebook extension for this kernel
# (presumably used to auto-format the code cells with black).
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# creating supervised learning imports

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

<IPython.core.display.Javascript object>

In [3]:
# The full data set is too large to model in the time available, so a fixed,
# seeded sample of 100,000 rows is drawn right after loading.
final_df = pd.read_csv("./cleaned_data.csv").sample(n=100000, random_state=13)
final_df.head()

Unnamed: 0,event_id,app_id,is_active,device_id,group,8,9,10,11,12,...,device_model_魅蓝,device_model_魅蓝2,device_model_魅蓝NOTE,device_model_魅蓝Note 2,device_model_魅蓝metal,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8
358014,671542,-6172775651801283024,0,-5591315370762637500,M22-,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254503,273588,1883678791934985414,1,6414313316267266281,M23-26,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91844,54515,-6590473556670600053,0,4189762977638537239,F29-32,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
411452,1438557,-2320783822570582843,0,8268943186752324161,M32-38,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
293071,375667,4373268368372483132,0,4917614238706511620,M29-31,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# The group labels are strings and have to be converted to numeric codes.
# Planned encoding (males 0-5, females 6-11, each ordered youngest to oldest):
#   M22-   -> 0      F23-   -> 6
#   M23-26 -> 1      F24-26 -> 7
#   M27-28 -> 2      F27-28 -> 8
#   M29-31 -> 3      F29-32 -> 9
#   M32-38 -> 4      F33-42 -> 10
#   M39+   -> 5      F43+   -> 11
# Number of rows per raw label:
final_df.loc[:, "group"].value_counts()

M32-38    18475
M39+      16530
M29-31    12766
M23-26    11633
M27-28     7644
F33-42     7031
M22-       6669
F43+       5263
F29-32     4755
F27-28     3138
F23-       3086
F24-26     3010
Name: group, dtype: int64

<IPython.core.display.Javascript object>

In [5]:
# Converting object data to numeric data in group column.
# Males are coded 0-5 and females 6-11, each ordered youngest to oldest.
to_num = {
    "M22-": 0,
    "M23-26": 1,
    "M27-28": 2,
    "M29-31": 3,
    "M32-38": 4,
    "M39+": 5,
    "F23-": 6,
    "F24-26": 7,
    "F27-28": 8,
    "F29-32": 9,
    "F33-42": 10,
    "F43+": 11,
}

# Series.map turns any label that is missing from the mapping into NaN without
# complaint, which would corrupt the target; fail loudly here instead.
unmapped = set(final_df["group"].unique()) - set(to_num)
assert not unmapped, f"group labels missing from to_num: {unmapped}"

# Created new column for numeric group, can now drop old group column
final_df["num_group"] = final_df["group"].map(to_num)
final_df = final_df.drop(columns=["group"])
final_df.head()

Unnamed: 0,event_id,app_id,is_active,device_id,8,9,10,11,12,13,...,device_model_魅蓝2,device_model_魅蓝NOTE,device_model_魅蓝Note 2,device_model_魅蓝metal,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8,num_group
358014,671542,-6172775651801283024,0,-5591315370762637500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254503,273588,1883678791934985414,1,6414313316267266281,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
91844,54515,-6590473556670600053,0,4189762977638537239,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
411452,1438557,-2320783822570582843,0,8268943186752324161,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
293071,375667,4373268368372483132,0,4917614238706511620,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


<IPython.core.display.Javascript object>

In [6]:
# Columns handed to StandardScaler in the preprocessing step.
# NOTE(review): all three look like identifiers rather than measurements -
# confirm that scaling them and using them as features is intended.
num_cols = [
    "app_id",
    "device_id",
    "event_id",
]

<IPython.core.display.Javascript object>

In [7]:
# Target is the numeric group code; every remaining column is used as a feature
y = final_df["num_group"]
X = final_df.drop(columns=["num_group"])

# Seeded 80/20 split, stratified on the target so each group keeps its share in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=13
)

<IPython.core.display.Javascript object>

In [8]:
# Preprocessing step shared by the pipelines: standardise the columns listed in
# num_cols and pass every other column through unchanged.
preprocessing = ColumnTransformer(
    transformers=[("scale", StandardScaler(), num_cols)],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

# Original Models

### K-Nearest Neighbors Classifier

In [9]:
# Baseline: preprocessing followed by a k-nearest-neighbours classifier with default settings
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("knn", KNeighborsClassifier()),
    ],
    verbose=True,
)

pipeline.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing knn, total=  17.4s


Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scale', StandardScaler(),
                                                  ['app_id', 'device_id',
                                                   'event_id'])])),
                ('knn', KNeighborsClassifier())],
         verbose=True)

<IPython.core.display.Javascript object>

In [10]:
# Short on time: the grid is kept small and 3-fold cross-validation is used to limit the number of fits
grid = dict(
    knn__n_neighbors=[20, 30, 40],
    knn__weights=["distance", "uniform"],
)

model = GridSearchCV(estimator=pipeline, param_grid=grid, cv=3, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed: 24.0min remaining: 62.3min
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed: 41.7min remaining:  8.3min
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 41.8min finished


[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing knn, total=  18.0s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['app_id',
                                                                          'device_id',
                                                                          'event_id'])])),
                                       ('knn', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'knn__n_neighbors': [20, 30, 40],
                         'knn__weights': ['distance', 'uniform']},
             verbose=2)

<IPython.core.display.Javascript object>

In [11]:
# Best hyper-parameter combination found by the grid search.
# Larger n_neighbors values appear to score better, so the next search should extend
# the n_neighbors range upwards.
model.best_params_

{'knn__n_neighbors': 40, 'knn__weights': 'distance'}

<IPython.core.display.Javascript object>

In [12]:
# The gap between train and test score suggests the model is overfitting heavily.
# NOTE(review): if the selected model uses weights="distance", every training row is its
# own zero-distance neighbour, so a perfect train score is expected and says little;
# judge the model on the test / cross-validation score instead.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}", f"Test score: {test_score}", sep="\n")

Train score: 1.0
Test score: 0.22205


<IPython.core.display.Javascript object>

In [13]:
y_pred = model.predict(X_test)

# Group names listed in the order of their numeric codes (0-11), which is the
# order in which confusion_matrix lays out its rows and columns.
group_names = [
    "M22-", "M23-26", "M27-28", "M29-31", "M32-38", "M39+",
    "F23-", "F24-26", "F27-28", "F29-32", "F33-42", "F43+",
]

# Rows are the true groups, columns the predicted groups
con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[f"Actually {name}" for name in group_names],
    columns=[f"Predicted {name}" for name in group_names],
)
con_mat.style.background_gradient(axis=None)

Unnamed: 0,Predicted M22-,Predicted M23-26,Predicted M27-28,Predicted M29-31,Predicted M32-38,Predicted M39+,Predicted F23-,Predicted F24-26,Predicted F27-28,Predicted F29-32,Predicted F33-42,Predicted F43+
Actually M22-,176,219,40,154,434,228,29,8,5,9,22,10
Actually M23-26,129,441,61,296,804,456,22,12,11,16,47,32
Actually M27-28,63,219,88,197,535,334,17,9,6,13,36,12
Actually M29-31,98,285,85,485,844,614,29,6,10,19,58,20
Actually M32-38,91,369,83,379,1608,955,24,9,18,26,92,41
Actually M39+,59,227,61,316,1083,1401,14,6,10,15,72,42
Actually F23-,42,93,20,55,215,109,55,3,3,5,9,8
Actually F24-26,36,84,17,65,208,125,25,9,5,9,13,6
Actually F27-28,20,87,13,73,218,142,14,3,14,8,23,12
Actually F29-32,19,97,22,108,324,276,20,6,2,31,26,20


<IPython.core.display.Javascript object>

In [14]:
# Per-class precision, recall and F1 on the test set; classes are the numeric group codes
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.22      0.13      0.17      1334
           1       0.19      0.19      0.19      2327
           2       0.16      0.06      0.08      1529
           3       0.21      0.19      0.20      2553
           4       0.23      0.44      0.30      3695
           5       0.26      0.42      0.32      3306
           6       0.20      0.09      0.12       617
           7       0.11      0.01      0.03       602
           8       0.15      0.02      0.04       627
           9       0.18      0.03      0.05       951
          10       0.19      0.07      0.10      1406
          11       0.13      0.03      0.05      1053

    accuracy                           0.22     20000
   macro avg       0.18      0.14      0.14     20000
weighted avg       0.20      0.22      0.19     20000



<IPython.core.display.Javascript object>

### Random Forest Classifier

In [9]:
# Baseline: preprocessing followed by a random forest with default settings.
# The forest is seeded (same seed as the sampling and the train/test split) so that
# this fit, and the grid search built on this pipeline, give the same result on every run.
pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("rf", RandomForestClassifier(random_state=13)),
    ],
    verbose=True,
)

pipeline.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.8s
[Pipeline] ................ (step 2 of 2) Processing rf, total= 6.3min


Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scale', StandardScaler(),
                                                  ['app_id', 'device_id',
                                                   'event_id'])])),
                ('rf', RandomForestClassifier())],
         verbose=2)

<IPython.core.display.Javascript object>

In [10]:
# An earlier search selected max_depth=50 and scored far better than the ~0.2 of the
# previous model, so this grid explores deeper trees (50 and above).
grid = dict(
    rf__max_depth=[50, 70, 90],
    rf__n_estimators=[1, 10, 100, 1000],
    rf__min_samples_leaf=[1, 3, 5, 7, 10],
    rf__criterion=["gini", "entropy"],
)

model = GridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 63.9min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 321.7min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 710.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 1295.7min finished


[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   2.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total= 4.3min


GridSearchCV(estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['app_id',
                                                                          'device_id',
                                                                          'event_id'])])),
                                       ('rf', RandomForestClassifier())],
                                verbose=2),
             n_jobs=-1,
             param_grid={'rf__criterion': ['gini', 'entropy'],
                         'rf__max_depth': [10, 20, 30, 40, 50],
                         'rf__min_samples_leaf': [1, 3, 5, 7, 10],
                         'rf__n_estimators': [1, 10, 100, 1000]},
             verbose=2)

<IPython.core.display.Javascript object>

In [11]:
# Hyper-parameter combination selected by the random forest grid search
model.best_params_

{'rf__criterion': 'gini',
 'rf__max_depth': 50,
 'rf__min_samples_leaf': 1,
 'rf__n_estimators': 100}

<IPython.core.display.Javascript object>

In [12]:
# Score of the tuned random forest on the training and the held-out test data
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}", f"Test score: {test_score}", sep="\n")

Train score: 0.9282236977912337
Test score: 0.4603545952888828


<IPython.core.display.Javascript object>

In [13]:
y_pred = model.predict(X_test)

# Group names listed in the order of their numeric codes (0-11), which is the
# order in which confusion_matrix lays out its rows and columns.
group_names = [
    "M22-", "M23-26", "M27-28", "M29-31", "M32-38", "M39+",
    "F23-", "F24-26", "F27-28", "F29-32", "F33-42", "F43+",
]

# Rows are the true groups, columns the predicted groups
con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[f"Actually {name}" for name in group_names],
    columns=[f"Predicted {name}" for name in group_names],
)
con_mat.style.background_gradient(axis=None)

Unnamed: 0,Predicted M22-,Predicted M23-26,Predicted M27-28,Predicted M29-31,Predicted M32-38,Predicted M39+,Predicted F23-,Predicted F24-26,Predicted F27-28,Predicted F29-32,Predicted F33-42,Predicted F43+
Actually M22-,2324,529,180,420,1270,636,108,43,43,57,80,51
Actually M23-26,308,4482,281,811,2373,1270,94,78,67,95,127,94
Actually M27-28,210,515,1965,705,1859,1031,50,36,36,84,151,93
Actually M29-31,207,692,293,5226,2478,1634,33,56,54,85,197,129
Actually M32-38,256,779,333,825,11113,2101,79,47,74,136,273,175
Actually M39+,189,538,211,787,2661,9214,80,46,56,110,302,198
Actually F23-,153,274,74,181,656,278,934,48,9,46,78,39
Actually F24-26,111,218,78,232,706,387,60,563,39,61,71,36
Actually F27-28,98,255,64,230,861,422,38,31,554,51,92,36
Actually F29-32,122,309,141,329,1198,750,62,31,40,1012,117,59


<IPython.core.display.Javascript object>

In [14]:
# Per-class precision, recall and F1 on the test set; classes are the numeric group codes
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.40      0.47      5741
           1       0.49      0.44      0.46     10080
           2       0.51      0.29      0.37      6735
           3       0.50      0.47      0.48     11084
           4       0.39      0.69      0.50     16191
           5       0.46      0.64      0.53     14392
           6       0.57      0.34      0.42      2770
           7       0.55      0.22      0.31      2562
           8       0.53      0.20      0.29      2732
           9       0.54      0.24      0.33      4170
          10       0.51      0.28      0.36      6059
          11       0.52      0.23      0.32      4682

    accuracy                           0.46     87198
   macro avg       0.51      0.37      0.41     87198
weighted avg       0.48      0.46      0.45     87198



<IPython.core.display.Javascript object>

### Support Vector Classifier

In [None]:
# Baseline: preprocessing followed by a support vector classifier with default settings.
# Pipeline's verbose flag is a boolean; True matches the other pipelines in this notebook
# and is accepted by scikit-learn versions that validate parameter types.
pipeline = Pipeline(
    [("preprocessing", preprocessing), ("SVC", SVC())], verbose=True
)

pipeline.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.8s


In [None]:
# `degree` only affects the polynomial kernel, so it is searched in its own sub-grid.
# Crossing it with "linear" and "rbf" would refit identical models once per degree value
# (27 candidates instead of the 15 distinct ones below).
grid = [
    {"SVC__kernel": ["linear", "rbf"], "SVC__C": [1, 10, 100]},
    {"SVC__kernel": ["poly"], "SVC__C": [1, 10, 100], "SVC__degree": [3, 5, 7]},
]

model = GridSearchCV(pipeline, grid, n_jobs=-1, verbose=2, cv=3)
model.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the SVC grid search
model.best_params_

In [None]:
# Score of the tuned SVC on the training and the held-out test data
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}", f"Test score: {test_score}", sep="\n")

In [None]:
y_pred = model.predict(X_test)

# Group names listed in the order of their numeric codes (0-11), which is the
# order in which confusion_matrix lays out its rows and columns.
group_names = [
    "M22-", "M23-26", "M27-28", "M29-31", "M32-38", "M39+",
    "F23-", "F24-26", "F27-28", "F29-32", "F33-42", "F43+",
]

# Rows are the true groups, columns the predicted groups
con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[f"Actually {name}" for name in group_names],
    columns=[f"Predicted {name}" for name in group_names],
)
con_mat.style.background_gradient(axis=None)

In [None]:
# Per-class precision, recall and F1 on the test set; classes are the numeric group codes
print(classification_report(y_test, y_pred))

# Models with PCA

In [None]:
# PCA is sensitive to feature scale, so every training column is standardised first.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)

# Fit a PCA that keeps all components and project the standardised training data onto them
pca = PCA()
X_pca = pca.fit_transform(X_train_std)

In [None]:
# Keep the smallest number of leading components whose cumulative explained variance
# reaches 90%. Counting the entries that are still below the threshold gives the number
# of components that fall short of it, so one more component is needed to actually
# reach 90% (and the count alone would be 0 if the first component already exceeded it).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = int(np.sum(cumulative_variance < 0.90)) + 1
# Never ask for more components than exist (e.g. if round-off leaves the total just under 0.90)
n_components = min(n_components, len(cumulative_variance))
X_pca = X_pca[:, :n_components]

In [None]:
# Display the number of principal components selected above; the PCA pipelines below use this value
n_components

### K-Nearest Neighbors Classifier

In [None]:
# Standardise all features, reduce them to the selected principal components, then run KNN.
# PCA is seeded because, with a reduced n_components, scikit-learn may pick its randomized
# SVD solver; a fixed random_state keeps the projection identical between runs.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=n_components, random_state=13)),
        ("knn", KNeighborsClassifier()),
    ]
)
pipeline = pipeline.fit(X_train, y_train)

In [None]:
# Odd neighbour counts from 3 to 11, with both weighting schemes
grid = dict(
    knn__n_neighbors=list(range(3, 12, 2)),
    knn__weights=["distance", "uniform"],
)

model = GridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the PCA + KNN grid search
model.best_params_

In [None]:
# Score of the tuned PCA + KNN model on the training and the held-out test data
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}", f"Test score: {test_score}", sep="\n")

In [None]:
y_pred = model.predict(X_test)

# Group names listed in the order of their numeric codes (0-11), which is the
# order in which confusion_matrix lays out its rows and columns.
group_names = [
    "M22-", "M23-26", "M27-28", "M29-31", "M32-38", "M39+",
    "F23-", "F24-26", "F27-28", "F29-32", "F33-42", "F43+",
]

# Rows are the true groups, columns the predicted groups
con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[f"Actually {name}" for name in group_names],
    columns=[f"Predicted {name}" for name in group_names],
)
con_mat.style.background_gradient(axis=None)

In [None]:
# Per-class precision, recall and F1 on the test set; classes are the numeric group codes
print(classification_report(y_test, y_pred))

### Random Forest Classifier

In [None]:
# Standardise all features, reduce them to the selected principal components, then fit a
# random forest. Both stochastic steps are seeded (same seed as the sampling and the
# train/test split) so the pipeline and the grid search built on it are reproducible.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=n_components, random_state=13)),
        ("rf", RandomForestClassifier(random_state=13)),
    ]
)
pipeline = pipeline.fit(X_train, y_train)

In [None]:
# Search tree depth, forest size, leaf size and split criterion for the PCA + random forest pipeline
grid = dict(
    rf__max_depth=[3, 5, 7, 10, 15],
    rf__n_estimators=[1, 10, 100, 1000],
    rf__min_samples_leaf=[1, 3, 5, 7, 10],
    rf__criterion=["gini", "entropy"],
)

model = GridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the PCA + random forest grid search
model.best_params_

In [None]:
# Score of the tuned PCA + random forest model on the training and the held-out test data
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}", f"Test score: {test_score}", sep="\n")

In [None]:
y_pred = model.predict(X_test)

# Group names listed in the order of their numeric codes (0-11), which is the
# order in which confusion_matrix lays out its rows and columns.
group_names = [
    "M22-", "M23-26", "M27-28", "M29-31", "M32-38", "M39+",
    "F23-", "F24-26", "F27-28", "F29-32", "F33-42", "F43+",
]

# Rows are the true groups, columns the predicted groups
con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[f"Actually {name}" for name in group_names],
    columns=[f"Predicted {name}" for name in group_names],
)
con_mat.style.background_gradient(axis=None)

In [None]:
# Per-class precision, recall and F1 on the test set; classes are the numeric group codes
print(classification_report(y_test, y_pred))

### Support Vector Classifier

In [None]:
# Standardise all features, reduce them to the selected principal components, then fit an SVC.
# PCA is seeded because, with a reduced n_components, scikit-learn may pick its randomized
# SVD solver; a fixed random_state keeps the projection identical between runs.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=n_components, random_state=13)),
        ("SVC", SVC()),
    ]
)
pipeline = pipeline.fit(X_train, y_train)

In [None]:
# `degree` only affects the polynomial kernel, so it is searched in its own sub-grid.
# Crossing it with "linear" and "rbf" would refit identical models once per degree value
# (48 candidates instead of the 24 distinct ones below).
grid = [
    {"SVC__kernel": ["linear", "rbf"], "SVC__C": [1, 10, 100, 1000]},
    {
        "SVC__kernel": ["poly"],
        "SVC__C": [1, 10, 100, 1000],
        "SVC__degree": [3, 5, 7, 10],
    },
]

model = GridSearchCV(pipeline, grid, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the PCA + SVC grid search
model.best_params_

In [None]:
# Score of the tuned PCA + SVC model on the training and the held-out test data
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}", f"Test score: {test_score}", sep="\n")

In [None]:
y_pred = model.predict(X_test)

# Group names listed in the order of their numeric codes (0-11), which is the
# order in which confusion_matrix lays out its rows and columns.
group_names = [
    "M22-", "M23-26", "M27-28", "M29-31", "M32-38", "M39+",
    "F23-", "F24-26", "F27-28", "F29-32", "F33-42", "F43+",
]

# Rows are the true groups, columns the predicted groups
con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[f"Actually {name}" for name in group_names],
    columns=[f"Predicted {name}" for name in group_names],
)
con_mat.style.background_gradient(axis=None)

In [None]:
# Per-class precision, recall and F1 on the test set; classes are the numeric group codes
print(classification_report(y_test, y_pred))

# Models with SelectKBest

### K-Nearest Neighbors Classifier