In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# creating supervised learning imports

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

<IPython.core.display.Javascript object>

In [3]:
# Due to time constraints and such a large data set, I subsampled my data to work with a smaller set for preprocessing
final_df = pd.read_csv("./cleaned_data.csv")
final_df = final_df.sample(100000, random_state=13)
final_df.head()

Unnamed: 0,event_id,app_id,is_active,device_id,group,8,9,10,11,12,...,device_model_魅蓝,device_model_魅蓝2,device_model_魅蓝NOTE,device_model_魅蓝Note 2,device_model_魅蓝metal,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8
358014,671542,-6172775651801283024,0,-5591315370762637500,M22-,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254503,273588,1883678791934985414,1,6414313316267266281,M23-26,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91844,54515,-6590473556670600053,0,4189762977638537239,F29-32,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
411452,1438557,-2320783822570582843,0,8268943186752324161,M32-38,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
293071,375667,4373268368372483132,0,4917614238706511620,M29-31,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

# Additional EDA

In [4]:
# Do not think event_id, app_id are necessary
final_df = final_df.drop(columns=["app_id", "event_id"])

<IPython.core.display.Javascript object>

In [5]:
# Need to convert these values to numeric values
# Considering:
# M22- : 0
# M23-26 : 1
# M27-28 : 2
# M29-31 : 3
# M32-38 : 4
# M39+ : 5
# F23- : 6
# F24-26 : 7
# F27-28 : 8
# F29-32 : 9
# F33-42 : 10
# F43+ : 11
final_df["group"].value_counts()

M32-38    18475
M39+      16530
M29-31    12766
M23-26    11633
M27-28     7644
F33-42     7031
M22-       6669
F43+       5263
F29-32     4755
F27-28     3138
F23-       3086
F24-26     3010
Name: group, dtype: int64

<IPython.core.display.Javascript object>

In [6]:
# Converting object data to numeric data in group column
to_num = {
    "M22-": 0,
    "M23-26": 1,
    "M27-28": 2,
    "M29-31": 3,
    "M32-38": 4,
    "M39+": 5,
    "F23-": 6,
    "F24-26": 7,
    "F27-28": 8,
    "F29-32": 9,
    "F33-42": 10,
    "F43+": 11,
}

# Created new column for numeric group, can now drop old group column
final_df["num_group"] = final_df["group"].map(to_num)
final_df = final_df.drop(columns=["group"])
final_df.head()

Unnamed: 0,is_active,device_id,8,9,10,11,12,13,16,17,...,device_model_魅蓝2,device_model_魅蓝NOTE,device_model_魅蓝Note 2,device_model_魅蓝metal,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8,num_group
358014,0,-5591315370762637500,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254503,1,6414313316267266281,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
91844,0,4189762977638537239,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
411452,0,8268943186752324161,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
293071,0,4917614238706511620,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


<IPython.core.display.Javascript object>

In [7]:
num_cols = ["device_id"]

<IPython.core.display.Javascript object>

In [8]:
# splitting up the data and choosing num_group as target variable
X = final_df.drop(columns=["num_group"])
y = final_df["num_group"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=13, stratify=y
)

<IPython.core.display.Javascript object>

In [9]:
# set up preprocessing for pipeline (used to scale features)
preprocessing = ColumnTransformer(
    [("scale", StandardScaler(), num_cols),], remainder="passthrough",
)

<IPython.core.display.Javascript object>

# Original Models

### KNearestNeighbor Classifier

In [10]:
pipeline = Pipeline(
    [("preprocessing", preprocessing), ("knn", KNeighborsClassifier())], verbose=True
)

pipeline.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing knn, total=  22.3s


Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scale', StandardScaler(),
                                                  ['device_id'])])),
                ('knn', KNeighborsClassifier())],
         verbose=True)

<IPython.core.display.Javascript object>

In [11]:
# Due to shortage of time I decreased the amount of parameters and lowered the cv
grid = {
    "knn__n_neighbors": [100, 130, 150],
    "knn__weights": ["distance"],
    "knn__leaf_size": [30, 50, 70],
}

model = GridSearchCV(pipeline, grid, n_jobs=-1, verbose=2, cv=2)
model.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed: 50.9min remaining: 132.4min
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed: 80.3min remaining: 16.1min
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 81.1min finished


[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing knn, total=  22.4s


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['device_id'])])),
                                       ('knn', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'knn__leaf_size': [30, 50, 70],
                         'knn__n_neighbors': [100, 130, 150],
                         'knn__weights': ['distance']},
             verbose=2)

<IPython.core.display.Javascript object>

In [12]:
# It seems like the higher the amount of n_neighbors the better the score, for next test, will increase n_neighbors
model.best_params_

{'knn__leaf_size': 30, 'knn__n_neighbors': 150, 'knn__weights': 'distance'}

<IPython.core.display.Javascript object>

In [13]:
# Model still appears to be extremely overfitting
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 1.0
Test score: 0.2477


<IPython.core.display.Javascript object>

In [14]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[
        "Actually M22-",
        "Actually M23-26",
        "Actually M27-28",
        "Actually M29-31",
        "Actually M32-38",
        "Actually M39+",
        "Actually F23-",
        "Actually F24-26",
        "Actually F27-28",
        "Actually F29-32",
        "Actually F33-42",
        "Actually F43+",
    ],
    columns=[
        "Predicted M22-",
        "Predicted M23-26",
        "Predicted M27-28",
        "Predicted M29-31",
        "Predicted M32-38",
        "Predicted M39+",
        "Predicted F23-",
        "Predicted F24-26",
        "Predicted F27-28",
        "Predicted F29-32",
        "Predicted F33-42",
        "Predicted F43+",
    ],
)
con_mat.style.background_gradient(axis=None)

Unnamed: 0,Predicted M22-,Predicted M23-26,Predicted M27-28,Predicted M29-31,Predicted M32-38,Predicted M39+,Predicted F23-,Predicted F24-26,Predicted F27-28,Predicted F29-32,Predicted F33-42,Predicted F43+
Actually M22-,189,207,34,120,452,240,35,11,5,10,16,15
Actually M23-26,106,465,56,208,907,447,28,12,15,19,34,30
Actually M27-28,41,193,124,168,595,322,18,8,7,12,24,17
Actually M29-31,73,241,62,441,1003,598,25,13,13,24,34,26
Actually M32-38,81,301,58,262,1879,926,37,15,19,26,55,36
Actually M39+,56,177,56,224,1173,1460,20,12,14,24,57,33
Actually F23-,30,88,15,47,225,116,67,6,3,4,9,7
Actually F24-26,29,61,19,46,227,141,19,27,9,6,10,8
Actually F27-28,28,73,11,52,248,141,8,7,39,6,10,4
Actually F29-32,24,94,25,83,372,255,18,5,1,48,17,9


<IPython.core.display.Javascript object>

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.14      0.19      1334
           1       0.22      0.20      0.21      2327
           2       0.25      0.08      0.12      1529
           3       0.24      0.17      0.20      2553
           4       0.24      0.51      0.32      3695
           5       0.27      0.44      0.33      3306
           6       0.22      0.11      0.15       617
           7       0.21      0.04      0.07       602
           8       0.27      0.06      0.10       627
           9       0.24      0.05      0.08       951
          10       0.31      0.09      0.14      1406
          11       0.31      0.09      0.14      1053

    accuracy                           0.25     20000
   macro avg       0.25      0.17      0.17     20000
weighted avg       0.25      0.25      0.22     20000



<IPython.core.display.Javascript object>

### Random Forest Classifier

In [16]:
pipeline = Pipeline(
    [("preprocessing", preprocessing), ("rf", RandomForestClassifier())], verbose=True
)

pipeline.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total= 1.3min


Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scale', StandardScaler(),
                                                  ['device_id'])])),
                ('rf', RandomForestClassifier())],
         verbose=True)

<IPython.core.display.Javascript object>

In [17]:
# Max depth of 50 provides a better score, this is much better than the previous score of around 0.2.
# To improve score suggest increasing max depth even more
grid = {
    "rf__max_depth": [50, 70, 90],
    "rf__n_estimators": [1, 10, 100],
    "rf__min_samples_leaf": [1, 3, 5, 7],
    "rf__criterion": ["gini"],
}

model = GridSearchCV(pipeline, grid, n_jobs=-1, cv=2, verbose=True)
model.fit(X_train, y_train)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.2min finished


[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  38.0s


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['device_id'])])),
                                       ('rf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'rf__criterion': ['gini'],
                         'rf__max_depth': [50, 70, 90],
                         'rf__min_samples_leaf': [1, 3, 5, 7],
                         'rf__n_estimators': [1, 10, 100]},
             verbose=True)

<IPython.core.display.Javascript object>

In [18]:
model.best_params_

{'rf__criterion': 'gini',
 'rf__max_depth': 90,
 'rf__min_samples_leaf': 3,
 'rf__n_estimators': 100}

<IPython.core.display.Javascript object>

In [19]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.415025
Test score: 0.28685


<IPython.core.display.Javascript object>

In [20]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[
        "Actually M22-",
        "Actually M23-26",
        "Actually M27-28",
        "Actually M29-31",
        "Actually M32-38",
        "Actually M39+",
        "Actually F23-",
        "Actually F24-26",
        "Actually F27-28",
        "Actually F29-32",
        "Actually F33-42",
        "Actually F43+",
    ],
    columns=[
        "Predicted M22-",
        "Predicted M23-26",
        "Predicted M27-28",
        "Predicted M29-31",
        "Predicted M32-38",
        "Predicted M39+",
        "Predicted F23-",
        "Predicted F24-26",
        "Predicted F27-28",
        "Predicted F29-32",
        "Predicted F33-42",
        "Predicted F43+",
    ],
)
con_mat.style.background_gradient(axis=None)

Unnamed: 0,Predicted M22-,Predicted M23-26,Predicted M27-28,Predicted M29-31,Predicted M32-38,Predicted M39+,Predicted F23-,Predicted F24-26,Predicted F27-28,Predicted F29-32,Predicted F33-42,Predicted F43+
Actually M22-,244,242,10,102,452,230,36,2,0,4,6,6
Actually M23-26,105,541,22,180,947,472,22,8,1,13,10,6
Actually M27-28,69,173,69,154,660,366,19,2,2,5,6,4
Actually M29-31,67,213,23,526,1003,678,14,0,2,2,20,5
Actually M32-38,68,224,32,198,2234,879,28,1,4,6,15,6
Actually M39+,47,148,19,147,1071,1821,20,0,1,8,19,5
Actually F23-,41,109,7,38,223,96,85,5,1,2,10,0
Actually F24-26,42,70,12,30,239,141,28,29,0,4,7,0
Actually F27-28,33,73,9,48,255,149,16,5,21,4,9,5
Actually F29-32,24,72,11,68,441,265,13,2,2,38,12,3


<IPython.core.display.Javascript object>

In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.30      0.18      0.23      1334
           1       0.27      0.23      0.25      2327
           2       0.29      0.05      0.08      1529
           3       0.32      0.21      0.25      2553
           4       0.26      0.60      0.37      3695
           5       0.30      0.55      0.39      3306
           6       0.26      0.14      0.18       617
           7       0.51      0.05      0.09       602
           8       0.60      0.03      0.06       627
           9       0.39      0.04      0.07       951
          10       0.41      0.06      0.11      1406
          11       0.47      0.04      0.07      1053

    accuracy                           0.29     20000
   macro avg       0.37      0.18      0.18     20000
weighted avg       0.33      0.29      0.24     20000



<IPython.core.display.Javascript object>

# Models with PCA

In [None]:
# Since I am performing pca its necessary to standardize data and fit it to data frame.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

# Computing the principal components
pca = PCA()
X_pca = pca.fit_transform(X_train_std)

In [None]:
# I plan to use the amount of components that add up to 90% variance.
n_components = np.sum(np.cumsum(pca.explained_variance_ratio_) < 0.90)
X_pca = X_pca[:, :n_components]

In [None]:
# check to see the amount of components being used
n_components

### KNearestNeighbor Classifier

In [None]:
pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA(n_components = n_components)), ("knn", KNeighborsClassifier()),]
)
pipeline = pipeline.fit(X_train, y_train)

In [None]:
grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11,],
    "knn__weights": ["distance", "uniform"],
}

model = GridSearchCV(pipeline, grid, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

In [None]:
model.best_params_

In [None]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

In [None]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[
        "Actually M22-",
        "Actually M23-26",
        "Actually M27-28",
        "Actually M29-31",
        "Actually M32-38",
        "Actually M39+",
        "Actually F23-",
        "Actually F24-26",
        "Actually F27-28",
        "Actually F29-32",
        "Actually F33-42",
        "Actually F43+",
    ],
    columns=[
        "Predicted M22-",
        "Predicted M23-26",
        "Predicted M27-28",
        "Predicted M29-31",
        "Predicted M32-38",
        "Predicted M39+",
        "Predicted F23-",
        "Predicted F24-26",
        "Predicted F27-28",
        "Predicted F29-32",
        "Predicted F33-42",
        "Predicted F43+",
    ],
)
con_mat.style.background_gradient(axis=None)

In [None]:
print(classification_report(y_test, y_pred))

### Random Forest Classifier

In [None]:
pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA(n_components = n_components)), ("rf", RandomForestClassifier()),]
)
pipeline = pipeline.fit(X_train, y_train)

In [None]:
grid = {
    "rf__max_depth": [3, 5, 7, 10, 15],
    "rf__n_estimators": [1, 10, 100, 1000],
    "rf__min_samples_leaf": [1, 3, 5, 7, 10],
    "rf__criterion": ["gini", "entropy"],
}

model = GridSearchCV(pipeline, grid, n_jobs=-1,verbose = 2)
model.fit(X_train, y_train)

In [None]:
model.best_params_

In [None]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

In [None]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[
        "Actually M22-",
        "Actually M23-26",
        "Actually M27-28",
        "Actually M29-31",
        "Actually M32-38",
        "Actually M39+",
        "Actually F23-",
        "Actually F24-26",
        "Actually F27-28",
        "Actually F29-32",
        "Actually F33-42",
        "Actually F43+",
    ],
    columns=[
        "Predicted M22-",
        "Predicted M23-26",
        "Predicted M27-28",
        "Predicted M29-31",
        "Predicted M32-38",
        "Predicted M39+",
        "Predicted F23-",
        "Predicted F24-26",
        "Predicted F27-28",
        "Predicted F29-32",
        "Predicted F33-42",
        "Predicted F43+",
    ],
)
con_mat.style.background_gradient(axis=None)

In [None]:
print(classification_report(y_test, y_pred))

### Support Vector Classifier

In [None]:
pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA(n_components = n_components)), ("SVC", SVC()),]
)
pipeline = pipeline.fit(X_train, y_train)

In [None]:
grid = {
    "SVC__kernel": ["linear", "rbf", "poly"],
    "SVC__C": [1, 10, 100, 1000],
    "SVC__degree": [3, 5, 7, 10],
}

model = GridSearchCV(pipeline, grid, n_jobs=-1,verbose = 2)
model.fit(X_train, y_train)

In [None]:
model.best_params_

In [None]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

In [None]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[
        "Actually M22-",
        "Actually M23-26",
        "Actually M27-28",
        "Actually M29-31",
        "Actually M32-38",
        "Actually M39+",
        "Actually F23-",
        "Actually F24-26",
        "Actually F27-28",
        "Actually F29-32",
        "Actually F33-42",
        "Actually F43+",
    ],
    columns=[
        "Predicted M22-",
        "Predicted M23-26",
        "Predicted M27-28",
        "Predicted M29-31",
        "Predicted M32-38",
        "Predicted M39+",
        "Predicted F23-",
        "Predicted F24-26",
        "Predicted F27-28",
        "Predicted F29-32",
        "Predicted F33-42",
        "Predicted F43+",
    ],
)
con_mat.style.background_gradient(axis=None)

In [None]:
print(classification_report(y_test, y_pred))

# Models with SelectKBest

### KNearestNeighbor Classifier