# Feature Selection Analysis

In [183]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import ConfusionMatrixDisplay

In [184]:
### Import data sources ###

# Training data
train = pd.read_csv('../data/train.csv')

#Previously purchased subscriptions by account
subscriptions = pd.read_csv('../data/subscriptions.csv')
#display(subscriptions.head())

# Location info for each patron and donation history
accounts = pd.read_csv('../data/account2.csv')
#display(accounts.head())

# Previous concerts by season
concerts = pd.read_csv('../data/concerts.csv')
#display(concerts.head())

# List of planned concert sets for the 2014-15 season
planned_concerts = pd.read_csv('../data/concerts_2014-15.csv')
#display(planned_concerts.head())

# Previously purchased tickets by account
tickets = pd.read_csv('../data/tickets_all.csv')
#display(tickets.head())

# Location and demographic information for zipcodes
zipcodes = pd.read_csv('../data/zipcodes.csv')
#display(zipcodes.head())

In [185]:
### Subscriptions Analysis ###

def add_subscriptions_feature(df, subscriptions):
    subscriptions_by_account = pd.DataFrame({'num_subscriptions':subscriptions.groupby(['account.id']).size()}).reset_index()
    #display(subscriptions_by_account.head())
    # Add total subscriptions by account id (if can't find, then 0)
    df['num_subscriptions'] = df['account.id'].map(subscriptions_by_account.set_index('account.id')['num_subscriptions'])
    # Assuming that if account doesn't appear on subscriptions data, then they have no subscriptions
    df['num_subscriptions'] = df['num_subscriptions'].fillna(0)
    #display(df.head())

    return df

def add_subscriptions_feature_to_test(df, subscriptions):
    subscriptions_by_account = pd.DataFrame({'num_subscriptions':subscriptions.groupby(['account.id']).size()}).reset_index()
    #display(subscriptions_by_account.head())
    # Add total subscriptions by account id (if can't find, then 0)
    df['num_subscriptions'] = df['ID'].map(subscriptions_by_account.set_index('account.id')['num_subscriptions'])
    # Assuming that if account doesn't appear on subscriptions data, then they have no subscriptions
    df['num_subscriptions'] = df['num_subscriptions'].fillna(0)
    #display(df.head())

    return df

In [186]:
def plot_decision_boundaries(X,y,model):
    """
    Plots the 2D decision boundary of a classification model
    Parameters:
    X (pandas dataframe): input features
    y (pandas series): target values
    model: trained scikit-learn model object
    """
    markers = ['^','s','v','o','x']
    colors = ['yellow','green','purple','blue','orange']
    cmap = ListedColormap(colors[:len(np.unique(y))])
    
    for i,k in enumerate(np.unique(y)):
        plt.scatter(X.loc[y.values==k].iloc[:,0],X.loc[y.values==k].iloc[:,1],
                    c=colors[i],marker=markers[i],label=k,edgecolor='black')

    xgrid = np.arange(X.iloc[:,0].min(),X.iloc[:,0].max(),
                      (X.iloc[:,0].max()-X.iloc[:,0].min())/500)
    ygrid = np.arange(X.iloc[:,1].min(),X.iloc[:,1].max(),
                      (X.iloc[:,1].max()-X.iloc[:,1].min())/500)
    xx,yy = np.meshgrid(xgrid,ygrid)
    
    mesh_preds = model.predict(np.c_[xx.ravel(),yy.ravel()])
    mesh_preds = mesh_preds.reshape(xx.shape)
    plt.contourf(xx,yy,mesh_preds,alpha=0.2,cmap=cmap)
    plt.legend()
    
    return

In [187]:
def get_best_knn(X_train_scaled, y_train):
    best_acc = 0
    best_model = None
    optimal_n_neighbors = None

    kf = KFold(n_splits=5)

    for n_neighbors in [1,2,5,10]:
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        accs = []
        for train_indexes,test_indexes in kf.split(X_train_scaled):
            X_train_fold,X_test_fold = X_train_scaled.iloc[train_indexes],X_train_scaled.iloc[test_indexes]
            y_train_fold,y_test_fold = y_train.iloc[train_indexes],y_train.iloc[test_indexes]
            model.fit(X_train_fold,y_train_fold)
            accs.append(accuracy_score(y_test_fold,model.predict(X_test_fold)))

        if np.mean(accs) > best_acc:
            best_acc = np.mean(accs)
            optimal_n_neighbors = n_neighbors
            best_model = model

    print("Found best KNN with optimal neighbors:",optimal_n_neighbors)

    return best_model

In [188]:
train = add_subscriptions_feature(train, subscriptions)

X = train[['num_subscriptions']]
y = train['label']
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=0,test_size=0.2)

# Let's scale the inputs to help it converge more easily
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns=X_train.columns)

kf = KFold(n_splits=10)
best_knn = get_best_knn(X_train_scaled,y_train)
models = [best_knn, SVC(kernel='linear', C=1), SVC(kernel='rbf', C=1)]
model_names = ['KNN', 'Linear SVC', 'RBF SVC']
model_aurocs = []

for index, model in enumerate(models):
    aurocs = []
    for train_indexes, test_indexes in kf.split(X_train_scaled):
        X_train_fold, X_test_fold = X_train_scaled.iloc[train_indexes], X_train_scaled.iloc[test_indexes]
        y_train_fold, y_test_fold = y_train.iloc[train_indexes], y_train.iloc[test_indexes]
        model.fit(X_train_fold, y_train_fold)

        aurocs.append(roc_auc_score(y_test_fold, model.predict(X_test_fold)))
    mean_auroc = np.mean(aurocs)
    print(model_names[index], " model AUROC: ", mean_auroc)
    model_aurocs.append(mean_auroc)
  
    #plot_decision_boundaries(X_train_scaled, y_train, model)

best_model_index = np.argmax(model_aurocs)
print("Best performance model is ", model_names[best_model_index])

best_model = models[best_model_index]
preds = best_model.predict(X_test_scaled)
auroc_score = roc_auc_score(y_test,preds)

print("Auroc Score:",auroc_score)

test = pd.read_csv('../data/test.csv')
test = add_subscriptions_feature_to_test(test, subscriptions)
X_test = test[['num_subscriptions']]
test['Predicted'] = best_model.predict(X_test)
test = test.drop(columns=['num_subscriptions'])
test.to_csv('../data/test_predictions.csv', index=False)


Found best KNN with optimal neighbors: 10
KNN  model AUROC:  0.6601682348914424
Linear SVC  model AUROC:  0.7132155995843643
RBF SVC  model AUROC:  0.6843375183165132
Best performance model is  Linear SVC
Auroc Score: 0.7522847522847523
