# Feature Selection Analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron

from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
### Import data sources ###

# Labelled accounts used for training
train = pd.read_csv('../data/train.csv')

# Per-account history: patron location and donations, past subscriptions,
# past ticket purchases
accounts = pd.read_csv('../data/account2.csv')
subscriptions = pd.read_csv('../data/subscriptions.csv')
tickets = pd.read_csv('../data/tickets_all.csv')

# Concert programmes: previous seasons and the planned 2014-15 concert sets
concerts = pd.read_csv('../data/concerts.csv')
planned_concerts = pd.read_csv('../data/concerts_2014-15.csv')

# Location and demographic information keyed by zipcode
zipcodes = pd.read_csv('../data/zipcodes.csv')

In [None]:
### Subscriptions Analysis ###

def add_subscriptions_feature(df, subscriptions, id_col='account.id'):
    """Attach each account's number of past subscriptions as ``num_subscriptions``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per account; must contain ``id_col``. The frame is modified
        in place (the column is added or overwritten) and also returned.
    subscriptions : pandas.DataFrame
        Subscription history. Every row counts as one subscription for the
        account named in its ``account.id`` column.
    id_col : str, default 'account.id'
        Name of the column in ``df`` that holds the account identifier.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``num_subscriptions`` set.

    Raises
    ------
    KeyError
        If ``id_col`` is missing from ``df`` or ``account.id`` is missing
        from ``subscriptions``.
    """
    # Rows per account; the resulting Series is indexed by account id, which
    # is exactly the lookup table ``Series.map`` needs.
    counts = subscriptions.groupby('account.id').size()
    # Accounts that never appear in the subscription history are assumed to
    # have no subscriptions.
    df['num_subscriptions'] = df[id_col].map(counts).fillna(0)
    return df


def add_subscriptions_feature_to_test(df, subscriptions):
    """Variant of ``add_subscriptions_feature`` for frames keyed by ``ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per account, identified by an ``ID`` column. Modified in
        place and returned.
    subscriptions : pandas.DataFrame
        Subscription history with an ``account.id`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``num_subscriptions`` set.
    """
    return add_subscriptions_feature(df, subscriptions, id_col='ID')

In [None]:
### Conductor Analysis ###

def _first_credit(who):
    """Return the text before the first comma of every ``who`` entry.

    The notebook treats that first entry as the conductor's name. Missing
    values stay missing instead of raising.
    """
    return who.str.split(',').str[0]


def _watched_conductor_counts(subscriptions, concerts, planned_concerts):
    """Count, per account, the planned conductors it could already have seen.

    A conductor counts for an account when they are credited first on a
    concert in a season the account subscribed to and are also credited first
    on at least one planned concert. None of the input frames is modified.

    Returns a Series indexed by ``account.id`` holding the count for every
    account present in ``subscriptions`` (0 when nothing matches).
    """
    planned = set(_first_credit(planned_concerts['who']).dropna())

    past = pd.DataFrame({
        'season': concerts['season'],
        'conductor': _first_credit(concerts['who']),
    }).dropna()

    # season -> conductors of that season who also appear in the planned season
    planned_by_season = {}
    for season, conductor in zip(past['season'], past['conductor']):
        if conductor in planned:
            planned_by_season.setdefault(season, set()).add(conductor)

    def count_watched(seasons):
        # Union over seasons so a conductor seen in several seasons counts once.
        seen = set()
        for season in seasons:
            seen |= planned_by_season.get(season, set())
        return len(seen)

    seasons_by_account = subscriptions.groupby('account.id')['season'].unique()
    return seasons_by_account.map(count_watched)


def add_conductor_feature(df, subscriptions, concerts, planned_concerts, id_col='account.id'):
    """Attach ``watched_conductors``: planned conductors the account may have seen.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per account; must contain ``id_col``. Modified in place (the
        column is added or overwritten) and also returned.
    subscriptions : pandas.DataFrame
        Subscription history with ``account.id`` and ``season`` columns.
    concerts : pandas.DataFrame
        Past concerts with ``season`` and ``who`` columns. Not modified.
    planned_concerts : pandas.DataFrame
        Planned concerts with a ``who`` column. Not modified.
    id_col : str, default 'account.id'
        Name of the column in ``df`` that holds the account identifier.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``watched_conductors`` set; accounts without any
        subscription history get 0.
    """
    watched = _watched_conductor_counts(subscriptions, concerts, planned_concerts)
    df['watched_conductors'] = df[id_col].map(watched).fillna(0)
    return df


def add_conductor_feature_to_test(df, subscriptions, concerts, planned_concerts):
    """Variant of ``add_conductor_feature`` for frames keyed by ``ID``.

    Takes the same ``subscriptions``, ``concerts`` and ``planned_concerts``
    frames; ``df`` is modified in place and returned with
    ``watched_conductors`` set.
    """
    return add_conductor_feature(df, subscriptions, concerts, planned_concerts, id_col='ID')

In [None]:
def add_account_features(df, accounts):
    """Left-join the account attributes onto ``df`` and drop the join key.

    Rows of ``df`` are matched to ``accounts`` on ``account.id``; rows without
    a match keep missing values in the account columns. The first rows of the
    result are displayed, and the joined frame (without ``account.id``) is
    returned. ``df`` itself is not modified.
    """
    joined = df.merge(accounts, on='account.id', how='left')
    without_key = joined.drop(columns=['account.id'])
    display(without_key.head())
    return without_key

# Preview only: the returned frame is not assigned, so `train` is not replaced
# by this call. As the cell's last expression the result is rendered as well,
# in addition to the head() that the function itself displays.
add_account_features(train, accounts)

Unnamed: 0,label,shipping.zip.code,billing.zip.code,shipping.city,billing.city,relationship,amount.donated.2013,amount.donated.lifetime,no.donations.lifetime,first.donated
0,0,,,,,,0.0,0.0,0,
1,0,,,,,,0.0,0.0,0,
2,0,,94597.0,,Walnut Creek,,0.0,0.0,0,
3,0,,94005.0,,Brisbane,,0.0,0.0,0,
4,0,,94610.0,,Oakland,,0.0,0.0,0,


Unnamed: 0,label,shipping.zip.code,billing.zip.code,shipping.city,billing.city,relationship,amount.donated.2013,amount.donated.lifetime,no.donations.lifetime,first.donated
0,0,,,,,,0.0,0.0,0,
1,0,,,,,,0.0,0.0,0,
2,0,,94597,,Walnut Creek,,0.0,0.0,0,
3,0,,94005,,Brisbane,,0.0,0.0,0,
4,0,,94610,,Oakland,,0.0,0.0,0,
...,...,...,...,...,...,...,...,...,...,...
6936,0,,94306,,Palo Alto,,0.0,0.0,0,
6937,0,,94118,,San Francisco,,0.0,0.0,0,
6938,0,,21012,,Arnold,,0.0,18000.0,6,12/31/02 0:00
6939,0,,94117,,San Francisco,,0.0,246.0,8,7/4/92 0:00


In [None]:
def add_features(df, concerts, planned_concerts, subscriptions):
    """Add the engineered feature columns to ``df`` and show a preview.

    Applies ``add_subscriptions_feature`` followed by
    ``add_conductor_feature``, displays the first rows of the result and
    returns it. Account-level features are not joined here.
    """
    with_subscriptions = add_subscriptions_feature(df, subscriptions)
    with_conductors = add_conductor_feature(
        with_subscriptions, subscriptions, concerts, planned_concerts
    )

    display(with_conductors.head())
    return with_conductors

In [None]:
def plot_decision_boundaries(X, y, model, resolution=500):
    """
    Plots the 2D decision boundary of a classification model
    Parameters:
    X (pandas dataframe): input features; the first two columns are plotted
    y (pandas series or array-like): target values, one per row of X
    model: trained scikit-learn model object whose predict() accepts two features
    resolution (int): number of grid points along each axis (default 500)
    """
    markers = ['^', 's', 'v', 'o', 'x']
    colors = ['yellow', 'green', 'purple', 'blue', 'orange']

    classes = np.unique(y)
    # Cycle through the palette so more than five classes do not raise.
    class_colors = [colors[i % len(colors)] for i in range(len(classes))]
    cmap = ListedColormap(class_colors)

    first, second = X.iloc[:, 0], X.iloc[:, 1]
    labels = np.asarray(y)
    for i, k in enumerate(classes):
        members = labels == k
        plt.scatter(first.loc[members], second.loc[members],
                    c=class_colors[i], marker=markers[i % len(markers)],
                    label=k, edgecolor='black')

    def axis_values(column):
        low, high = column.min(), column.max()
        if low == high:
            # A constant feature has no extent; pad it so the grid stays 2-D.
            low, high = low - 0.5, high + 0.5
        # linspace gives exactly `resolution` points; arange with a float step
        # can be off by one and fails outright when the step is zero.
        return np.linspace(low, high, resolution)

    xx, yy = np.meshgrid(axis_values(first), axis_values(second))

    # Classify every grid point, then shade the regions by predicted class.
    mesh_preds = model.predict(np.c_[xx.ravel(), yy.ravel()])
    mesh_preds = mesh_preds.reshape(xx.shape)
    plt.contourf(xx, yy, mesh_preds, alpha=0.2, cmap=cmap)
    plt.legend()

    return

In [None]:
def get_best_knn(X_train, y_train, candidate_neighbors=(1, 2, 5, 10), n_splits=5):
    """Pick the KNN neighbour count with the best cross-validated accuracy.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (rows are selected positionally with ``.iloc``).
    y_train : pandas.Series
        Training labels aligned with ``X_train``.
    candidate_neighbors : iterable of int, default (1, 2, 5, 10)
        Values of ``n_neighbors`` to compare.
    n_splits : int, default 5
        Number of ``KFold`` splits used for each candidate.

    Returns
    -------
    KNeighborsClassifier
        The classifier configured with the winning ``n_neighbors``. Its fitted
        state comes from the last cross-validation fold only, so refit it
        before using it for predictions.
    """
    # -inf rather than 0 so a candidate is always selected.
    best_acc = -np.inf
    best_model = None
    optimal_n_neighbors = None

    kf = KFold(n_splits=n_splits)

    for n_neighbors in candidate_neighbors:
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        accs = []
        for train_indexes, val_indexes in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_indexes], X_train.iloc[val_indexes]
            y_train_fold, y_val_fold = y_train.iloc[train_indexes], y_train.iloc[val_indexes]

            # Scale inside the fold with a local scaler: no dependence on a
            # notebook-level `scaler` variable and no leakage from the
            # validation rows into the scaling statistics.
            fold_scaler = StandardScaler()
            X_train_scaled_fold = fold_scaler.fit_transform(X_train_fold)
            X_val_scaled_fold = fold_scaler.transform(X_val_fold)

            model.fit(X_train_scaled_fold, y_train_fold)
            accs.append(accuracy_score(y_val_fold, model.predict(X_val_scaled_fold)))

        mean_acc = np.mean(accs)
        if mean_acc > best_acc:
            best_acc = mean_acc
            optimal_n_neighbors = n_neighbors
            best_model = model

    print("Found best KNN with optimal neighbors:",optimal_n_neighbors)

    return best_model

In [None]:
train = add_features(train, concerts, planned_concerts, subscriptions)

X = train.drop(columns=['label', 'account.id'])
y = train['label']
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=0,test_size=0.2)


def _ranking_scores(model, features):
    """Return continuous scores for AUROC instead of hard class labels.

    Uses ``decision_function`` when the estimator provides it and otherwise
    the second column of ``predict_proba`` (assumes a binary label whose
    positive class sorts last -- TODO confirm for other label encodings).
    """
    if hasattr(model, 'decision_function'):
        return model.decision_function(features)
    return model.predict_proba(features)[:, 1]


# Let's scale the inputs to help it converge more easily
scaler = StandardScaler()

kf = KFold(n_splits=10)
best_knn = get_best_knn(X_train,y_train)
models = [best_knn, SVC(kernel='linear', C=1), SVC(kernel='rbf', C=1), Perceptron()]
model_names = ['KNN', 'Linear SVC', 'RBF SVC', 'Perceptron']
model_aurocs = []

# Compare the candidates by mean cross-validated AUROC on the training split.
for index, model in enumerate(models):
    aurocs = []
    for train_indexes, val_indexes in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_indexes], X_train.iloc[val_indexes]
        y_train_fold, y_val_fold = y_train.iloc[train_indexes], y_train.iloc[val_indexes]

        # Fit the scaler on the fold's training rows only.
        X_train_scaled_fold = scaler.fit_transform(X_train_fold)
        X_val_scaled_fold = scaler.transform(X_val_fold)
        model.fit(X_train_scaled_fold, y_train_fold)

        # AUROC needs a ranking score; thresholded predictions collapse the curve.
        aurocs.append(roc_auc_score(y_val_fold, _ranking_scores(model, X_val_scaled_fold)))
    mean_auroc = np.mean(aurocs)
    print(model_names[index], " model AUROC: ", mean_auroc)
    model_aurocs.append(mean_auroc)

best_model_index = np.argmax(model_aurocs)
print("Best performance model is ", model_names[best_model_index])

best_model = models[best_model_index]

# After the loop the scaler and the models only hold the fit from the last
# fold, so refit both on the whole training split before evaluating.
X_train_scaled = scaler.fit_transform(X_train)
best_model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)
auroc_score = roc_auc_score(y_test, _ranking_scores(best_model, X_test_scaled))

print("Auroc Score:",auroc_score)

# Score the submission set with the selected model on identically scaled features.
test = pd.read_csv('../data/test.csv')
test = add_subscriptions_feature_to_test(test, subscriptions)
test = add_conductor_feature_to_test(test, subscriptions, concerts, planned_concerts)
# Same feature columns, in the same order, as the training matrix.
test_features = test[list(X.columns)]
display(test_features.head())
preds = best_model.predict(scaler.transform(test_features))
display(preds)
test['Predicted'] = preds
test = test.drop(columns=['num_subscriptions', 'watched_conductors'])
test.to_csv('../data/test_predictions.csv', index=False)


Unnamed: 0,account.id,label,num_subscriptions,watched_conductors
0,001i000000NuRo3,0,0.0,0.0
1,001i000000NuRxd,0,0.0,0.0
2,001i000000NuQGN,0,0.0,0.0
3,001i000000NuPfL,0,1.0,0.0
4,001i000000NuQkP,0,0.0,0.0


Found best KNN with optimal neighbors: 5
KNN  model AUROC:  0.8282301890735472
Linear SVC  model AUROC:  0.8902366665044763
RBF SVC  model AUROC:  0.8612718884223941
Perceptron  model AUROC:  0.6846007297270262
Best performance model is  Linear SVC
Auroc Score: 0.9159279873565588




Unnamed: 0,num_subscriptions,watched_conductors
0,6.0,1.0
1,1.0,2.0
2,0.0,0.0
3,0.0,0.0
4,6.0,1.0




array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# from lazypredict.Supervised import LazyClassifier
# from sklearn.datasets import load_breast_cancer
# from sklearn.model_selection import train_test_split

# train = add_subscriptions_feature(train, subscriptions)

# X = train[['num_subscriptions']]
# y = train['label']

# X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)

# clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None, classifiers=["Perceptron"])
# models,predictions = clf.fit(X_train, X_test, y_train, y_test)

# print(models)