In [40]:
import sys
sys.path.append('../scripts')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron

from sklearn.metrics import ConfusionMatrixDisplay

import data_cleaning as cd
import myfeatures as ft
import mytrain as tr
import mypredictions as pr

In [41]:
### Import data sources ###

# Training data
train = pd.read_csv('../data/train.csv')

#Previously purchased subscriptions by account
subscriptions = pd.read_csv('../data/subscriptions.csv')
#display(subscriptions.head())

# Location info for each patron and donation history
accounts = pd.read_csv('../data/account.csv')
#display(accounts.head())

# Previous concerts by season
concerts = pd.read_csv('../data/concerts.csv')
#display(concerts.head())

# List of planned concert sets for the 2014-15 season
planned_concerts = pd.read_csv('../data/concerts_2014-15.csv')
#display(planned_concerts.head())

# Previously purchased tickets by account
tickets = pd.read_csv('../data/tickets_all.csv')
#display(tickets.head())

# Location and demographic information for zipcodes
zipcodes = pd.read_csv('../data/zipcodes.csv')
#display(zipcodes.head())

# Final test data
final_test = pd.read_csv('../data/test.csv')

In [42]:
print("Training set:")
# check how many account_id from training data are present in the other data sources
print("Subscriptions account ids: ")
display(pd.Series(train['account.id'].unique()).isin(subscriptions['account.id']).value_counts())

print("Accounts account ids: ")
display(pd.Series(train['account.id'].unique()).isin(accounts['account.id']).value_counts())

print("Tickets account ids: ")

tickets = tickets.loc[~tickets['price.level'].isin(['Adult', 'Youth', 'GA'])]
display(pd.Series(train['account.id'].unique()).isin(tickets['account.id']).value_counts())

print("\nTest set:")
# check how many account_id from test data are present in the other data sources
print("Subscriptions account ids: ")
display(pd.Series(final_test['ID'].unique()).isin(subscriptions['account.id']).value_counts())

print("Accounts account ids: ")
display(pd.Series(final_test['ID'].unique()).isin(accounts['account.id']).value_counts())

print("Tickets account ids: ")
display(pd.Series(final_test['ID'].unique()).isin(tickets['account.id']).value_counts())



Training set:
Subscriptions account ids: 


False    4736
True     2205
Name: count, dtype: int64

Accounts account ids: 


True    6941
Name: count, dtype: int64

Tickets account ids: 


False    6225
True      716
Name: count, dtype: int64


Test set:
Subscriptions account ids: 


False    2032
True      943
Name: count, dtype: int64

Accounts account ids: 


True    2975
Name: count, dtype: int64

Tickets account ids: 


False    2686
True      289
Name: count, dtype: int64

In [43]:
subscriptions, accounts, concerts, planned_concerts, tickets, zipcodes = cd.clean_data(subscriptions, accounts, concerts, planned_concerts, tickets, zipcodes)

Missing subscriptions data:  account.id              0
season                  0
package                 6
no.seats                0
location                6
section              4543
price.level          3534
subscription_tier       0
multiple.subs           0
dtype: int64 out of 28627 

Dropping rows with missing data...
Initial length of subscriptions:  28627
Length of subscriptions after cleaning:  24083
Missing accounts data:  account.id                     0
shipping.zip.code          19569
billing.zip.code            2955
shipping.city              19561
billing.city                2218
relationship               19172
amount.donated.2013            0
amount.donated.lifetime        0
no.donations.lifetime          0
first.donated              14298
dtype: int64 out of 19833 

Dropping rows with missing data...
Initial length of accounts:  19833
Length of accounts after cleaning:  1
Missing concerts data:  season          0
concert.name    0
set             0
who             0
w

In [44]:
def train_model():
    df_train = ft.build_features(train, accounts, subscriptions, tickets)

    X_train = df_train.drop(columns=['account.id_x', 'account.id_y', 'label', 'ID'])
    y_train = df_train['label']

    #X_train.dropna(inplace=True)
    print('X Train:')
    display(X_train.head())
    print('Training set size:', X_train.shape)

    model = tr.train_randomforest(X_train, y_train)
    preds = model.predict(X_train)

    # Get training score
    auroc_score = roc_auc_score(y_train, preds)

    print("Training Auroc Score:",auroc_score)

    return model

In [45]:
def test_model(model):
    # Predict on final test data
    df_test = ft.build_features(final_test, accounts, subscriptions, tickets)
    X_test = df_test.drop(columns=['account.id', 'ID'])

    print('Testing set size:', X_test.shape)

    preds = model.predict(X_test)

    ids = final_test['ID']
    pd.DataFrame({'ID': ids, 'Predicted': preds}).to_csv('../data/test_predictions.csv', index=False)

In [46]:
best_model = train_model()
test_model(model=best_model)

X Train:


Unnamed: 0,num_subscriptions,avg_subscription_price_level,avg_subscription_tier,num_tickets,num_tickets_2013,avg_ticket_price_level,num_seats,num_seats_2013,amount.donated.2013,amount.donated.lifetime,no.donations.lifetime
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


Training set size: (6941, 11)


'Training Random Forest...'

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values