# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Datasets

In [2]:
train_df = pd.read_csv('train.csv',encoding = 'ISO-8859-1',low_memory=False)
test_df = test_df = pd.read_csv('test.csv')

# Training set transformation 

In [3]:
#Removing records with compliance as NaN
train_df = train_df.dropna(subset=['compliance'])

#train_df.info() - uncomment to understand more about data and values.

#Columns are selected on the basis of domain knowledge. Categorical variables with large no. of unique values are dropped.
columns_train = ['disposition','fine_amount','admin_fee','state_fee',
                  'late_fee','discount_amount','clean_up_cost',
                  'judgment_amount',
                  'compliance']

train_df = train_df[columns_train]

#Category set union with string None for any NaN values
convert_categorical = set(train_df['disposition'])|{'None'}

#Encoding categorical variable disposition
train_df['disposition']= (pd.Categorical(train_df['disposition'],
                     categories=convert_categorical).fillna('None').codes)

# Test set transformation

In [4]:
#Here ticket_id is kept as it is required in the result expected.
columns_test = ['ticket_id','disposition','fine_amount','admin_fee','state_fee',
                      'late_fee','discount_amount','clean_up_cost','judgment_amount']
    
test_df = test_df[columns_test]
test_df = test_df.set_index('ticket_id')

convert_categorical_test = set(test_df['disposition'])|{'None'}
test_df['disposition']= (pd.Categorical(test_df['disposition'],
                     categories=convert_categorical_test).fillna('None').codes)

# Preparing data

In [5]:
#Training set
X = train_df.iloc[:,:-1]

#Target 
y = train_df.iloc[:,-1]

#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Cross- Validation and Model Selection

In [6]:
#Performing 10-fold cross validation on four models to select the best. The scoring parameter will be "roc_auc".
#We are taking K = 10 as it provides a good trade-off of low computational cost and low bias 

Models = [GaussianNB(), DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier()]

cv = KFold(n_splits=10,shuffle=True,random_state=1)

for model in Models:
    
    scores = cross_val_score(model, X, y, scoring = 'roc_auc',cv = cv, n_jobs = -1)
    
    print(f"{type(model).__name__} : AUC = {np.mean(scores)}")

GaussianNB : AUC = 0.7769058555328032
DecisionTreeClassifier : AUC = 0.7946611059114735
RandomForestClassifier : AUC = 0.7947565540493577
GradientBoostingClassifier : AUC = 0.7947965715168546


# Grid Search for best parameter selection

In [7]:
gbc_grid = GradientBoostingClassifier()

parameters = {'n_estimators':[10,30,50],
              'learning_rate':[0.01,0.1,1],
              'max_depth':[3,4,5]}

grid_auc = GridSearchCV(gbc_grid, param_grid = parameters, scoring = 'roc_auc')
grid_auc.fit(X_train,y_train)

y_auc = grid_auc.decision_function(X_test)

print(f"Test Set AUC : {roc_auc_score(y_test, y_auc)}")
print(f"Best Parameters : {grid_auc.best_params_}")
print(f"Grid Best AUC Score : {grid_auc.best_score_}")

Test Set AUC : 0.7943714941109158
Best Parameters : {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 50}
Grid Best AUC Score : 0.795008793675031


# Final Model Training with given parameters 

In [8]:
gbc = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 4, n_estimators = 50)

gbc.fit(X_train, y_train)

#Result Prediction
y_predict = gbc.predict(test_df)

#Probability prediction
y_prob = gbc.predict_proba(test_df)

# Result

In [9]:
y_df = pd.DataFrame(y_prob, index = test_df.index)

compliance = y_df.iloc[:,1].rename('compliance').astype('float32')

print(compliance)

ticket_id
284932    0.049338
285362    0.015840
285361    0.059483
285338    0.049338
285346    0.059483
            ...   
376496    0.015840
376497    0.015840
376499    0.059483
376500    0.059483
369851    0.685503
Name: compliance, Length: 61001, dtype: float32
