In [4]:
import pandas as pd
import numpy as np
from numpy import isnan

import warnings
from collections import defaultdict
from dateutil.parser import parse as dateparser

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from drivendata_validator import DrivenDataValidator

warnings.filterwarnings("ignore")

In [5]:
# if earlier than 1999 year (there is 1900 and 1899) return -1, else return number 
# of days since 1999-1-1
def transform_date(date):
    if pd.isnull(date):
        return date
    else:
        try:
            pivot = dateparser('1999-1-1', dayfirst=False)
            current = dateparser(date, dayfirst=False)
            if current.year < 1999:
                return -1
            else:
                delta = current - pivot
                return delta.days
        except Exception as inst:
            print 'Exception: ' + str(date)
            print type(inst)
            print inst.args

In [6]:
# exporting training data
path = '/Users/tgaponov/Desktop/cisco/kasgap/'
train_data = path + 'data/challenge_set.tsv'

train_data = pd.read_csv(train_data, delimiter='\t')

#renaming columns in training data
old = list(train_data)
new = [x.split('.')[1] for x in old]
mapper = dict(zip(old, new))

train_data.rename(columns=mapper, inplace=True)

In [7]:
# exporting testing data
test_data = path + 'data/scoring_set.tsv'
test_data = pd.read_csv(test_data, delimiter='\t')

#renaming columns in testing data
old = list(test_data)
new = [x.split('.')[1] for x in old]
mapper = dict(zip(old, new))

test_data.rename(columns=mapper, inplace=True)

In [8]:
# a column should have 60 % and more not null entries
thresh = int(len(train_data)*0.6)

In [9]:
train_data = train_data.loc[:, (train_data != train_data.iloc[0]).any()] 
train_data = train_data.dropna(axis=1, how='all')
train_data = train_data.dropna(axis=1, thresh=thresh)

182

In [10]:
# date columns in training set
dates_columns = [x for x in list(train_data) if 'date' in x]
train_data[dates_columns] = train_data[dates_columns].applymap(transform_date)
# getting all dates columns
#dates = list(data[dates_columns].stack().reset_index()[0])
#dates = [dateparser(x) for x in dates]
train_data['contract_line_duration'] = \
                        train_data['contract_line_end_date'] - \
                        train_data['contract_line_start_date']
train_data['warranty_contract_line_duration'] = \
                        train_data['warranty_contract_line_end_date'] - \
                        train_data['warranty_contract_line_start_date']

In [11]:
# date columns in testing set
test_data[dates_columns] = test_data[dates_columns].applymap(transform_date)

test_data['contract_line_duration'] = \
                        test_data['contract_line_end_date'] - \
                        test_data['contract_line_start_date']
test_data['warranty_contract_line_duration'] = \
                        test_data['warranty_contract_line_end_date'] - \
                        test_data['warranty_contract_line_start_date']

In [12]:
features_list = []
features_list += ['contract_line_duration', 'warranty_contract_line_duration']

In [13]:
# yes - no columns
yorn_columns = [x for x in list(train_data) if 'yorn' in x]

# missing values
train_data[yorn_columns] = train_data[yorn_columns].fillna('?')
test_data[yorn_columns] = test_data[yorn_columns].fillna('?')

yorn_encoder = LabelEncoder()
yorn_encoder.fit(["Y", "N", "=", '?'])

test_data[yorn_columns] = test_data[yorn_columns] \
                        .apply(lambda x: yorn_encoder.transform(x))
    
train_data[yorn_columns] = train_data[yorn_columns] \
                        .apply(lambda x: yorn_encoder.transform(x))
    
features_list += yorn_columns

In [14]:
# name columns
name_columns = [x for x in list(train_data) if 'name' in x]

train_data[name_columns] = train_data[name_columns].fillna('?')
test_data[name_columns] = test_data[name_columns].fillna('?')

# initialize encoders
name_dict = defaultdict(LabelEncoder)

for c in name_columns:
    name_dict[c].fit(train_data[c].append(test_data[c], ignore_index=True))

test_data[name_columns] = test_data[name_columns] \
                        .apply(lambda x: name_dict[x.name].transform(x))
    
train_data[name_columns] = train_data[name_columns] \
                        .apply(lambda x: name_dict[x.name].transform(x))
    
features_list += name_columns

In [15]:
# numerical columns
num_columns = ['PRODUCT_TRANSACTION_TYPE',
'CONTRACT_LINE_DURATION_IN_DAYS',
'SERVICE_CONTRACT_DISCOUNT_PERCENTAGE',
'CONTRACT_LINE_REACTION_TIME_CODE',
'SALES_HIERARCHY_LEVEL',
'SERVICE_SALES_NODE_BASE_SALES_HIERARCHY_LEVEL',
'SERVICE_FEE_AMOUNT',
'MAPPED_SERVICE_LIST_PRICE',
'SERVICE_PRODUCT_BASE_SERVICE_FEE_AMOUNT',
'SERVICE_PRODUCT_BASE_MAPPED_SERVICE_LIST_PRICE',
'CONTRACT_LINE_NET_USD_AMOUNT',
'PRODUCT_NET_PRICE',
'SERVICE_PARTNER_INSTALLED_BASE_PARTNER_RENEWAL_RATE',
'SERVICE_SALES_NODE_INSTALLED_BASE_SALES_NODE_RENEWAL_RATE',
'PRODUCT_RENEWAL_RATE',
'PARTNER_RENEWAL_RATE',
'CUSTOMER_RENEWAL_RATE',
'SALES_NODE_RENEWAL_RATE']

In [16]:
num_columns = [x.lower() for x in num_columns]
num_columns = [x for x in num_columns if x in list(train_data)]
features_list += num_columns

In [17]:
# type columns
type_columns = [x for x in list(train_data) if 'type' in x]

train_data[type_columns] = train_data[type_columns].fillna('?')
test_data[type_columns] = test_data[type_columns].fillna('?')

# initialize encoders
type_dict = defaultdict(LabelEncoder)

for c in type_columns:
    type_dict[c].fit(train_data[c].append(test_data[c], ignore_index=True))

test_data[type_columns] = test_data[type_columns] \
                        .apply(lambda x: type_dict[x.name].transform(x))
    
train_data[type_columns] = train_data[type_columns] \
                        .apply(lambda x: type_dict[x.name].transform(x))
    
features_list += type_columns

In [18]:
# code columns
code_columns = [x for x in list(train_data) if 'code' in x]

train_data[code_columns] = train_data[code_columns].fillna('?')
test_data[code_columns] = test_data[code_columns].fillna('?')

# initialize encoders
code_dict = defaultdict(LabelEncoder)

for c in code_columns:
    code_dict[c].fit(train_data[c].append(test_data[c], ignore_index=True))

test_data[code_columns] = test_data[code_columns] \
                        .apply(lambda x: code_dict[x.name].transform(x))
    
train_data[code_columns] = train_data[code_columns] \
                        .apply(lambda x: code_dict[x.name].transform(x))
    
features_list += code_columns

In [19]:
remaining_features = [x for x in list(train_data) if x not in features_list]

In [40]:
idxs = train_data.groupby(by='instance_id', as_index=True)['contract_line_end_date'].idxmax()
train_data_subset = train_data.loc[idxs]

In [54]:
features_list = list(set(features_list))
final_train_dataset = train_data_subset[features_list]
#final_train_dataset = final_train_dataset[(final_train_dataset['renewed_yorn'] != 1)]

In [55]:
clf = RandomForestClassifier(random_state=42)

X = final_train_dataset.loc[:, final_train_dataset.columns != 'renewed_yorn']
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_ = imp.fit_transform(X)

y = final_train_dataset['renewed_yorn']

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.30, random_state=42)

clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [56]:
y_pred = clf.predict_proba(X_test)

In [57]:
log_loss(y_test, y_pred)

0.15501029039180236

In [36]:
# making the final predictions
final_test_dataset = test_data[features_list]
final_test_dataset = final_test_dataset.loc[:, final_test_dataset.columns != 'renewed_yorn']
X = imp.transform(final_test_dataset)
y_pred = clf.predict_proba(X)

In [39]:
y_pred = [x[2] for x in y_pred]

In [40]:
key = test_data['innovation_challenge_key']
renewal = pd.Series(y_pred)

series = [key, renewal]
cols = ['INNOVATION_CHALLENGE_KEY', 'RENEWAL_PROBABLIITY']
submission = pd.concat(series, axis=1)
submission.columns = cols

submission.sort_values(by='INNOVATION_CHALLENGE_KEY', inplace=True)

In [41]:
submission.to_csv('sub_simple.csv', index=False)

In [42]:
v = DrivenDataValidator()
v.is_valid(path+'data/submissionFormat03092018.csv', 'sub_simple.csv')

True