In [0]:
import os
from google.colab import drive
# Mount Google Drive in the Colab runtime at /content/gdrive; the Kaggle API
# token (kaggle.json) is copied from there in the next cell.
drive.mount('/content/gdrive')

In [0]:
!pip install kaggle
!mkdir ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
# Download the TalkingData AdTracking Fraud Detection competition files.
# Presumably they land in the working directory: later cells open
# 'train.csv.zip' by relative path.
!kaggle competitions download  talkingdata-adtracking-fraud-detection

In [0]:
# Optional (disabled): create and switch to a dedicated working directory.
#if not os.path.exists("/TK_Model"):
#    os.makedirs("/TK_Model")
#os.chdir('/TK_Model')
# Show what is in the working directory after the download.
os.listdir('.')

In [0]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split,  cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.externals import joblib

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.wrappers.scikit_learn import KerasClassifier # keras wrapper for sklearn

import json
import zipfile

In [0]:
# Unpack the downloaded archive. Opening it once as a context manager makes
# sure the file handle is closed after extraction (the bare ZipFile(...) calls
# never closed it).
with zipfile.ZipFile('train.csv.zip') as archive:
    # The whole complete training file:
    archive.extractall('train.csv')
    # The whole complete train_sample file:
    # NOTE(review): this extracts the same 'train.csv.zip' a second time, into
    # a folder named 'train_sample.csv' — confirm whether a separate
    # train_sample archive was intended.
    archive.extractall('train_sample.csv')

In [0]:
# List the working directory again to check the extracted folders.
os.listdir('.')

In [0]:
# Load the sample training set with the pure-Python CSV engine.
# NOTE(review): train_sample.csv is read from inside the 'train.csv' extraction
# folder — confirm the archive really contains it at this path.
data_s = pd.read_csv('train.csv/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv',  engine='python')

In [0]:
# Peek at the rows with the latest attributed_time values.
data_s.sort_values(by='attributed_time', ascending = False).head()

In [0]:
# (rows, columns) of the loaded sample.
data_s.shape

In [0]:
# Report how many rows have a missing value in each column.
for column_name in data_s.columns:
    missing_rows = data_s[data_s[column_name].isnull()]
    print('{} contains {:,} NANs'.format(column_name, len(missing_rows)))

Only `attributed_time` contains NaNs; that column is not used as a feature, so the missing values need no imputation.

In [0]:
# attributed_time is not used as a feature, so discard it.
data_s = data_s.drop(columns=['attributed_time'])

In [0]:
# Preview the frame after removing attributed_time.
data_s.head()

In [0]:
# Convert click_time to datetime format.
# Parsing failures are raised here (the default) instead of being swallowed by
# errors='ignore', which would leave the column as strings and only fail later
# when the .dt accessor is used; errors='ignore' is also deprecated in pandas.
data_s['click_time'] = pd.to_datetime(data_s.click_time)

In [0]:
# Derive a 'session' feature coding the time of day of each click from its hour:
# 1 = Night, 2 = Morning, 3 = Afternoon, 4 = Evening
# pd.cut intervals are right-closed by default, so the bins are hours
# 0-6, 7-12, 13-18 and 19-23 respectively; the result is a categorical column.
data_s = data_s.assign(session = pd.cut(data_s.click_time.dt.hour,[-1,6,12,18,24],labels=[1, 2, 3,4 ]))
#data_s.session = data_s.session.astype(np.int64)
#data_s = data_s.drop('click_time', 1)

In [0]:
# Preview the frame with the new session column.
data_s.head()

In [0]:
# Keep only the six model features plus the target; this also drops click_time.
data_s = data_s[['ip', 'app', 'device', 'os', 'channel', 'session','is_attributed']]

In [0]:
from collections import defaultdict
# One LabelEncoder per column, created lazily the first time a column name is looked up.
d = defaultdict(LabelEncoder)
# Encoding the variable: fit each column's encoder and keep the integer-coded copy in data_e.
# NOTE(review): the cells visible after this one keep working from data_s, not
# data_e — confirm whether the encoded frame was meant to feed the model.
data_e = data_s.apply(lambda x: d[x.name].fit_transform(x))

# Inverse the encoded
#fit.apply(lambda x: d[x.name].inverse_transform(x))
# Using the dictionary to label future data
#data_s.apply(lambda x: d[x.name].transform(x))

In [0]:
# Preview the label-encoded copy.
data_e.head()

In [0]:
# Turn the current row index into an explicit first column named click_id.
# Not idempotent: re-running the cell adds another index column.
data_s = data_s.reset_index(drop = False).rename(columns={'index':'click_id'})

In [0]:
# Distinct values present in the target column.
data_s.is_attributed.unique()

In [0]:
# Column dtypes and non-null counts.
data_s.info()

In [0]:
# Cast the categorical session labels to plain integers so the column is
# numeric when the frame is converted to an array.
# - The np.int alias was removed from NumPy (1.24); the builtin int is the
#   drop-in replacement.
# - Assign the column directly: on pandas >= 2.0, writing through
#   .loc[:, 'session'] sets values in place and can keep the categorical dtype.
data_s['session'] = data_s.session.astype(int)

In [0]:
# Peek at the rows with the lowest ip values.
data_s.sort_values(by='ip').head()

# Artificial Neural Network

### Split data into target and features

In [0]:
# Target vector: the is_attributed label of every click.
target = data_s.is_attributed.values
# Feature matrix: every remaining column (click_id first, then the six model
# inputs). `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally in DataFrame.drop.
features = data_s.drop('is_attributed', axis = 1).values
print ('Data original %d, target: %d, features: %d' % (data_s.shape[0], target.shape[0], features.shape[0]))

### Split features into training, validation, and testing sets

In [0]:
# Hold out 25% of the rows for testing, then 20% of the remainder for
# validation; both splits use random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=1)

# Column 0 carries click_id (kept aside as row names); columns 1-6 are the
# six model inputs.
X_train_names, X_train = X_train[:, 0], X_train[:, 1:7]
X_test_names, X_test = X_test[:, 0], X_test[:, 1:7]
X_val_names, X_val = X_val[:, 0], X_val[:, 1:7]

### Standardizing the features

In [0]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to all three splits.
sc = StandardScaler().fit(X_train)
X_train, X_val, X_test = (sc.transform(split) for split in (X_train, X_val, X_test))

In [0]:
# Display the standardized training matrix.
X_train

### Building the initial model 

In [0]:
def classifier_model(activator, optimizer, initializer, input_dim = 6, units = 10):
    """Build and compile a small fully connected binary classifier.

    The network stacks two hidden Dense layers of ``units`` neurons each and a
    single sigmoid output unit. It is compiled with binary cross-entropy loss
    and the accuracy metric.

    Parameters
    ----------
    activator : activation used by both hidden layers (e.g. 'relu').
    optimizer : optimizer handed to ``compile`` (e.g. 'adam').
    initializer : kernel initializer used by every layer (e.g. 'uniform').
    input_dim : number of input features; defaults to 6, the value that was
        previously hard-coded.
    units : width of each hidden layer; defaults to 10, the previous value.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    classifier = Sequential()
    # Only the first layer needs to be told the input width.
    classifier.add(Dense(units = units, kernel_initializer = initializer, activation = activator, input_dim = input_dim))
    classifier.add(Dense(units = units, kernel_initializer = initializer, activation = activator))
    # Single sigmoid unit: the output is one probability per row.
    classifier.add(Dense(units = 1,  kernel_initializer = initializer, activation = 'sigmoid'))
    classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

### Fitting the model with the training set 

In [0]:
# Baseline configuration: relu hidden layers, adam optimizer, uniform initializer.
activator, optimizer, initializer = 'relu', 'adam', 'uniform'
classifier = classifier_model(activator=activator, optimizer=optimizer, initializer=initializer)
# Train for 10 epochs in mini-batches of 10 rows.
classifier.fit(X_train, y_train, batch_size=10, epochs=10)

### Cross-validation on the training set

In [0]:
# 10-fold cross-validation of the baseline configuration (relu, as in the raw
# model above).
activator, optimizer, initializer = 'relu', 'adam', 'uniform'
cv = 10
classifier_cv = KerasClassifier(
    build_fn=classifier_model,
    activator=activator,
    optimizer=optimizer,
    initializer=initializer,
    batch_size=10,
    epochs=10,
    verbose=0,
)
accuracies = cross_val_score(estimator=classifier_cv, X=X_train, y=y_train, cv=cv)

# Note: despite its name, `variance` holds the standard deviation of the fold scores.
mean, variance = accuracies.mean(), accuracies.std()

In [0]:
# Mean cv accuracy, in percent, with the spread across folds
# (`variance` is the standard deviation returned by accuracies.std()).
print('Mean cv accuracy (relu) = {:.4f}% +/- {:.4f}'.format(mean *100, variance *100) )

### Validation 

In [0]:
# Score the baseline model on the held-out validation split.
loss, accuracy = classifier.evaluate(X_val, y_val, batch_size = 10, verbose = 0)
print("Validation set accuracy = {:.4f}%, Loss = {:.4f}".format(accuracy* 100, loss))

In [0]:
# Score the same model on the training split for comparison.
# Note: this overwrites the `loss` and `accuracy` values from the validation cell.
loss, accuracy = classifier.evaluate(X_train, y_train , batch_size = 128, verbose = 0)
print(round(accuracy*100,2))

### Optimization

Using GridSearch

In [0]:
# Wrap the model builder so scikit-learn can treat it as an estimator.
classifier_op = KerasClassifier(build_fn = classifier_model)

# Search space: 2 batch sizes x 2 epoch counts x 2 optimizers x 2 activations
# x 1 initializer = 16 combinations.
parameters = {'batch_size'  : [10, 30],
              'epochs'      : [10, 20],
              'optimizer'   : ['adam', 'rmsprop'],
              'activator'   : ['relu', 'sigmoid'],
              'initializer' : ['uniform']}

# Every combination is scored by accuracy with 10-fold cross-validation, so
# this cell trains the network many times and can take a long while.
grid_search = GridSearchCV(estimator = classifier_op,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10)

grid_search = grid_search.fit(X_train, y_train)

# Best combination found and its mean cross-validated accuracy.
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

In [0]:
# Best mean cross-validated accuracy found by the grid search.
best_accuracy

In [0]:
# Hyper-parameter combination that achieved the best score.
best_parameters

# OPTIMIZED MODEL 

In [0]:
# Train the final model.
# NOTE(review): the settings are hard-coded here rather than read from
# best_parameters — confirm they match the grid-search result.
activator   = 'relu'
optimizer   = 'adam'
initializer = 'uniform'
epoch = 10 
classifier_opt = classifier_model(activator, optimizer, initializer)
classifier_opt.fit(X_train, y_train, batch_size = 10, epochs = epoch, verbose = 0)

In [0]:
# Predicting on the test set 
# The model ends in a single sigmoid unit, so each prediction is one probability.
y_pred = classifier_opt.predict(X_test)
y_pred 

In [0]:
from sklearn.metrics import f1_score
# Weighted F1 on the held-out split; probabilities are rounded to 0/1 labels first.
f1_score(y_test, y_pred.round(), average='weighted')

In [0]:
# Predicted probabilities rounded to hard 0/1 labels.
y_pred.round()

In [0]:
# The network has a single sigmoid output, so y_pred holds one probability per
# row. argmax over that one-element axis is always 0, which labelled every
# click as class 0; threshold at 0.5 instead to get the predicted class.
y_classes = (y_pred > 0.5).astype(int).ravel()
# Side-by-side table of predicted label, true label and the row's click_id.
y_a = pd.DataFrame({'predicted': y_classes,
                    'actual': y_test,
                    'click_id': X_test_names})
y_a.head()

In [0]:
# Number of mismatches within the test set
# dif is 1 where the predicted label differs from the actual one, 0 otherwise.
y_a['dif'] = np.where(y_a.predicted == y_a.actual, 0, 1)
# Count the misclassified rows per true class.
y_a[y_a['dif'] == 1].actual.value_counts().sort_index()

In [0]:
# Persist the trained network and the fitted scaler so the prediction section
# can reload them.
classifier_opt.save('classifier.hdf5')
joblib.dump(sc, 'scaler.pkl') 

------------------
PREDICTION ON THE TESTING DATASET

In [0]:
from keras.models import load_model
# Reload the saved network; this rebinds `classifier`, replacing the baseline model.
classifier = load_model('classifier.hdf5')
from sklearn.preprocessing import StandardScaler
# Reload the scaler that was fitted on the training split.
sc = joblib.load('scaler.pkl')

In [0]:
# Load the competition test set.
# NOTE(review): no visible cell extracts test.csv — confirm the file exists at this path.
test = pd.read_csv('test.csv')

In [0]:
# Preview the raw test rows.
test.head()

In [0]:
# Convert click_time to datetime format.
# Parsing failures are raised here (the default) instead of being swallowed by
# errors='ignore', which would leave the column as strings and only fail later
# when the .dt accessor is used; errors='ignore' is also deprecated in pandas.
test['click_time'] = pd.to_datetime(test.click_time)

In [0]:
# Report how many rows have a missing value in each test column.
for column_name in test.columns:
    missing_rows = test[test[column_name].isnull()]
    print('{} contains {:,} NANs'.format(column_name, len(missing_rows)))

In [0]:
# Derive the 'session' feature coding the time of day of each click from its hour:
# 1 = Night, 2 = Morning, 3 = Afternoon, 4 = Evening
test = test.assign(session=pd.cut(test.click_time.dt.hour,[-1,6,12,18,24],labels=[1, 2, 3,4 ]))
# click_time is no longer needed. `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in DataFrame.drop.
test = test.drop('click_time', axis = 1)

In [0]:
# Columns remaining after dropping click_time.
test.columns

In [0]:
# Keep the same six feature columns, in the same order, that the training frame used.
test = test[['ip', 'app', 'device', 'os', 'channel', 'session']]

In [0]:
# Using the dictionary to label future data
# NOTE(review): the encoded frame is only displayed, not assigned, so `test`
# itself is left unchanged by this cell.
# NOTE(review): LabelEncoder.transform presumably raises on values it was not
# fitted on — verify against the test data.
test.apply(lambda x: d[x.name].transform(x))

In [0]:
# Rebuild click_id from the row index (the previous cell selecting columns
# kept only the six features).
test = test.reset_index(drop = False)
test = test.rename(columns={'index':'click_id'})
test.head()

In [0]:
# Number of distinct channels in the test set, then the row count per channel.
print(len(test.channel.unique()))
test.channel.value_counts().sort_index()

In [0]:
# Column dtypes of the prepared test frame.
test.dtypes

In [0]:
# Keep the ids aside for the submission file and leave only the six features.
test_x_names = test.click_id
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally in DataFrame.drop.
test_x = test.drop('click_id', axis = 1)


In [0]:
# Apply the reloaded scaler (fitted on the training split) to the test features.
X_2016 = sc.transform(test_x)

In [0]:
# Predicting on the test set 
predictions = classifier.predict(X_2016)
predictions

In [0]:
# Hard 0/1 labels for the test rows. Sequential.predict_classes was removed
# from Keras; thresholding the sigmoid output at 0.5 is its documented
# replacement and yields the same int32 labels.
pred_clas = (classifier.predict(X_2016) > 0.5).astype('int32')

In [0]:
# The model has a single sigmoid output, so argmax over axis 1 is always 0.
# Threshold the probabilities at 0.5 to obtain the 0/1 labels (1-D array).
labels_ = (predictions > 0.5).astype(int).ravel()
labels_

In [0]:
# Submission table: click_id next to the predicted probability, which lands in
# a column labelled 0 and is renamed to is_attributed.
id_frame = pd.DataFrame(test_x_names)
probability_frame = pd.DataFrame(predictions)
a = pd.concat([id_frame, probability_frame], axis=1)
a = a.rename(columns={0: 'is_attributed'})

In [0]:
# Write the submission table with a header row and without the index.
a.to_csv('First.csv', header = True, index = False)

In [0]:
# Write the same table again under a second file name.
a.to_csv('primera.csv', header = True, index = False)

In [0]:
# Keep a second reference to the predictions; this is the same array object, not a copy.
predictions1 = predictions