In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

In [None]:
# Load the training and hold-out CSV files.
# Both paths are relative to the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
#check first five observations of the data
# A notebook cell only renders its *last* expression, so a bare
# `train_data.head(5)` followed by another expression is silently dropped.
# display() (the IPython built-in available in every notebook kernel)
# renders both previews.
display(train_data.head(5), test_data.head(5))

In [None]:
# Print dtype, non-null count and shape summaries for both frames
# (used to spot missing values).
for frame in (train_data, test_data):
    frame.info()

In [None]:
#train_data.describe()
#test_data.describe()

In [None]:
#EDA to look at the correlation between features in the data
# First nine columns are treated as the feature block (same slice the
# modelling cell uses for X).
a = train_data.iloc[:, :9]

# This cell runs before the label-encoding cell, so the categorical columns
# are still raw strings here. Recent pandas raises when corr() meets
# non-numeric data, so restrict the correlation to numeric/bool columns.
corr_matrix = a.select_dtypes(include=['number', 'bool']).corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, vmax=0.9, square=True, ax=ax)
ax.set_title('Correlation between numeric features');

In [None]:
#Feature encoding for categorical values in the dataset
from sklearn.preprocessing import LabelEncoder

# `number` stays a module-level LabelEncoder because a later cell calls it.
number = LabelEncoder()

# Fitting the encoder separately on train and on test lets the same category
# receive different integer codes in the two frames (e.g. when a label is
# missing from one file). Fit once per column on the labels of both frames
# and reuse that single mapping for train and test.
for column in ('gender', 'device_type'):
    train_labels = train_data[column].astype('str')
    test_labels = test_data[column].astype('str')
    number.fit(pd.concat([train_labels, test_labels], ignore_index=True))
    train_data[column] = number.transform(train_labels)
    test_data[column] = number.transform(test_labels)


In [None]:
# Build feature/target arrays, hold out 30% for validation and standardise.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Columns 0-8 are the features, column 9 is the target.
X = train_data.iloc[:, :9].values
y = train_data.iloc[:, 9].values
test = test_data.iloc[:, :].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)

# The scaler is fitted on the training split only and then applied to both
# splits, so no validation statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Same data with a trailing singleton axis: (n_samples, n_features, 1).
dim_x_train = np.expand_dims(X_train_scaled, axis=-1)
dim_x_test = np.expand_dims(X_test_scaled, axis=-1)

In [None]:
# Rebalance the (scaled) training split with SMOTE.
import imblearn.over_sampling as imb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier

oversampler = imb.SMOTE(random_state=3)

# Candidate SMOTE -> under-sample -> XGBoost pipelines, one per k_neighbors.
# SMOTE is only reachable through the `imb` alias in this notebook (the direct
# import is commented out), so the bare name raised NameError.
# NOTE(review): these pipelines are only constructed, never fitted or scored
# in the visible notebook. Also confirm the under-sampling ratio (0.2) is
# intended to be lower than the over-sampling ratio (0.3) before fitting them.
k_values = [1, 2, 3, 4, 5, 6, 7]
smote_pipelines = {}
for k in k_values:
    model = XGBClassifier()
    over = imb.SMOTE(sampling_strategy=0.3, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.2)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    smote_pipelines[k] = pipeline

# fit_sample() was removed from imbalanced-learn; fit_resample() is the
# supported name. From here on X_train / y_train hold the *scaled and
# oversampled* training data. This cell rebinds y_train, so re-run the split
# cell before running it a second time.
X_train, y_train = oversampler.fit_resample(X_train_scaled, y_train)

In [None]:
#SMOTE
# Size of the full label vector `y` and how many entries equal class 1.
n_positive = (y == 1).sum()
y.shape, n_positive

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values for a random-forest search.
# NOTE(review): the search cells in this notebook pass their own, smaller
# grids; confirm whether `random_grid` is still meant to be used.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by RandomForestClassifier (for a classifier it
# was an alias of 'sqrt'), so only currently valid options are listed.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Coarse hyper-parameter search for the random forest.
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBRegressor

# Base model to tune; seeded so repeated runs give the same forests.
rcf = RandomForestClassifier(random_state=42)

# Small search space: 3 depths x 3 forest sizes = 9 combinations.
rf_param_distributions = {'max_depth': [2, 4, 6],
                          'n_estimators': [50, 100, 200]}
n_candidates = (len(rf_param_distributions['max_depth'])
                * len(rf_param_distributions['n_estimators']))

# 3-fold cross validation on all available cores. n_iter is capped at the
# size of the search space: the default (10) is larger than the 9 available
# combinations and only triggers a warning.
rf_random = RandomizedSearchCV(rcf,
                               rf_param_distributions,
                               n_iter=n_candidates,
                               cv=3,
                               n_jobs=-1,
                               random_state=42,
                               verbose=1)
#Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the randomized search above.
rf_random.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for the random forest `rcf`:
# 1 x 4 x 2 x 3 x 3 x 4 = 288 combinations, each evaluated with 3-fold CV.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

# Instantiate the grid search on top of the existing base model; n_jobs=-1
# uses every available core.
grid_search = GridSearchCV(rcf, param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit on the training data, then display the winning combination.
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
#fitting the models
from sklearn.metrics import accuracy_score, f1_score, auc, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRFClassifier
from sklearn import metrics

lr = LogisticRegression()

# Only n_estimators and max_depth are kept: min_samples_split,
# min_samples_leaf and max_features are scikit-learn forest parameters that
# XGBoost does not use, so passing them had no effect on the model.
# NOTE: the name `xgb` is kept because later cells call `xgb.predict` /
# `xgb.classes_`; be aware that it shadows the `xgboost` module alias
# imported at the top of the notebook.
xgb = XGBClassifier(n_estimators=1000, max_depth=10)

# X_train holds the scaled, oversampled training data at this point.
xgb.fit(X_train, y_train)

# The model was trained on standardised features, so it must be evaluated on
# the standardised validation split, not on the raw X_test.
y_pred = xgb.predict(X_test_scaled)
y_score = xgb.predict_proba(X_test_scaled)[:, 1]  # P(class == 1)

# calculate accuracy, F1 and ROC AUC on the validation split
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.f1_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it scores, not thresholded labels.
print(metrics.roc_auc_score(y_test, y_score))

In [None]:
# Number of class-1 entries in the validation labels vs. in the predictions.
(y_test == 1).sum(), (y_pred == 1).sum()

In [None]:
from yellowbrick.classifier import ROCAUC

# Instantiate the visualizer with the classification model.
# The estimator argument was missing (a syntax error); `xgb` is the classifier
# built in the model-fitting cell.
classes = [0, 1]
visualizer = ROCAUC(xgb, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
# Score on the standardised validation split, matching the training features.
visualizer.score(X_test_scaled, y_test)
# show() is the current name of the deprecated poof().
g = visualizer.show()

In [None]:
#plotting the roc curve
from sklearn.metrics import accuracy_score, f1_score, auc, roc_curve, roc_auc_score

# roc_auc_score returns a single float, so it cannot be unpacked into
# (fpr, tpr, thresholds); roc_curve produces those arrays and auc integrates
# them. Class-1 probabilities give the full curve instead of the single
# operating point obtained from hard labels.
y_score = xgb.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thr = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label='ROC curve (AUC = %.3f)' % roc_auc)
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False positive rate', ylabel='True positive rate',
       title='ROC curve on the validation split')
ax.legend(loc='lower right')
plt.show()

roc_auc

In [None]:
#Predicting on the test set

# Start from the already-encoded test frame instead of re-reading the CSV and
# fitting fresh encoders on it, so the categorical codes are exactly the ones
# produced by the feature-encoding cell.
test_data2 = test_data.copy()

#add a new column to the test data and call it test observations
test_data2.insert(0, 'test_observations', range(1, 1 + len(test_data2)))

# Every column except the inserted id column is a model feature.
pred_test = test_data2.iloc[:, 1:]

# The scaler was fitted on a plain array, so pass plain values here as well.
ac = scaler.transform(pred_test.values)

# `xgb.classes_` is just the array of class labels; the predictions for the
# test rows come from predict().
predictions = xgb.predict(ac)

#Save the outcome variable to a dataframe
resultDf = pd.DataFrame(predictions)


In [None]:
#y_pred = pd.DataFrame(y_pred)

In [None]:
# Indices of the validation rows predicted as class 1.
np.nonzero(y_pred == 1)

In [None]:
# `test2` is never defined in this notebook; the frame built for prediction
# (with the `test_observations` id column) is `test_data2`.
test_data2.head(5)

In [None]:
# Build the submission table and round-trip it through result.csv.

# test_data3 is an alias of test_data2 (no copy is made).
test_data3 = test_data2
test_id = test_data3["test_observations"]

# One row per test observation: the id column first, then the outcome.
# NOTE(review): resultDf is assigned as-is; confirm it has exactly one column
# and one row per test observation before trusting the file.
sub = pd.DataFrame({'test_observations': test_id})
sub['target'] = resultDf

#Export to csv to get a report format for the outcome variable
sub.to_csv('result.csv', index=False)

# Read the file back to check what was actually written.
result_data = pd.read_csv('result.csv')

In [None]:
# Preview the first ten rows read back from result.csv.
result_data.head(10)