In [5]:
# dataframe and math
import pandas as pd
import numpy as np

# sklearn
from sklearn.preprocessing import LabelEncoder

# random module
import random

# visuals
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Input CSV locations, keyed by dataset name.
# NOTE(review): the 'test' entry points inside the `train.csv/` directory —
# confirm that this is the intended location of test.csv.
PATHS = dict(
    train='../input/project2/train.csv/train.csv',
    test='../input/project2/train.csv/test.csv',
    clean_train="../input/p2cleanfeatures/clean_train.csv",
)

DIVISIÓN DE DATASETS

In [7]:
# Load the pre-cleaned training CSV; its first column becomes the row index.
clean_train_path = PATHS['clean_train']
clean_df = pd.read_csv(clean_train_path, index_col=0)

In [8]:
# Print the column names, dtypes and non-null counts of the loaded frame.
clean_df.info()

In [9]:
# Columns that are dropped from clean_df further down; the constant's name
# marks them as high-cardinality fields.
HIGH_CARDINALITY = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'OsBuildLab']

In [10]:
# Remove every column named in HIGH_CARDINALITY.
clean_df = clean_df.drop(columns=HIGH_CARDINALITY)

In [11]:
# Binary battery flag: 0 where the charge count is exactly 0, 1 for any other
# value (missing values included, because NaN != 0 evaluates to True).
# A vectorised comparison replaces the Python-level loop over every row;
# the result stays a plain list so later `pd.Series(has_battery)` calls work.
has_battery = (
    clean_df['Census_InternalBatteryNumberOfCharges']
    .ne(0)
    .astype(int)
    .tolist()
)

In [12]:
# Attach the flag as a new column. The raw array is assigned (not the Series)
# so values are placed by position rather than aligned on the index.
has_battery_series = pd.Series(has_battery)
clean_df['HasBatteryCharges'] = has_battery_series.to_numpy()

In [13]:
# The raw charge count is replaced by HasBatteryCharges, so drop it.
clean_df = clean_df.drop(columns=['Census_InternalBatteryNumberOfCharges'])

In [14]:
# Columns whose dtype is `object`; these get label-encoded below.
is_object_column = clean_df.dtypes == 'object'
TO_CATEGORY = clean_df.columns[is_object_column]

# Additional columns (numeric, going by the constant's name) that are
# label-encoded as well.
TO_CATEGORY_NUMERIC = [
    'Census_IsSecureBootEnabled', 'Census_IsTouchEnabled',
    'Wdft_IsGamer', 'Wdft_RegionIdentifier',
]

In [15]:
# Replace each listed column by integer codes, one column at a time.
# The single encoder is re-fit for every column, so once this cell finishes
# `le` only remembers the classes of the last column it processed.
le = LabelEncoder()

for column_group in (TO_CATEGORY, TO_CATEGORY_NUMERIC):
    clean_df[column_group] = clean_df[column_group].apply(le.fit_transform)

In [16]:
# Preview the first rows after label encoding.
clean_df.head()

In [17]:
# Re-inspect dtypes and non-null counts now that the columns are encoded.
clean_df.info()

X & Y

In [18]:
# Prediction target: the HasDetections column.
target = clean_df['HasDetections']

In [19]:
# Model inputs: every column except the target.
features = clean_df.drop(columns='HasDetections')

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Fraction of the rows kept as the working sample (passed as `test_size` below).
sample_ratio = 0.25

In [22]:
# Keep only the held-out side of a random split as the working sample
# (`sample_ratio` of the rows); the other side is discarded.
# random_state pins the draw so the same rows are selected on every run.
_, X_sample, _, y_sample = train_test_split(
    features,
    target,
    test_size=sample_ratio,
    random_state=42,
)

In [23]:
# Report the sample size as a percentage of the full cleaned dataset.
sample_pct = len(X_sample) / len(clean_df) * 100
print("Se trabajará con una muestra del %.0f%% de los datos originales" % sample_pct)

In [24]:
## Check the class balance of the sampled target
y_sample.value_counts()

SCALING

In [25]:
# Remember the feature names; the scaler below returns a bare array without them.
df_columns = X_sample.columns

In [26]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [27]:
# Min-max scaler instance.
# NOTE(review): not referenced again in the visible part of the notebook.
mm_scaler = MinMaxScaler()

In [28]:
# Standard (zero-mean, unit-variance) scaler used to scale the features.
s_scaler = StandardScaler()

In [35]:
# Standardise every feature of the sample and rebuild a DataFrame.
# fit_transform returns a bare ndarray, so the column names AND the original
# row index are restored explicitly; without `index=` the frame would get a
# fresh 0..n-1 index and no longer share row labels with y_sample.
X_scaled = pd.DataFrame(
    s_scaler.fit_transform(X_sample),
    columns=df_columns,
    index=X_sample.index,
)

MODELS

In [30]:
## import for evaluations
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, precision_recall_curve
from scikitplot.metrics import plot_roc

In [31]:
## function for plotting confusion matrix in a more comprehensive format
def plot_confusion_matrix(matrix, title):
    """Draw an annotated heatmap of a binary (2x2) confusion matrix.

    Every cell is annotated with a group name, its raw count and its share of
    the grand total. Group names are assigned in row-major order: true
    negative, false positive, false negative, true positive.

    Args:
        matrix: 2x2 array-like of counts (nested lists are accepted).
        title: Model name inserted into the figure title.

    Raises:
        ValueError: If ``matrix`` is not 2x2.
    """
    # Accept any array-like and fail early with a clear message: the group
    # names and tick labels below only make sense for a 2x2 matrix.
    matrix = np.asarray(matrix)
    if matrix.shape != (2, 2):
        raise ValueError(
            "plot_confusion_matrix expects a 2x2 matrix, got shape {}".format(matrix.shape)
        )

    counts = matrix.flatten()
    total = np.sum(matrix)
    # Guard the all-zero case: dividing by a zero total would produce NaN labels.
    shares = counts / total if total else np.zeros(counts.shape)

    group_names = ['Real Negativo','Falso Positivo','Falso Negativo','Real Positivo']
    group_counts = ["{0:0.0f}".format(value) for value in counts]
    group_percentages = ["{0:.2%}".format(value) for value in shares]

    # One three-line label per cell: name, count, percentage.
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    # fmt='' because the annotations are ready-made strings, not numbers.
    ax = sns.heatmap(matrix, annot=labels, fmt='', cmap='BuPu')

    ax.set_title('Matriz de Confusión de {}\n'.format(title))

    ax.xaxis.set_ticklabels(['No Detections','Detections'])
    ax.yaxis.set_ticklabels(['No Detections','Detections'])
    ax.set_xlabel('Valores Predichos')
    ax.set_ylabel('Valores Correctos ')

    plt.show()

In [32]:
# Fraction of the sample held out as the test set.
test_ratio = 0.3

In [36]:
# Split first, then fit the scaler on the training rows only, so the held-out
# rows contribute nothing to the scaling statistics (no preprocessing leakage).
# random_state pins the split so train/test membership is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample,
    y_sample,
    test_size=test_ratio,
    random_state=42,
)
X_train = pd.DataFrame(
    s_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
# The test rows are transformed with the statistics learned from the training rows.
X_test = pd.DataFrame(
    s_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [37]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [38]:
# Wrap both partitions, with their labels, in XGBoost DMatrix objects.
dtrain, dtest = (
    xgb.DMatrix(part_X, label=part_y, enable_categorical=True)
    for part_X, part_y in ((X_train, y_train), (X_test, y_test))
)

In [39]:
# Parameters that we are going to tune.
_tuned_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': 0.3,
    'subsample': 1,
    'colsample_bytree': 1,
}

# Other parameters
_fixed_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

# Single dict handed to XGBoost; key order is tuned first, then fixed.
params = {**_tuned_params, **_fixed_params}

# Upper bound on boosting rounds passed to xgb.cv / xgb.train.
num_boost_round = 999

In [26]:
# Candidate values for the two tree-shape parameters explored by the grid search.
depth_candidates = range(9, 12)
child_weight_candidates = range(5, 8)

# Every (max_depth, min_child_weight) pair, with depth varying slowest.
gridsearch_params = [
    (depth, weight)
    for depth in depth_candidates
    for weight in child_weight_candidates
]

In [27]:
# Grid search over (max_depth, min_child_weight) using 5-fold cross-validation;
# the pair with the highest mean test AUC is kept in `best_params`.
max_auc = float(0)
best_params = None

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(max_depth, min_child_weight))

    # The shared `params` dict is updated in place for each candidate.
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'auc'},
        early_stopping_rounds=10
    )

    test_auc = cv_results['test-auc-mean']
    mean_auc = test_auc.max()
    # argmax() is a 0-based row position and row i holds the score after
    # i + 1 boosting rounds, hence the + 1 when reporting a round count.
    boost_rounds = test_auc.argmax() + 1

    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))

    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

In [33]:
# Write the winning grid-search combination back into the training parameters.
best_max_depth, best_min_child_weight = best_params[0], best_params[1]
params.update(max_depth=best_max_depth, min_child_weight=best_min_child_weight)

In [35]:
# Display the parameter dict that will be used for training.
params

In [40]:
# Train the booster, stopping early once the metric on the "Test" evaluation
# set has not improved for 10 consecutive rounds.
# NOTE(review): dtest is used here for early stopping and is also the set the
# model is scored on afterwards, so those scores may be optimistic — consider
# a separate validation split.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    early_stopping_rounds=10,
    evals=[(dtest, "Test")],
)

In [41]:
# Model scores for the held-out rows (probabilities under 'binary:logistic').
xgb_preds = xgb_model.predict(dtest)

# Threshold at 0.5: scores below it become class 0, everything else class 1.
binary_preds = np.where(xgb_preds < 0.5, 0, 1).tolist()

In [42]:
# Confusion matrix of the thresholded predictions against the true test labels.
xgb_cm = confusion_matrix(y_test, binary_preds)

In [43]:
# Render the confusion matrix as an annotated heatmap.
plot_confusion_matrix(xgb_cm, "XG Boost")

In [44]:
# Per-class precision, recall and F1 for the thresholded predictions.
print(classification_report(y_test, binary_preds))

In [45]:
# plot_roc is given one probability per class, so each positive-class score p
# is expanded into the pair [1 - p, p].
class_probabilities = [[1.0 - positive_score, positive_score] for positive_score in xgb_preds]
plot_roc(y_test, class_probabilities)
plt.show()

END