In [None]:
# General usage
import math
import numpy as np
import pandas as pd

# Reporting
from pandas_profiling import ProfileReport

# Preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Modeling
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

## Loading data ----


In [None]:
# Load the raw modeling set. `full_execution` gates the optional profiling report.
df = pd.read_csv('raw/modeling_set.csv')
full_execution = False

target = 'round_winner'
features = [column for column in df.columns if column != target]

# Stratified 70/30 split; X_val / y_val hold the 30% validation part.
X, X_val, y, y_val = train_test_split(
    df[features],
    df[target],
    test_size=0.3,
    random_state=1,
    stratify=df[target])

print(X.shape)
print(X_val.shape)
print(y.shape)
print(y_val.shape)

# To have all the columns in the same DataFrame.
# Work on a copy: binding `df` directly to `X` would make both names point to the
# same object, so adding the target would also add it to `X` (and can trigger
# SettingWithCopyWarning because `X` is a slice of the loaded frame).
df = X.copy()
df[target] = y


In [None]:
# Target name and the list of every other column of the training frame.
target = 'round_winner'
features = [name for name in df.columns if name != target]


In [None]:
# Show up to 100 rows; the transposed summary has one row per numeric column.
pd.options.display.max_rows = 100
df.describe().transpose()


In [None]:
# Optional HTML profile of the training frame; skipped unless `full_execution` is set.
if full_execution:
    profile = ProfileReport(df, minimal=True, title="CS:GO >> Before")
    profile.to_file("storage/df_report_before.html")

### Remove and filtering ----


In [None]:

# Every step rebinds `df` to a new frame instead of mutating in place, so the
# cell can be re-run and no other name pointing at the old frame is affected.

# Remove fully NA columns
print(f'Shape before {df.shape}')
df = df.dropna(axis='columns', how='all')
print(f'Shape after {df.shape}')

# Remove constant value columns
print(f'Shape before {df.shape}')
col_unique = df.columns[df.nunique() == 1]
df = df.drop(columns=col_unique)
print(f'Shape after {df.shape}')

# Cardinality report: distinct values per column as a percentage of the rows.
# Nothing is removed here; the table is printed for inspection only (an
# expression in the middle of a cell is otherwise silently discarded).
cardinality_list = df.apply(pd.Series.nunique) / df.shape[0] * 100
print(cardinality_list.round(1).sort_values(ascending=False).head(10))

# Remove columns with high ratio of missing values:
# keep a column only if at least 90% of its values are present.
print(f'Shape before {df.shape}')
na_threshold = math.ceil(len(df) * .90)  # `thresh` is documented as an integer count
df = df.dropna(thresh=na_threshold, axis=1)
print(f'Shape after {df.shape}')

# Remove duplicate rows
print(f'Number of duplicates {df.duplicated().sum()} rows to be removed')
df = df.drop_duplicates()

#### Correlation ----


In [None]:
from scipy.cluster import hierarchy
from sklearn.preprocessing import StandardScaler

def correlation_plot(df):
    """Show a heatmap of the covariance matrix of the standardized non-object columns.

    The columns are cast to float, standardized with StandardScaler, and the
    resulting covariance matrix is reordered by the leaf order of a Ward
    dendrogram before being drawn with `plt.imshow`.

    Args:
        df (pd.DataFrame): Source DataFrame; object columns are ignored.
    """
    # Cast every remaining column (boolean flags included) to float on a new frame.
    # This avoids writing into the `select_dtypes` result and does not depend on
    # any particular column still being present.
    numeric = df.select_dtypes(exclude=['object']).astype(float)
    scaled = StandardScaler().fit_transform(numeric)
    cov = np.cov(scaled, rowvar=False)

    # 'ivl' lists the leaf labels (as strings) in dendrogram order.
    linkage = hierarchy.ward(cov)
    order = np.array(hierarchy.dendrogram(linkage, no_plot=True)['ivl'], dtype="int")

    plt.imshow(cov[order, :][:, order])

# https://stackoverflow.com/a/63536382/3780957
def correlation_filer(x: pd.DataFrame, bound: float):
    """List the pairs of columns whose absolute correlation is at least `bound`.

    Only numeric and boolean columns are correlated. Each pair is reported once.

    Args:
        x (pd.DataFrame): Source DataFrame.
        bound (float): Threshold on the absolute value of the correlation.

    Returns:
        pd.Series: Correlation values sorted ascending, indexed by a two-level
        MultiIndex. Level 0 is the column that appears first in `x`, level 1 the
        one that appears later. The Series contains no NaN entries and is empty
        when no pair reaches the threshold.
    """
    # Text columns cannot be correlated (and make `corr` raise in recent pandas).
    xCorr = x.select_dtypes(include=['number', 'bool']).corr()

    # Keep only the cells strictly below the diagonal: this removes the
    # self-correlations and the mirrored duplicate of every pair by position,
    # rather than by comparing values (which would also discard distinct pairs
    # that happen to share a value, or perfectly correlated distinct columns).
    below_diagonal = np.tril(np.ones(xCorr.shape, dtype=bool), k=-1)
    strong = xCorr.abs() >= bound

    xFiltered = xCorr.where(strong & below_diagonal)
    # dropna removes the masked cells so callers only ever see real pairs.
    xFlattened = xFiltered.unstack().dropna().sort_values()
    return xFlattened

correlation_plot(df)
# Only non-object columns can be correlated; text columns are left out explicitly.
cor_ = correlation_filer(df.select_dtypes(exclude=['object']), .8)

# https://stackoverflow.com/a/25733562/3780957

print(f'Shape before {df.shape}')
# Level 1 of each reported pair is the column that gets removed. Entries with a
# NaN correlation are not real pairs and must not cause a column to be dropped.
correlated_columns = cor_.dropna().index.get_level_values(1).unique()
df = df.drop(columns=correlated_columns)
print(f'Shape after {df.shape}')



#### Column names ----


In [None]:
column_target = "round_winner"
column_target_encoded = "round_winner_encoded"
column_map_encoded = 'map_encoded'

# Feature list in the DataFrame's own column order (a set would reorder the
# columns differently between sessions). The raw and encoded target are both
# excluded, and the raw 'map' column is replaced by its encoded counterpart, so
# re-running this cell after the encoding step gives the same list.
columns_excluded = {column_target, column_target_encoded, 'map', column_map_encoded}
column_features = [column for column in df.columns if column not in columns_excluded]
column_features.append(column_map_encoded)

columns_float = ['bomb_planted', 'ct_health', 't_health', 'ct_armor', 't_armor', 'ct_money', 't_money']

## Feature selector ----
# C:\Users\R100983\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 3\Notebooks on feature engineering\feature importance.ipynb

#### TargetEncoder ----


In [None]:
# OneDrive/GMBD/MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)/Session 4 - Feature Engineering/FE BlindCredit example (original 2).ipynb

from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# One LabelEncoder per column: refitting a single shared instance on the target
# would discard the classes learned for 'map', so the map encoding could no
# longer be reproduced or inverted.
le_map = LabelEncoder()
df[column_map_encoded] = le_map.fit_transform(df['map'])

# `le` remains the encoder fitted on the target.
le = LabelEncoder()
df[column_target_encoded] = le.fit_transform(df[column_target])

# Replace the integer map code with a target-based encoding.
encoder = ce.target_encoder.TargetEncoder(cols=[column_map_encoded])
X_transformed = encoder.fit_transform(df[column_features], df[column_target_encoded])

### TargetRobustScalerEncoder ----


In [None]:
# C:\Users\juanb\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 9 - Forum - Dimensionality Reduction\Notebook on PCA\PCA solved_v2.ipynb
# Set a variable (features) with the names of all the features BUT the target variable.
from sklearn.preprocessing import RobustScaler

cols_ = X_transformed.columns
scaler = RobustScaler().fit(X_transformed)
# The frame is rebuilt from the scaler output without an index argument, so it
# gets a fresh 0..n-1 index.
X_transformed = pd.DataFrame(scaler.transform(X_transformed), columns=cols_)
# Give the target the same 0..n-1 index so features and target stay aligned by
# label as well as by position.
y_transformed = df[column_target_encoded].reset_index(drop=True)

#### SymbolicTransformer ----

# C:\Users\juanb\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 4 - Feature Engineering\FE BlindCredit example (original 2).ipynb


In [None]:
# C:\Users\juanb\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 4 - Feature Engineering\FE BlindCredit example (original 2).ipynb

from gplearn.genetic import SymbolicTransformer

def symbolic_transformer(X_transformed, y_transformed):
    """Fit a gplearn SymbolicTransformer and return the features it generates.

    Args:
        X_transformed: Feature matrix used both to fit and to transform.
        y_transformed: Target values used to fit.

    Returns:
        The output of `transform` applied to `X_transformed` (12 components are requested).
    """
    operators = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                 'abs', 'neg', 'inv', 'max', 'min']
    settings = dict(
        generations=10,
        population_size=1000,
        hall_of_fame=100,
        n_components=12,
        function_set=operators,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=123,
        n_jobs=-1,
    )

    transformer = SymbolicTransformer(**settings)
    transformer.fit(X_transformed, y_transformed)
    return transformer.transform(X_transformed)

# Generate the symbolic features from the scaled features and the encoded target.
gp_features = symbolic_transformer(X_transformed, y_transformed)

# The second dimension of the result is the number of generated features.
print(f'{gp_features.shape[1]} new features generated')


In [None]:
# C:\Users\juanb\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 4 - Feature Engineering\FE BlindCredit example (original 2).ipynb
# https://stackoverflow.com/a/32801939/3780957

# Wrap the generated features in a DataFrame whose columns are named
# 'gp0', 'gp1', ... in order.
new_dataframe = pd.DataFrame(gp_features)
gp_column_count = new_dataframe.shape[1]
new_dataframe.columns = ['gp{}'.format(position) for position in range(gp_column_count)]

# Append the generated columns to the existing features (column-wise).
X_transformed = pd.concat([X_transformed, new_dataframe], axis=1, sort=False)

### ReliefF ----
# Feature importance


In [None]:
# C:\Users\R100983\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 3\Notebooks on feature engineering\feature importance.ipynb

from typing import List
from skrebate import ReliefF
import random

def plot_importance(features: List[str], importances: List[float]):
    """Draw a horizontal bar chart of the feature importances, sorted ascending.

    Args:
        features: Feature names, one per importance (list, array or pandas Index).
        importances: Importance of each feature, in the same order as `features`.
    """
    # The sorting below indexes both inputs with an index array, which plain
    # lists (as the annotations allow) do not support.
    features = np.asarray(features)
    importances = np.asarray(importances)

    num_features = len(features)
    indices = np.argsort(importances)

    plt.figure(figsize=(8, 10))
    plt.title("Feature importances")
    # The error bar is the same on every bar: the spread of all importances.
    plt.barh(range(num_features), importances[indices],
             color="r",
             xerr=np.std(importances),
             align="center")
    plt.yticks(range(num_features), features[indices])
    plt.ylim([-1, num_features])
    plt.show()

def importance_relieff(X, y, n_features_to_select=30, n_neighbors=20, sample_rows=1000, plot = True, random_state=None):
    """Select the most important columns of `X` with ReliefF fitted on a row sample.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target, in the same row order as `X`.
        n_features_to_select (int): Number of features ReliefF is asked to keep.
        n_neighbors (int): Number of neighbors used by ReliefF.
        sample_rows (int): Number of rows sampled to fit ReliefF; capped at `len(X)`.
        plot (bool): When True, plot the absolute importance of every feature.
        random_state: Seed for the row sample. With None, the module-level
            `random` generator is used.

    Returns:
        pd.DataFrame: All rows of `X`, restricted to the selected columns.
    """
    # Sample row positions of `X` itself rather than index labels, which `.iloc`
    # would misread on a non-default index. Never ask for more rows than exist.
    n_rows = min(sample_rows, len(X))
    sampler = random if random_state is None else random.Random(random_state)
    sample = sampler.sample(range(len(X)), n_rows)
    sample_features = X.iloc[sample, :].to_numpy()
    sample_labels = y.iloc[sample].to_numpy()

    fs = ReliefF(n_features_to_select=n_features_to_select, n_neighbors=n_neighbors)
    fs.fit(sample_features, sample_labels)
    my_important_features = fs.transform(sample_features)

    print("No. of tuples, No. of Columns before ReliefF : "+str(sample_features.shape)+
        "\nNo. of tuples , No. of Columns after ReliefF : "+str(my_important_features.shape))

    # Plot the importances, taken from the `fs` variable.
    if plot:
        plot_importance(X.columns, abs(fs.feature_importances_))

    # Names of the selected columns: as many top-ranked features as `transform` kept.
    n_selected = my_important_features.shape[1]
    my_important_features_names = [X.columns[i] for i in fs.top_features_[:n_selected]]

    # Create a DataFrame with every row of X and only the selected columns.
    X_important = pd.DataFrame(X, columns=my_important_features_names)

    return X_important

X_important = importance_relieff(
    X=X_transformed,
    y=df[column_target_encoded],
    n_features_to_select=30,
    n_neighbors=20,
    sample_rows=1000,
)

# TODO: permutation_importance

### Outliers ----


In [None]:
from sklearn.ensemble import IsolationForest

X, y = X_important, y_transformed
print(f'Shape before {X_important.shape}')

# Fixed seed so the same rows are flagged on every run.
iso = IsolationForest(contamination=0.1, random_state=1)
y_pred = iso.fit_predict(X)

# Rows predicted as -1 are treated as outliers; keep the others, selecting by position.
mask = y_pred != -1
X, y = X.loc[mask, :], y.loc[mask]

print(f'Shape after {X.shape}')

### PCA ----
# TODO: Use PCA
# C:\Users\juanb\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 9 - Forum - Dimensionality Reduction\Notebook on PCA\PCA_v2.ipynb


In [None]:
from sklearn.decomposition import PCA
from matplotlib.cm import get_cmap

# Palette used by pca_plot_scatter: one color per class, taken in order.
name = "Accent"
# NOTE(review): `matplotlib.cm.get_cmap` appears to be deprecated/removed in recent
# matplotlib releases — confirm against the installed version (plt.get_cmap is an alternative).
cmap = get_cmap(name)  # type: matplotlib.colors.ListedColormap
colors = cmap.colors  # type: list

def pca_transform(data, target, n=2):
    """Project the feature columns of `data` onto their first `n` principal components.

    Args:
        data (pd.DataFrame): Features plus the `target` column.
        target (str): Column left out of the PCA and carried over unchanged.
        n (int): Number of principal components to keep.

    Returns:
        tuple: A DataFrame with columns PC1..PCn followed by `target`, on a fresh
        0..len-1 index, and the explained variance ratio of each component.
    """
    pca = PCA(n_components=n)
    principal_components = pca.fit_transform(data.drop(columns=[target]))

    data_pca = pd.DataFrame(
        principal_components,
        columns=[f"PC{position}" for position in range(1, n + 1)],
    )
    # Attach the target by position. This works whatever the index of `data`
    # looks like, including a named index.
    data_pca[target] = data[target].to_numpy()

    return data_pca, pca.explained_variance_ratio_

# https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
def pca_plot_scatter(data, target, axis1=1, axis2=2):
    """Scatter two principal components against each other, one color per class.

    Args:
        data (pd.DataFrame): Frame with `PC<k>` columns and the `target` column.
        target (str): Column holding the class of each row.
        axis1 (int): Component number shown on the x axis.
        axis2 (int): Component number shown on the y axis.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title("Component PCA", fontsize=20)
    ax.set_xlabel(f"Principal Component {axis1}", fontsize=15)
    ax.set_ylabel(f"Principal Component {axis2}", fontsize=15)

    targets = data[target].unique()
    # zip stops at the shorter input: classes beyond the palette size are not drawn.
    for class_value, class_color in zip(targets, colors):
        rows = data[target] == class_value
        ax.scatter(
            data.loc[rows, f"PC{axis1}"],
            data.loc[rows, f"PC{axis2}"],
            color=class_color,
            s=50,
        )

    ax.legend(targets)
    ax.grid()

def pca_plot_density(data, target):
    """Plot, for every feature column, one density curve per class of `target`.

    Panels are laid out three per row, in the column order of `data`. The grid
    has at least three rows and grows when there are more than nine features.

    Args:
        data (pd.DataFrame): Feature columns plus the `target` column.
        target (str): Column holding the class of each row.
    """
    categories = data[target].unique()
    category_series = data[target]

    # Column order of `data`; a set would shuffle the panels between runs.
    feature_names = [column for column in data.columns if column != target]

    # Size the grid from the number of features so none falls outside it.
    n_cols = 3
    n_rows = max(3, math.ceil(len(feature_names) / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.set_size_inches(14, 10 * n_rows / 3)

    for subplot, feature in enumerate(feature_names):
        row, col = divmod(subplot, n_cols)
        for value in categories:
            # NOTE(review): seaborn marks `distplot` as deprecated — confirm the
            # installed version before migrating to `kdeplot`.
            sns.distplot(
                data[feature][category_series == value],
                hist=False,
                kde=True,
                kde_kws={"shade": True},
                label=str(value),
                ax=axs[row, col],
            )
        axs[row, col].set_title(feature)

    plt.tight_layout()
    plt.show()


In [None]:
# Re-index features and target from 0 so they line up row by row when joined.
features_reset = X.reset_index().drop(columns="index")
target_reset = y.reset_index().drop(columns="index")
data = pd.concat([features_reset, target_reset], axis=1)

# Seven components; print the share of variance explained by each, then plot the first two.
df_pca, df_explained_variance = pca_transform(data=data, target=column_target_encoded, n=7)
print(df_explained_variance.round(2))
pca_plot_scatter(data=df_pca, target=column_target_encoded, axis1=1, axis2=2)

# TODO: Consider adding QDA?
# TODO: Tree Pruning
# TODO: XGboost


## Model ----

In [None]:
# C:\Users\juanb\OneDrive\GMBD\MACHINE LEARNING II (MBD-EN-BL2020J-1_32R202_380379)\Session 5 - EvaluationMetrics\Evaluation and Validation.ipynb

from sklearn.model_selection import cross_val_score, StratifiedKFold

def plot_scores(scores, labels):
    """
    Receives scores (one or several arrays) and plots a scatter to the left with
    the values of the first one, and a boxplot with all of them to the right.

    Arguments
        scores: single list of scores, or list of lists of scores.
        labels: single label or list of labels identifying the scores passed
    """
    # Bring the "single" forms to the list-of-lists / list-of-labels shape the
    # plotting code below relies on.
    if isinstance(labels, str):
        labels = [labels]
    if len(scores) > 0 and np.ndim(scores[0]) == 0:
        scores = [scores]

    plt.figure(figsize=(10, 5))

    # Left panel: the individual scores of the first series.
    plt.subplot(1, 2, 1)
    plt.title('Scores from {}.'.format(labels[0]))
    plt.scatter(range(len(scores[0])), scores[0])

    # Right panel: one box per series, with a dashed line at each median.
    plt.subplot(1, 2, 2)
    plt.title('{} scores stdev={:.4f}'.format(labels[0], np.std(scores[0])))
    for series in scores:
        plt.axhline(np.median(series), color='orange',
                    linestyle='--', linewidth=0.5)
    plt.boxplot(scores)
    # Label the boxes through the ticks (boxes sit at positions 1..N by default)
    # rather than the boxplot `labels` keyword, which newer matplotlib renames.
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylim(bottom=0.6, top=1.0)

    plt.show()


In [None]:
### LogisticRegression ----

# Logistic regression with default settings, scored by F1 over 20 folds.
my_model = LogisticRegression()
cv_scores = cross_val_score(estimator=my_model, X=X, y=y, cv=20, scoring='f1')
lr_median, lr_spread = np.median(cv_scores), np.std(cv_scores)
print("F1: %0.4f (+/- %0.2f)" % (lr_median, lr_spread))
plot_scores(scores=[cv_scores], labels=['LR'])


In [None]:
### GradientBoostingClassifier ----

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Gradient boosting, scored by F1 over 10 folds.
my_model = GradientBoostingClassifier(random_state=0)
gbc_scores = cross_val_score(my_model, X, y, scoring='f1', cv=10)
# Report the statistics of this model's scores.
print("F1: %0.4f (+/- %0.2f)" % (np.median(gbc_scores), np.std(gbc_scores)))
# Box plots of both models side by side.
plot_scores([gbc_scores, cv_scores], ['GBC', 'LR'])


In [None]:
#### Hyperparameter tuning ----

# C:\Users\juanb\OneDrive\GMBD\STATISTICAL PROGRAMMING - PYTHON (MBD-EN-BL2020J-1_32R203_380389)\Session13_VC_Sklearn\SckitLearn-Students\05 - ScikitLearn.ipynb
# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier

# TODO: Look up hyperparameters https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

from sklearn.model_selection import GridSearchCV

# Coarse grid: stepping through the ranges keeps the search at
# 6 depths x 5 split sizes x 3 folds = 90 fits. Without a step the same ranges
# expand to 11 x 801 candidates, i.e. more than 26,000 fits.
grid = {
    'max_depth': range(5, 16, 2),
    'min_samples_split': range(200, 1001, 200),
    "random_state": [42]
}

# Select on F1, the metric used to evaluate the models in the cells above.
gbc_grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid=grid, scoring='f1', cv=3)
gbc_grid_search.fit(X, y)
optimal_model = gbc_grid_search.best_estimator_

print("Fine Tuned Model: {0}".format(optimal_model))

In [None]:


# TODO: PCA --> LDA/QDA, then fit the model and see how it performs. If it is not good enough, change the model