Análisis de la carrera de Messi en el Barça
==

Ante la duda sobre qué significa alguno de estos ítems, podemos visitar la [explicación de las columnas](https://www.kaggle.com/abhijithchandradas/lionel-messi-at-f-c-barcelona/version/1).

<img src="https://as01.epimg.net/futbol/imagenes/2021/03/21/primera/1616352554_970003_1616352733_noticia_normal_recorte1.jpg" />

Si hay algo raro que nos tocó ver últimamente, es ver a Messi con una camiseta distinta a la del Barça. Como bien lo dijo el ídolo:

_"El Barça es mi vida. Amo al Barça y a Barcelona. Siempre lo he dado todo por este Club"_

En honor a la carrera del ídolo argentino en el club catalán, analicemos su paso por el Barça en números.

## Importar los datos

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the CSV as ISO-8859-1 text with comma-separated fields.
messi_csv = pd.read_csv(
    'messi_barca.csv',
    sep=',',
    encoding="ISO-8859-1",
)
# Leaving the frame as the last expression makes the notebook render it.
messi_csv

In [None]:
# Summarise the DataFrame: columns, non-null counts, dtypes and memory usage.
messi_csv.info()

In [None]:
# List the column labels (for a DataFrame, keys() returns the columns index).
messi_csv.keys()

Importamos algunas librerías

In [None]:
import graphviz
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Datos

In [None]:
# Build the label vector Y (one integer code per row) and the feature matrix X.
# The codes follow the same 0-3 order as the colour/legend dictionaries and the
# class-name list used by later cells.
codigos_competicion = {
    'Champions League': 0,
    'La Liga': 1,
    'Copa del Rey': 2,
    'Supercopa de España': 3,
}
competiciones = messi_csv['Competition']

# Fail loudly on a competition without a code. Skipping such rows silently
# would leave Y shorter than X and break the row-to-label correspondence.
desconocidas = sorted(set(competiciones) - set(codigos_competicion), key=str)
if desconocidas:
    raise ValueError(
        "Competiciones sin codigo asignado: {}".format(desconocidas)
    )

comp = [codigos_competicion[c] for c in competiciones]

Y = np.array(comp)
X = np.array(messi_csv[['Matches Played', 'Goals scored']])

In [None]:
# Quick look at the raw feature space: column 0 of X against column 1.
lienzo, ejes = plt.subplots(figsize=(8, 8))
ejes.scatter(X[:, 0], X[:, 1])
ejes.set_xlabel('Matches Played')
ejes.set_ylabel('Goals scored')
plt.show()

In [None]:
# Shape of the feature matrix as (rows, columns).
X.shape

In [None]:
# Shape of the label vector; its length must equal the number of rows in X.
Y.shape

In [None]:
# Colour and legend text for each competition code stored in Y.
cdict = {0: 'red', 1: 'green', 2: 'blue', 3: 'yellow'}
catdict = {
    0: 'Champions League',
    1: 'La Liga',
    2: 'Copa del Rey',
    3: 'Supercopa de España',
}

figure, ax = plt.subplots(figsize=(8, 8))

# One scatter call per class actually present in Y, so each class gets its
# own legend entry.
labels = Y
for codigo in np.unique(labels):
    mascara = labels == codigo
    ax.scatter(
        X[mascara, 0],
        X[mascara, 1],
        c=cdict[codigo],
        label=catdict[codigo],
        s=45,
        edgecolor='k',
    )

ax.set_xlabel('Matches Played')
ax.set_ylabel('Goals scored')
ax.legend()
plt.show()

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffled split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=12, shuffle=True)

# KNN

In [None]:
# k-nearest-neighbours classifier; the positional argument is n_neighbors (k = 5).
knn = KNeighborsClassifier(5)

In [None]:
# Fit the classifier on the training split only.
knn.fit(x_train, y_train)

In [None]:
# Predicted class codes for the held-out test samples.
y_pred = knn.predict(x_test)

In [None]:
# Plot the KNN decision regions, then overlay the training points (coloured by
# true class) and the test points (coloured by predicted class).

# Evaluate the classifier on a regular mesh [x_min, x_max] x [y_min, y_max]
# that covers the data plus a 0.5 margin on every side.
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
h = 0.1  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

# Use one colour scale (the full range of class codes in Y) for every layer.
# Otherwise each call normalises on its own data, and a class missing from
# y_pred would shift the test-marker colours relative to the training ones.
clase_min, clase_max = Y.min(), Y.max()

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap="cool", shading='auto',
               vmin=clase_min, vmax=clase_max)

# Plot also the training points
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, edgecolors="k",
            cmap="jet", vmin=clase_min, vmax=clase_max,
            label='Training Points')
# Plot also the testing points
plt.scatter(x_test[:, 0], x_test[:, 1], c=y_pred, edgecolors="k",
            cmap="jet", vmin=clase_min, vmax=clase_max,
            marker="^", label='Test Points')
plt.legend(loc="upper left")
plt.xlabel("Matches played")
plt.ylabel("Goals Scored")

plt.show()

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Draw the test-set confusion matrix.
cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
# Predict on the training data itself, for comparison with the test results.
y_pred_train = knn.predict(x_train)

In [None]:
# Confusion matrix on the training split (true vs. predicted classes).
cm_train = confusion_matrix(y_train, y_pred_train)

In [None]:
# Draw the training-set confusion matrix (rebinds cm_display).
cm_display = ConfusionMatrixDisplay(cm_train).plot()

# Árboles de decisión

In [None]:
# Seed NumPy's global RNG; the tree below is created without random_state,
# so it draws from that global generator.
np.random.seed(123)

# Decision tree limited to a maximum depth of 5.
arbol = DecisionTreeClassifier(max_depth = 5)

In [None]:
# Fit the tree on the training split only.
arbol.fit(x_train, y_train)

In [None]:
# Predict the held-out samples with the tree (rebinds y_pred).
y_pred = arbol.predict(x_test)

# Report the mean accuracy on the test split.
exactitud_arbol = arbol.score(x_test, y_test)
print("El accuracy obtenido sin preprocesamiento es: ", exactitud_arbol)

In [None]:
# Plot the decision-tree regions, then overlay the training points (coloured
# by true class) and the test points (coloured by predicted class).

# Evaluate the tree on a regular mesh [x_min, x_max] x [y_min, y_max] that
# covers the data plus a 0.5 margin on every side.
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
h = 0.1  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = arbol.predict(np.c_[xx.ravel(), yy.ravel()])

# Use one colour scale (the full range of class codes in Y) for every layer.
# Otherwise each call normalises on its own data, and a class missing from
# y_pred would shift the test-marker colours relative to the training ones.
clase_min, clase_max = Y.min(), Y.max()

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap="cool", shading='auto',
               vmin=clase_min, vmax=clase_max)

# Plot also the training points
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, edgecolors="k",
            cmap="jet", vmin=clase_min, vmax=clase_max,
            label='Training Points')
# Plot also the testing points
plt.scatter(x_test[:, 0], x_test[:, 1], c=y_pred, edgecolors="k",
            cmap="jet", vmin=clase_min, vmax=clase_max,
            marker="^", label='Test Points')
plt.legend(loc="upper left")
plt.xlabel("Matches played")
plt.ylabel("Goals Scored")

plt.show()

In [None]:
# Human-readable names for the tree export: the two feature columns of X, and
# the class names listed in the order of the integer codes 0-3 used in Y.
features_names = ['Matches Played', 'Goals scored']
y_names = ['Champions League', 'La Liga','Copa del Rey','Supercopa de España']

In [None]:
# Write the fitted tree to a DOT file, read it back and let graphviz render
# it as the cell output.
# NOTE(review): class_names is matched to classes in ascending order, so the
# labels are only right if all four codes occur in y_train — confirm.
ruta_dot = 'arbol1.dot'
export_graphviz(
    arbol,
    out_file=ruta_dot,
    feature_names=features_names,
    class_names=y_names,
    filled=True,
    impurity=False,
)
with open(ruta_dot) as archivo_dot:
    dot_graph = archivo_dot.read()
graphviz.Source(dot_graph)

In [None]:
# Confusion matrix of the tree's predictions on the test split (rebinds cm).
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Draw the tree's test-set confusion matrix.
cm_display = ConfusionMatrixDisplay(cm).plot()

# Escalado & normalizado

In [None]:
# Replace the feature matrix: column 0 is now goals scored and column 1 is
# minutes played. The label vector Y is reused unchanged.
X = np.array(messi_csv[['Goals scored', 'Minutes played']])

In [None]:
# Raw view of the new feature pair: goals (column 0) against minutes (column 1).
lienzo, ejes = plt.subplots(figsize=(8, 8))
ejes.scatter(X[:, 0], X[:, 1])
ejes.set_ylabel('Minutes Played')
ejes.set_xlabel('Goals scored')
plt.show()

In [None]:
# Re-split with the new features, using the same 20% test size and
# random_state as the earlier split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=12, shuffle=True)

In [None]:
# Fresh 5-neighbour classifier for the unscaled (goals, minutes) features.
knn = KNeighborsClassifier(5)

In [None]:
# Fit on the unscaled training split.
knn.fit(x_train, y_train)

In [None]:
# Predict the held-out samples with the unscaled-feature KNN (rebinds y_pred).
y_pred = knn.predict(x_test)

# Report the test accuracy, rounded to one decimal place.
exactitud_sin_escalar = round(knn.score(x_test, y_test), 1)
print("El accuracy obtenido sin escalar es:", exactitud_sin_escalar)

In [None]:
# Plot the KNN decision regions for the unscaled (goals, minutes) features,
# then overlay the training points (coloured by true class) and the test
# points (coloured by predicted class).

# Evaluate the classifier on a regular mesh [x_min, x_max] x [y_min, y_max]
# that covers the data plus a 0.5 margin on every side.
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
h = 1.0  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

# Use one colour scale (the full range of class codes in Y) for every layer.
# Otherwise each call normalises on its own data, and a class missing from
# y_pred would shift the test-marker colours relative to the training ones.
clase_min, clase_max = Y.min(), Y.max()

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap="cool", shading='auto',
               vmin=clase_min, vmax=clase_max)

# Plot also the training points
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, edgecolors="k",
            cmap="jet", vmin=clase_min, vmax=clase_max,
            label='Training Points')
# Plot also the testing points
plt.scatter(x_test[:, 0], x_test[:, 1], c=y_pred, edgecolors="k",
            cmap="jet", vmin=clase_min, vmax=clase_max,
            marker="^", label='Test Points')
plt.legend(loc="upper left")
# Column 0 of X is goals scored and column 1 is minutes played, so goals go
# on the horizontal axis and minutes on the vertical one.
plt.xlabel("Goals Scored")
plt.ylabel("Minutes Played")

plt.show()

In [None]:
from sklearn.preprocessing import scale
from sklearn.preprocessing import minmax_scale

In [None]:
# Show the raw (unscaled) feature matrix.
X

In [None]:
# Standardise each column to zero mean and unit variance; the result is only
# displayed here, not stored.
scale(X)

In [None]:
# Rescale each column to the [0, 1] range; the result is only displayed here,
# not stored.
minmax_scale(X)

# Tarea

Usando:

In [None]:
# X_1: standardised features (zero mean, unit variance per column).
# X_2: min-max scaled features (each column mapped to [0, 1]).
# NOTE(review): both are computed on the full X, before the train/test split.
X_1 = scale(X)
X_2 = minmax_scale(X)


In [None]:
# Run KNN on the two rescaled versions of X: X_1 (scale) and X_2 (minmax_scale).

# Split each rescaled dataset with the same test size and random_state as the
# unscaled experiment.
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(X_1, Y, test_size=0.2, random_state=12, shuffle=True)
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(X_2, Y, test_size=0.2, random_state=12, shuffle=True)

# Train one independent classifier per dataset. Do not reuse the shared `knn`
# object: fit() returns the estimator it was called on, so fitting it twice
# would make knn_1 and knn_2 the same model (trained on the min-max data) and
# the first accuracy would be computed with the wrong model.
knn_1 = KNeighborsClassifier(5).fit(x_train_1, y_train_1)
knn_2 = KNeighborsClassifier(5).fit(x_train_2, y_train_2)

# Predictions on each held-out set.
y_pred_1 = knn_1.predict(x_test_1)
y_pred_2 = knn_2.predict(x_test_2)

# Accuracy of both models on their own test split.
print("El accuracy obtenido con el escalado es: ",knn_1.score(x_test_1,y_test_1))
print("El accuracy obtenido con el escalado min max es: ",knn_2.score(x_test_2,y_test_2))


Correr nuevamente KNN y DecisionTreeClassifier y comparar los resultados con los del dataset sin normalizar o escalar.

In [None]:
# Build the two preprocessed feature matrices used for the decision tree.

from sklearn.preprocessing import MinMaxScaler, Normalizer

# Min-max scaling: every column is mapped to the [0, 1] range.
escala = MinMaxScaler()
X_escalada = escala.fit_transform(X)

# Normalisation: Normalizer rescales each row (sample) to unit norm, not
# each column.
normalize = Normalizer()
X_normalizada = normalize.fit_transform(X)


In [None]:

# Seed NumPy's global RNG; the trees below are created without random_state.
np.random.seed(123)
# Run the decision tree on the min-max-scaled and the normalised versions of X.

# Split both datasets with the same test size and random_state.
x_train_sc, x_test_sc, y_train_sc, y_test_sc = train_test_split(X_escalada, Y, test_size=0.2, random_state=12, shuffle=True)
x_train_n, x_test_n, y_train_n, y_test_n = train_test_split(X_normalizada, Y, test_size=0.2, random_state=12, shuffle=True)

# Train one tree per dataset. fit() returns the estimator itself, so fitting
# a single tree twice would keep only the second (normalised) fit, and the
# min-max accuracy would be computed with the wrong model.
arbol_sc = DecisionTreeClassifier(max_depth=5).fit(x_train_sc, y_train_sc)
arbol_norm = DecisionTreeClassifier(max_depth=5).fit(x_train_n, y_train_n)
# Keep `arbol` bound to the tree fitted last, so the name still resolves to
# the normalised-data model after this cell runs.
arbol = arbol_norm

# Predictions on each held-out set.
y_pred_sc = arbol_sc.predict(x_test_sc)
y_pred_norm = arbol_norm.predict(x_test_n)

# Accuracy of each tree on its own test split.
print("El accuracy obtenido con el escalado min max es: ",arbol_sc.score(x_test_sc,y_test_sc))
print("El accuracy obtenido con el normalizado: ",arbol_norm.score(x_test_n,y_test_n))

