# Pandas

### Dataframe

In [None]:
# Importamos pandas
import pandas as pd

In [None]:
# Sample records as a column-oriented dict: name, age and salary of four people.
data = {
    'Nombre': ['Jose', 'Belen', 'Pedro', 'Marcos'],
    'Edad': [28, 29, 31, 27],
    'Salario': [2000, 2500, 2100, 2200],
}

In [None]:
# Build a DataFrame from the dictionary: each key becomes a column
obj = pd.DataFrame(data)

print(obj)

In [None]:
# Print the 'Edad' column: its values together with the row index
print(obj['Edad'])

In [None]:
# Print the column labels
print(obj.columns)

In [None]:
# Print the values held by the DataFrame (without index or column labels)
print(obj.values)

In [None]:
# Drop the row labelled 1; the result is printed, obj is not reassigned
print(obj.drop(1))

In [None]:
# Drop the 'Edad' column (axis=1 selects columns); obj is not reassigned
print(obj.drop('Edad', axis=1))

In [None]:
# Give the rows explicit labels instead of the default integer index.
# NOTE: this rebinds `data` from the dict defined earlier to a DataFrame.
data = pd.DataFrame(data, index =['indice1','indice2','indice3','indice4'])
print(data)

In [None]:
# Select several columns by label, in the order requested
data[['Edad','Nombre']]

In [None]:
# Label-based selection: all rows, columns 'Edad' and 'Salario'
data.loc[:,['Edad','Salario']]

In [None]:
# Label-based selection: all rows, columns 'Edad' and 'Salario'.
# The original used the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0; `.loc` is its label-based replacement.
data.loc[:, ['Edad', 'Salario']]

In [None]:
# Position-based selection: all rows, columns at positions 0 and 1
data.iloc[:,[0,1]]

In [None]:
# Label-based row slice (with .loc both end labels are included), all columns
data.loc['indice1':'indice3',:]

In [None]:
# Position-based row slice: positions 0 and 1 (the stop position is excluded), all columns
data.iloc[0:2,:]

In [None]:
# Label-based slice of rows 'indice2'..'indice3' restricted to two columns.
# The original start label 'indice12' does not exist in the index; the slice only
# worked because the index happens to be sorted (and would raise KeyError on an
# unsorted index). 'indice2' selects the same rows explicitly.
data.loc['indice2':'indice3', ['Nombre', 'Edad']]

In [None]:
# Boolean row filter (Edad > 28) combined with a column selection
data.loc[data.Edad > 28,['Nombre', 'Edad']]

In [None]:
# Print summary statistics for the DataFrame columns
print(data.describe())

In [None]:
# Mean of the 'Salario' column
data.loc[:,['Salario']].mean()

In [None]:
# Minimum of the 'Edad' column
data.loc[:,['Edad']].min()

In [None]:
# DataFrames sometimes contain unwanted missing values (NaN)

data = pd.DataFrame([
    [2.3, 3.3, float('NaN')],
    [7.5, float('NaN'), 9.8],
    [float('NaN'), 2.2, 6.8],
    [5.6, 9.2, 7.4],
    [float('NaN')] * 3,
])

In [None]:
# Drop every row that contains at least one NaN
print(data.dropna())

In [None]:
# Drop only the rows in which every value is NaN
print(data.dropna(how='all'))

In [None]:
# Replace every NaN with 0
print(data.fillna(0))

In [None]:
# Replace each NaN with the mean of its own column.
# data.mean() always yields one mean per column (NaNs are skipped). The original
# np.mean(data) reduces over both axes in recent pandas releases and would fill
# every gap with a single global mean instead.
import numpy as np
print(data.fillna(data.mean()))

In [None]:
# Replace NaNs with a fixed value per column: the dict maps column label -> fill value
print(data.fillna({0:2.5,1:3.0,2:5.5}))

In [None]:
# Save the data to a CSV file

data ={'Nombre' :['Jose', 'Belen', 'Pedro', 'Marcos'],'Edad' : [28,29,31,27], 'Salario' : [2000,2500,2100,2200]}

# index=False: the default integer index carries no information, and writing it
# makes the file come back from read_csv with a spurious 'Unnamed: 0' column.
pd.DataFrame(data).to_csv('data.csv', index=False)

In [None]:
# Read the CSV file back into a DataFrame and print it
data_read = pd.read_csv('data.csv')
print(data_read)

# Regresion lineal

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
# Goal: predict the son's height [cm] from the father's height [cm]
df = pd.DataFrame({
    'Altura_Padre': [175, 186, 169, 182, 177],
    'Altura_Hijo': [172, 184, 175, 190, 186],
})

In [None]:
# Features need a second axis: np.newaxis turns the 1-D column into an (n, 1) array
x_train = df['Altura_Padre'].values[:,np.newaxis]
# Target stays 1-D
y_train = df['Altura_Hijo'].values

In [None]:
# Create the linear-regression estimator and fit it to the training data
lm = LinearRegression()
lm.fit(x_train, y_train)

In [None]:
# New father heights [cm] to predict for, one sample per inner list
x_new = [[175],[185],[153],[196],[188],[175],[172],[180]]

In [None]:
# Predict the son's height for each new father height
predicciones = lm.predict(x_new)
print(predicciones)

In [None]:
# Training points as a scatter plot, predictions for x_new as a red line
plt.scatter(x_train, y_train)
plt.plot(x_new, predicciones, color='r', linewidth=3)
plt.xlabel('Altura del padre [cm]')
plt.ylabel('Altura del hijo[cm]')
plt.show()

# Clustering

In [None]:
# Dataset : erupciones(tiempo que duro la erupcion de un volcan), waiting (tiempo entre erupciones)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Eruption duration and waiting time between eruptions, built directly as a DataFrame
df = pd.DataFrame({
    'erupciones': [3.6, 1.8, 3.333, 2.283, 4.533],
    'waiting': [79, 54, 74, 62, 85],
})

In [None]:
# Number of clusters to group the data into
k=2

# random_state fixes the centroid initialisation so labels and centroids are the
# same on every run; n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)

kmeans = kmeans.fit(df)

# Cluster index assigned to each training row
labels = kmeans.labels_

# Coordinates of the k cluster centres
centroides = kmeans.cluster_centers_

In [None]:
# Unseen observations to assign to the fitted clusters.
# They are wrapped in a DataFrame with the training column names because the model
# was fitted on a DataFrame; passing a bare list makes scikit-learn warn about
# missing feature names.
df_test = pd.DataFrame(
    [[4.671,67], [2.885,61], [1.666,90],[5.623,54], [2.678,80], [1.875,60]],
    columns=df.columns,
)

prediccion = kmeans.predict(df_test)

print(prediccion)

In [None]:
# Plot the clusters: one colour per cluster label, centroids as black crosses

colors = ['b','r', 'g', 'k']

# One vectorised scatter call coloured by label, instead of a manual counter loop
# that also overwrote the notebook-level names x and y.
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=[colors[label] for label in labels])

# All k centroids in one call; ms / mew set marker size and marker edge width
plt.plot(centroides[:k, 0], centroides[:k, 1], 'kx', ms=15.0, mew=2.0)

title = ('Numero de clusters (k) = {}'.format(k))
plt.title(title)
plt.xlabel('erupciones(min)')
plt.ylabel('waiting(min)')
plt.show()

# Otros Ejemplos

In [None]:
# Introduction to Pandas

import pandas as pd
import numpy as np

#### Pandas Data Structures

#### Series

**Series** es un vector de datos (como un NumPy array) con un indice que etiqueta cada elemento del vector.

In [None]:
# Create a Series from a list; no index is given here
counts = pd.Series([632, 1638, 569, 115])
counts

# Creamos la serie cuentas

In [None]:
# .values exposes the data held by the Series, without the index
counts.values

In [None]:
# .index is an attribute that returns the index labels (it does not assign them);
# no index was specified for `counts`, so one was generated automatically
counts.index

In [None]:
# Index labels do not have to be numbers; strings work as well

bacteria = pd.Series([632, 1638, 569, 115], 
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'])

bacteria

In [None]:
# A label gives access to its element, just like a positional index would
bacteria['Actinobacteria']

In [None]:
# Keep the entries whose label ends with the string 'bacteria' (boolean list used as a mask)
bacteria[[name.endswith('bacteria') for name in bacteria.index]]

In [None]:
# Name the Series itself
bacteria.name = 'counts'
# Name the index (what the labels represent)
bacteria.index.name = 'phylum'
bacteria

In [None]:
# NumPy functions can be applied to the Series without losing its labels or structure
np.log(bacteria)

In [None]:
# Filter by value: the comparison builds a boolean mask that selects entries greater than 1000
bacteria[bacteria>1000]

### DataFrame

Series multivariantes, similar a tablas o a una hoja de calculo

In [None]:
# Two patients, four phyla each: one row per (patient, phylum) measurement
data = pd.DataFrame({
    'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
    'patient': [1] * 4 + [2] * 4,
    'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'] * 2,
})
data

In [None]:
# Columns can be reordered by selecting them in the desired order
data[['phylum','value','patient']]

In [None]:
# Column headers
data.columns

In [None]:
# Selecting one column by label shows its index and values
data['patient']

In [None]:
# The same column accessed as an attribute
data.patient

In [None]:
# Selecting with a list of labels (double brackets) returns a one-column DataFrame
data[['value']]

In [None]:
# Inspect the type of a single column accessed as an attribute
type(data.value)

# Ejercicio

Prueba los siguientes comandos:

- `data.head()`
- `data.tail(3)`
- `data.shape`

Otra manera de crear un `DataFrame` es a partir de una lista de diccionarios (un diccionario por fila):

In [None]:
# Build the DataFrame from a list of row dictionaries, one dict per record
data = pd.DataFrame([
    {'patient': patient, 'phylum': phylum, 'value': value}
    for patient, phylum, value in [
        (1, 'Firmicutes', 632),
        (1, 'Proteobacteria', 1638),
        (1, 'Actinobacteria', 569),
        (1, 'Bacteroidetes', 115),
        (2, 'Firmicutes', 433),
        (2, 'Proteobacteria', 1130),
        (2, 'Actinobacteria', 754),
        (2, 'Bacteroidetes', 555),
    ]
])

data

In [None]:
# Add a new 'year' column; the single value is assigned to every row.
# NOTE: this modifies `data` in place.
data['year'] = 2013
data

In [None]:
# Data can also be loaded from a CSV file with pd.read_csv

# ejemplo = pd.read_csv("datos.csv")
# Keep in mind that the first row is taken as the header (column names)

# scikit-learn

In [None]:
import numpy as np
import scipy.stats as st
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
%matplotlib inline

def f(x):
    """Generative model y = exp(3 * x), applied element-wise via NumPy."""
    exponent = 3 * x
    return np.exp(exponent)

# Dense grid on which the noise-free model is evaluated (dashed reference curve)
x_tr = np.linspace(0., 2, 200)
y_tr = f(x_tr)

# Noisy sample points to fit. The noise comes from a seeded generator so that the
# points, and every fit built on them in later cells, are identical on each run.
x = np.array([0, .1, .2, .5, .8, .9, 1])
y = f(x) + 2 * np.random.RandomState(0).randn(len(x))

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.plot(x_tr, y_tr, '--k')
ax.plot(x, y, 'ok', ms=10)
ax.set_xlim(0, 1.5)
ax.set_ylim(-10, 80)
ax.set_title('Generative model');

In [None]:
# Create the linear-regression model.
lr = lm.LinearRegression()
# Fit the model. The features must be a 2-D column array: x[:, np.newaxis]
# turns the 7 sample points into shape (7, 1).
lr.fit(x[:, np.newaxis], y)
print(x)
print(x[:, np.newaxis])
print(y)
# Predict on the dense grid with the fitted model.
y_lr = lr.predict(x_tr[:, np.newaxis])

In [None]:
# Reference curve (dashed), linear fit (green) and the sample points
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.plot(x_tr, y_tr, '--k')
ax.plot(x_tr, y_lr, 'g')
ax.plot(x, y, 'ok', ms=10)
ax.set_xlim(0, 1.5)
ax.set_ylim(-10, 80)
ax.set_title("Linear regression")

In [None]:
# Polynomial fits of degree 2 and 5, obtained by linear regression on Vandermonde features
lrp = lm.LinearRegression()
fig, ax = plt.subplots(1, 1, figsize=(6, 3))
ax.plot(x_tr, y_tr, '--k')

for deg, s in zip([2, 5], ['-', '.']):
    # np.vander(v, deg + 1) builds a matrix with deg + 1 columns holding powers of v
    lrp.fit(np.vander(x, deg + 1), y)
    y_lrp = lrp.predict(np.vander(x_tr, deg + 1))
    ax.plot(x_tr, y_lrp, s,label='degree %i' % deg)

ax.plot(x, y, 'ok', ms=10)
# Legend and axis limits do not depend on the loop variable, so set them once
ax.legend(loc=2)
ax.set_xlim(0, 1.5)
ax.set_ylim(-10, 80)
ax.set_title("Linear regression");

In [None]:
# Same polynomial features as the previous cell, fitted with cross-validated ridge regression
ridge = lm.RidgeCV()

fig, ax = plt.subplots(1, 1, figsize=(6, 3))
ax.plot(x_tr, y_tr, '--k')

for deg, s in zip([2, 5], ['-', '.']):
    ridge.fit(np.vander(x, deg + 1), y)
    y_ridge = ridge.predict(np.vander(x_tr, deg + 1))
    ax.plot(x_tr, y_ridge, s,
            label='degree ' + str(deg))

ax.plot(x, y, 'ok', ms=10)
# Legend and axis limits do not depend on the loop variable, so set them once
ax.legend(loc=2)
ax.set_xlim(0, 1.5)
ax.set_ylim(-10, 80)
ax.set_title("Ridge regression");

## Ejemplo

In [None]:
# Read the advertising data into a DataFrame, using the first CSV column as the index.
# NOTE(review): this is a plain-http URL on an external host; confirm it still
# resolves before relying on this cell.
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
data.head()

In [None]:
# Shape of the DataFrame: (rows, columns)
data.shape

In [None]:
# Visualise the data: sales against each advertising channel, three panels sharing the y axis
fig, axs = plt.subplots(1, 3, sharey=True)
data.plot(kind='scatter', x='TV', y='sales', ax=axs[0], figsize=(16, 8))
data.plot(kind='scatter', x='radio', y='sales', ax=axs[1])
data.plot(kind='scatter', x='newspaper', y='sales', ax=axs[2])

In [None]:
# Load the statsmodels formula API
import statsmodels.formula.api as smf

# Fit the linear model sales ~ TV.
# NOTE: this rebinds `lm`, which an earlier cell uses as the alias for
# sklearn.linear_model; re-running those earlier cells afterwards will fail.
lm = smf.ols(formula='sales ~ TV', data=data).fit()

# Fitted coefficients
lm.params

In [None]:
# Confidence intervals for the fitted coefficients.
# Printed explicitly: a notebook only displays a cell's last expression, so the
# original bare call produced no visible output.
print(lm.conf_int())

# R-squared of the fit (coefficient of determination)
lm.rsquared

## Ejemplo

In [None]:
import seaborn as sns # seaborn is used here to load an example dataset
iris = sns.load_dataset('iris') 
iris.head()
# This example uses the iris data collected by Ronald Fisher in 1936
# Vertical axis = samples; horizontal axis = categories/labels
# X = [n_samples, n_features]

In [None]:
%matplotlib inline
import seaborn as sns; sns.set()
sns.pairplot(iris, hue='species', size=1.5);

In [None]:
# Feature matrix: every column except the target label
X_iris = iris.drop('species', axis=1)
# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(X_iris.shape)

# Target vector: the species label of each sample
y_iris = iris['species']
print(y_iris.shape)
print(y_iris)

# Scikit learn

In [None]:
#1) Arrange data into a features matrix and target vector following the discussion above.
#2) Fit the model to your data by calling the fit() method of the model instance.
#3) Apply the Model to new data:
#4) For supervised learning, often we predict labels for unknown data using the predict() method.

## Regresion lineal

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Synthetic data: 50 points following y = 2x - 1 plus random noise,
# drawn from a seeded generator so the cell is reproducible
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y);

In [None]:
# Choose the model class and instantiate it with its hyperparameters
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model

In [None]:
# Arrange the 1-D samples as a feature matrix with one column
X = x[:, np.newaxis]
X.shape

In [None]:
# Fit the model to the data
model.fit(X, y)

In [None]:
# Fitted slope (the data were generated with slope 2)
model.coef_

In [None]:
# Fitted intercept (the data were generated with intercept -1)
model.intercept_

In [None]:
# Regenerate the same data (same seed as before) and overlay the fitted line,
# computed directly from the fitted slope and intercept
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y)
plt.plot(x, x*model.coef_+model.intercept_, 'r-')

## Clasificacion

In [None]:
# Split the iris data into training and test sets (fixed random_state for reproducibility).
# train_test_split lives in sklearn.model_selection; the original
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris,
                                                random_state=1)

In [None]:
# Gaussian naive Bayes classifier on the iris split.
# NOTE: this rebinds `model`, which previously held the linear regression.
from sklearn.naive_bayes import GaussianNB # 1. choose model class
model = GaussianNB()                       # 2. instantiate model
model.fit(Xtrain, ytrain)                  # 3. fit model to data
y_model = model.predict(Xtest)             # 4. predict on new data

In [None]:
# Accuracy of the predictions against the true test labels
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_model)

In [None]:
# Dimensionality reduction: project the iris features onto two principal components
from sklearn.decomposition import PCA  # 1. Choose the model class
# principal component analysis
model = PCA(n_components=2)            # 2. Instantiate the model with hyperparameters
model.fit(X_iris)                      # 3. Fit to data. Notice y is not specified!
X_2D = model.transform(X_iris)         # 4. Transform the data to two dimensions

In [None]:
# Store the two principal components as new columns of `iris` (modifies it in place)
iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.lmplot(x="PCA1", y="PCA2", hue='species', data=iris, fit_reg=False);

## Clustering

In [None]:
# Gaussian mixture clustering. GaussianMixture replaces the GMM class, which was
# removed from scikit-learn in 0.20.
from sklearn.mixture import GaussianMixture  # 1. Choose the model class
model = GaussianMixture(n_components=3,
                        covariance_type='full',
                        random_state=0)      # 2. Instantiate the model with hyperparameters
                                             #    (random_state makes the clustering reproducible)
model.fit(X_iris)                            # 3. Fit to data. Notice y is not specified!
y_gmm = model.predict(X_iris)                # 4. Determine cluster labels

In [None]:
# Store the cluster label of each sample, then draw one panel per cluster,
# coloured by the true species.
iris['cluster'] = y_gmm
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.lmplot(x="PCA1", y="PCA2", data=iris, hue='species',
           col='cluster', fit_reg=False);

In [None]:
### Ejemplo

In [None]:
# Load the handwritten-digits dataset and inspect the shape of its image array
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape

In [None]:
import matplotlib.pyplot as plt

# 10 x 10 grid of axes without ticks, to show the first 100 digit images
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))

for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
    # True label in green, positioned in axes coordinates (lower-left corner)
    ax.text(0.05, 0.05, str(digits.target[i]),
            transform=ax.transAxes, color='green')

In [None]:
# Feature matrix of the digits dataset.
# NOTE: this rebinds `X`, previously the regression feature matrix.
X = digits.data
X.shape

In [None]:
# Target vector: the digit label of each sample
y = digits.target
y.shape

In [None]:
# Reduce the digit features to two dimensions with Isomap
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
iso.fit(digits.data)
data_projected = iso.transform(digits.data)
data_projected.shape

In [None]:
# 2-D projection coloured by digit label, using a 10-level discrete colormap.
# plt.get_cmap replaces plt.cm.get_cmap, which recent matplotlib releases removed.
plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,
            edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('Spectral', 10))
plt.colorbar(label='digit label', ticks=range(10))
# Colour limits at -0.5 .. 9.5 centre each integer tick on its colour band
plt.clim(-0.5, 9.5);

In [None]:
# Split the digits data into training and test sets (train_test_split was imported in an earlier cell)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)

In [None]:
# Fit a Gaussian naive Bayes classifier on the digits and predict the test labels
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)

In [None]:
# Accuracy of the predictions against the true test labels
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_model)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix computed from the true labels (first argument) and the predictions
mat = confusion_matrix(ytest, y_model)

# Annotated heatmap of the counts (sns was imported in an earlier cell)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value');

In [None]:
# 10 x 10 grid showing the first 100 test images with their predicted labels
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))

# Reshape the flat feature vectors back into 8 x 8 images
test_images = Xtest.reshape(-1, 8, 8)

for i, ax in enumerate(axes.flat):
    ax.imshow(test_images[i], cmap='binary', interpolation='nearest')
    # Predicted label: green when it matches the true label, red otherwise
    ax.text(0.05, 0.05, str(y_model[i]),
            transform=ax.transAxes,
            color='green' if (ytest[i] == y_model[i]) else 'red')