# Análisis de datos con Python Pandas

In [None]:
# pandas is an open source, BSD-licensed library providing high-performance, 
# easy-to-use data structures and data analysis tools for the Python programming language.
import pandas as pd

## 1. Series

**Series** is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.).

In [None]:
# A Series pairs each value with a label; here the integers 1..5 are
# labelled 'a'..'e'. The trailing bare name displays the result.
s = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
s

## 2. DataFrame
**DataFrame** is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects.

In [None]:
# Each tuple is one row of the table: (value for column 'A', value for column 'B').
example_data = [
    (1, 2),
    (2, 4),
    (3, 9),
]
columns = ['A', 'B']    # column labels
rows = ['A', 'B', 'C']  # row (index) labels, one per tuple above

In [None]:
# Assemble the DataFrame from the row tuples, labelling rows and columns
# explicitly; the trailing bare name displays it.
df = pd.DataFrame(example_data, index=rows, columns=columns)
df

### Examinar contenido del DataFrame

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the small example DataFrame.
df.describe()

## 3. Analizar datos de un archivo CSV

In [None]:
# Load the movie metadata CSV into df, replacing the toy DataFrame built in
# the previous section. The path is relative to the current working directory.
df = pd.read_csv('datasets/movie_metadata.csv')

### Obtener las columnas que conforman el DataFrame

In [None]:
# Column labels of the loaded DataFrame.
df.columns

In [None]:
# Display the DataFrame itself.
# NOTE(review): with default display options pandas abbreviates large frames
# in the rendered output — confirm this is the intended view.
df

### Eliminar datos nulos

In [None]:
# Preview the first 8 rows before rows with missing values are dropped.
df.head(8)

In [None]:
# (rows, columns) before dropping rows with missing values; compare with the
# shape printed after dropna in the following cell.
df.shape

In [None]:
# Discard every row that has at least one missing value in any column,
# then report the resulting (rows, columns) shape.
df = df.dropna(axis=0, how='any')
df.shape

### Utilizar un conjunto de características en particular del DataFrame

In [None]:
# Subset of columns kept for the rest of this section, in display order.
columns = [
    'movie_title',
    'director_name',
    'duration',
    'genres',
    'language',
    'country',
    'content_rating',
    'budget',
    'imdb_score',
]

In [None]:
# Keep all rows but only the columns listed in `columns`, in that order.
df = df.loc[:, columns]

In [None]:
# (rows, columns) after restricting the frame to the selected columns.
df.shape

### Búsqueda por índice

In [None]:
# Position-based selection: every row, the first two columns (positions 0 and 1).
df.iloc[:, :2]

### Búsqueda por nombre

In [None]:
# Label-based selection: every row, three columns picked by name.
df.loc[:, ['movie_title', 'director_name', 'imdb_score']]

## 4. Caso de estudio

In [None]:
import numpy as np # NumPy is the fundamental package for scientific computing with Python
import matplotlib.pyplot as plt # Matplotlib is a Python 2D plotting library
import matplotlib.dates as mdates
import matplotlib.units as munits
from pandas.plotting import register_matplotlib_converters

# Register pandas' date/time converters with matplotlib so that datetime
# values (including np.datetime64) can be plotted on an axis.
# This public API replaces the former
#   from pandas.tseries import converter
#   munits.registry[np.datetime64] = converter.DatetimeConverter()
# idiom: the pandas.tseries.converter module was deprecated and is not
# available in current pandas releases, so importing it fails on a fresh kernel.
register_matplotlib_converters()

### Crear encabezados para el conjunto de datos

In [None]:
# Column names to assign to the prototype dataset when it is loaded.
data_cols = [
    'node_id',
    'date_time',
    'temperature',
    'humidity',
]

In [None]:
# header=None: treat the first line of the file as data rather than as a
# header row; the column labels come from data_cols instead.
df = pd.read_csv(
    "datasets/prototype.csv",
    names=data_cols,
    header=None,
)

### Examinar las características de los datos

In [None]:
# First five rows of the prototype dataset, labelled with data_cols.
df.head()

In [None]:
# Per-column dtypes as inferred by read_csv; date_time is converted with
# pd.to_datetime in the following cell.
df.dtypes

### Conversión de datos

In [None]:
# Parse the date_time column into pandas datetime values (overwriting the
# column), then re-check the dtypes to confirm the conversion.
df['date_time'] = pd.to_datetime(df['date_time'])
df.dtypes

### Filtrado de datos

In [None]:
# Keep only the rows whose date_time lies strictly between the two bounds
# (both ends exclusive), then preview up to 15 of them.
window_start = '2014-03-31 12:00'
window_end = '2014-03-31 12:06'
in_window = (df['date_time'] > window_start) & (df['date_time'] < window_end)
df = df[in_window]
df.head(15)

### Agrupación de datos

In [None]:
# Descriptive statistics computed separately for each node_id.
df.groupby(df.node_id).describe()

### Seleccionar datos por Node ID

In [None]:
# Node 1
n1 = df[(df.node_id == 1)]
x1 = n1.date_time
y1 = n1.temperature
# Node 2
n2 = df[(df.node_id == 2)]
x2 = n2.date_time
y2 = n2.temperature
# Node 3
n3 = df[(df.node_id == 3)]
x3 = n3.date_time
y3 = n3.temperature

### Configurar Gráfica

In [None]:
fig, ax = plt.subplots()
# One line per node, each with its own marker so the series stay distinguishable.
for xs, ys, marker, label in (
    (x1, y1, '.', 'Node 1'),
    (x2, y2, '*', 'Node 2'),
    (x3, y3, 'v', 'Node 3'),
):
    ax.plot(xs, ys, marker=marker, label=label)
# Show only hour:minute on the time axis and let matplotlib slant the tick labels.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax.grid(True)
fig.autofmt_xdate()
# Legend and axis labels are set on the Axes object directly rather than
# through the pyplot current-axes state.
ax.legend()
ax.set_xlabel('Tiempo (H:M)')
ax.set_ylabel('Temperatura')

### Imprimir Gráfica

In [None]:
# Render the figure configured in the previous cell.
# NOTE(review): with the inline notebook backend a figure is typically drawn
# at the end of the cell that created it, so this call may produce no
# additional output — confirm against the backend in use.
plt.show()