# Analisis de Datos

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter
formatter = EngFormatter(places=0, sep=u"\N{THIN SPACE}")  # U+2009


## Cargar Datos

In [2]:
import datetime
import pandas as pd

def leer_data():
    outfile='./data/consulta.csv'
    data = pd.read_csv(outfile)
    return data

data = leer_data()
print('Done')

Done


In [3]:
data.head()

Unnamed: 0,client_id,date,año,mes,dia,hora,merchant_departement,merchant_province,mcc,mccg,quantity,amount_sol
0,+++g8j9k+5A=,2016-09-27 01:37:23,2016,9,27,1,LIMA,LIMA,San Juan De Miraflor,15,1,107.0
1,+++g8j9k+5A=,2016-06-24 03:35:00,2016,6,24,3,LIMA,LIMA,Magdalena Del Mar,15,1,58.0
2,+++g8j9k+5A=,2017-04-29 03:15:16,2017,4,29,3,LIMA,LIMA,Miraflores,17,1,153.0
3,+++g8j9k+5A=,2017-03-01 05:47:02,2017,3,1,5,LIMA,LIMA,Miraflores,17,1,110.0
4,+++g8j9k+5A=,2016-06-24 03:36:43,2016,6,24,3,LIMA,LIMA,Magdalena Del Mar,15,1,47.0


In [4]:
temp = data

In [5]:
temp['date'] = pd.to_datetime(temp['date'])
temp['dia_semana'] = temp['date'].dt.dayofweek
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}
temp['dia_semana'] = temp['dia_semana'].apply(lambda x: days[x])

In [6]:
days = {0:'1-Madrugada',1:'1-Madrugada',2:'1-Madrugada',3:'1-Madrugada',4:'1-Madrugada',5:'1-Madrugada',
        6:'2-Mañana',7:'2-Mañana',8:'2-Mañana',9:'2-Mañana',10:'2-Mañana',11:'2-Mañana',
        12:'3-Tarde',13:'3-Tarde',14:'3-Tarde',15:'3-Tarde',16:'3-Tarde',17:'3-Tarde',
        18:'4-Noche',19:'4-Noche',20:'4-Noche',21:'4-Noche',22:'4-Noche',23:'4-Noche'}
temp['turno'] = temp['hora'].apply(lambda x: days[x])

In [7]:
data =  temp
data.head(3)

Unnamed: 0,client_id,date,año,mes,dia,hora,merchant_departement,merchant_province,mcc,mccg,quantity,amount_sol,dia_semana,turno
0,+++g8j9k+5A=,2016-09-27 01:37:23,2016,9,27,1,LIMA,LIMA,San Juan De Miraflor,15,1,107.0,Tues,1-Madrugada
1,+++g8j9k+5A=,2016-06-24 03:35:00,2016,6,24,3,LIMA,LIMA,Magdalena Del Mar,15,1,58.0,Fri,1-Madrugada
2,+++g8j9k+5A=,2017-04-29 03:15:16,2017,4,29,3,LIMA,LIMA,Miraflores,17,1,153.0,Sat,1-Madrugada


In [8]:
y = list(data.groupby(['merchant_district']).sum()['amount_sol'])
x = list(data.groupby(['merchant_district']).sum()['amount_sol'] .index)
x[y.index(min(y))]

KeyError: 'merchant_district'

In [None]:
y = list(data.groupby(['merchant_district']).sum()['amount_sol'])
x = list(data.groupby(['merchant_district']).sum()['amount_sol'] .index)
plt.pie(y, labels=x, autopct='%1.1f%%', shadow=False, startangle=230)
plt.axis('equal') 
for i in range(len(y)):
    print(i,':  ',y[i],'     ',x[i])

In [None]:
data = data.loc[data['merchant_district']=='Lurigancho']
data.head(3)

## Graficos

### Comportamiento de Gastos totales por meses

In [None]:
# Grafico por Meses - Cantidad

nn= data.groupby(['año','mes'], as_index=False, sort=True)['client_id'].count()

fig, ax = plt.subplots()
ax.bar(nn.index, nn['client_id'], align='center')
ax.grid(color='gray', linestyle='--', linewidth=1)

ax.set_xlabel('Año')
ax.set_xticks(nn.index)

ax.set_ylabel('Frecuencia')
ax.set_yticks(nn['client_id'])
ax.set_title('Histograma')
plt.show()
nn

In [None]:
# Grafico por Meses - Montos 

nn= data.groupby(['año','mes'], as_index=False, sort=True)['amount_sol'].sum()

fig, ax = plt.subplots()
ax.bar(nn.index, nn['amount_sol'], align='center')
ax.grid(color='gray', linestyle='-', linewidth=1)

ax.set_xlabel('Año')
ax.set_xticks(nn.index)

ax.set_ylabel('Monto Total')
ax.set_yticks(nn['amount_sol'])
ax.set_title('Histograma')
plt.show()
nn

In [None]:
data.head(3)

### Distribucion de Soles

In [None]:
box = dict(facecolor='#cccc00', pad=4, alpha=0.2)
fig, axes = plt.subplots(2, 2)
ax1, ax2, ax3, ax4 = axes.flatten()
plt.subplots_adjust(top=1, bottom=0.08, left=0.10, right=0.95, hspace=0.55, wspace=0.55)


ax1.xaxis.set_major_formatter(formatter)
ax1.yaxis.set_major_formatter(formatter)
y = data.groupby(['client_id']).count()      # Us por cliente
y = y["mcc"].tolist()
mu = np.mean(y)
median = np.median(y)
sigma = np.var(y)
text = '$\mu=%.2f$\n$\mathrm{median}=%.2f$\n$\sigma=%.2f$' % (mu, median, sigma)
ax1.hist(y,100)
ax1.set_title('Cantidad de TXs mas Frecuentes')
ax1.set_xlabel('TXs Sessions', bbox=box)
ax1.set_ylabel('N° de Clientes', bbox=box)
ax1.text(0.27, 0.85, text, transform=ax1.transAxes, fontsize=8, verticalalignment='top', bbox=box)


ax2.xaxis.set_major_formatter(formatter)
ax2.yaxis.set_major_formatter(formatter)
y = list(data.groupby(['hora']).sum()['amount_sol'])
x = range(len(y))
ax2.plot(x,y,'-',color='#66ff1a')
ax2.plot(x,y,'ob',color='green')
ax2.set_title('Flujo del dinero por Horas')
ax2.set_xlabel('Horas', bbox=box)
ax2.set_ylabel('Soles', bbox=box)


y = data.groupby(['dia_semana']).sum()['amount_sol']
x = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
ax3.yaxis.set_major_formatter(formatter)
ax3.set_xticklabels(x, rotation=45)
ax3.bar(x,y,0.7,color='#ff8080')
ax3.plot(x,y,'-',color='#ff0000')
ax3.plot(x,y,'ob',color='#990000')
ax3.set_title('Flujo del dinero por Dia')
ax3.set_xlabel('Horas', bbox=box)
ax3.set_ylabel('Soles', bbox=box)


y = list(data.groupby(['turno']).sum()['amount_sol'])
x = list(data.groupby(['turno']).sum()['amount_sol'] .index)
ax4.yaxis.set_major_formatter(formatter)
ax4.set_xticklabels(x, rotation=45)
ax4.plot(x,y,'-',color='#66ff1a')
ax4.plot(x,y,'ob',color='green')
ax4.set_title('Flujo del dinero por Turno')
ax4.set_xlabel('Turnos', bbox=box)
ax4.set_ylabel('Soles', bbox=box)


plt.savefig("imagen.png",dpi = 1000)

### Distribucion de Cantidades de TXs

In [None]:
box = dict(facecolor='#ff6666', pad=4, alpha=0.2)
fig, axes = plt.subplots(2, 2)
ax1, ax2, ax3, ax4 = axes.flatten()
plt.subplots_adjust(top=1, bottom=0.08, left=0.10, right=0.95, hspace=0.55, wspace=0.55)

"""
y = list(data.groupby(['merchant_district']).sum()['amount_sol'])
x = list(data.groupby(['merchant_district']).sum()['amount_sol'] .index)
ax1.pie(y, labels=x, autopct='%1.1f%%', shadow=False, startangle=230)
ax1.axis('equal') 
"""
ax1.xaxis.set_major_formatter(formatter)
ax1.yaxis.set_major_formatter(formatter)
y = list(data.groupby(['hora']).mean()['amount_sol'])
y_max = list(data.groupby(['hora']).max()['amount_sol'])
x = range(len(y))
# ax1.plot(x,y_max,'-',color='#66ff1a')
ax1.plot(x,y,'-',color='#66ff1a')
ax1.plot(x,y,'ob',color='green')
ax1.set_title('Gasto por Horas')
ax1.set_xlabel('Horas', bbox=box)
ax1.set_ylabel('Gasto promedio', bbox=box)


ax2.xaxis.set_major_formatter(formatter)
ax2.yaxis.set_major_formatter(formatter)
y = list(data.groupby(['hora']).count()['quantity'])
x = range(len(y))
ax2.plot(x,y,'-',color='#66ff1a')
ax2.plot(x,y,'ob',color='green')
ax2.set_title('N° de TXs por Horas')
ax2.set_xlabel('Horas', bbox=box)
ax2.set_ylabel('Cantidad de TXs', bbox=box)


y = data.groupby(['dia_semana']).count()['quantity']
x = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
ax3.yaxis.set_major_formatter(formatter)
ax3.set_xticklabels(x, rotation=45)
ax3.bar(x,y,0.7,color='#ff8080')
ax3.plot(x,y,'-',color='#ff0000')
ax3.plot(x,y,'ob',color='#990000')
ax3.set_title('N° de TXs por Dia')
ax3.set_xlabel('Horas', bbox=box)
ax3.set_ylabel('Cantidad de TXs', bbox=box)


y = list(data.groupby(['turno']).count()['quantity'])
x = list(data.groupby(['turno']).count()['quantity'] .index)
ax4.yaxis.set_major_formatter(formatter)
ax4.set_xticklabels(x, rotation=45)
ax4.plot(x,y,'-',color='#66ff1a')
ax4.plot(x,y,'ob',color='green')
ax4.set_title('N° de TXs por Turno (U)')
ax4.set_xlabel('Turnos', bbox=box)
ax4.set_ylabel('Cantidad de TXs', bbox=box)


plt.savefig("imagen.png",dpi = 1000)