## Leer datos con la función **read_csv**

https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

In [3]:
import pandas as pd
import numpy as np
import os

In [8]:
data = pd.read_csv("datasets/customer-churn-model/Customer Churn Model.csv") #filepath


## Parámetros de la función **read_csv** (30 argumentos)

In [9]:
data.head()

Unnamed: 0,"State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?"
0,"KS,128,415,382-4657,no,yes,25,265.100000,110,4..."
1,"OH,107,415,371-7191,no,yes,26,161.600000,123,2..."
2,"NJ,137,415,358-1921,no,no,0,243.400000,114,41...."
3,"OH,84,408,375-9999,yes,no,0,299.400000,71,50.9..."
4,"OK,75,415,330-6626,yes,no,0,166.700000,113,28...."


### Ejemplos de los parámetros de la función read_csv
```
read.csv(filepath="datasets/customer-churn-model/Customer Churn Model.csv/titanic3.csv",
        sep = ",",         
        dtype={"ingresos":np.float64, "edad":np.int32}, 
        header=0,names={"ingresos", "edad"},
        skiprows=12, 
        index_col=None, 
        skip_blank_lines=False, 
        na_filter=False
        )
```
1. **sep** separado por comas
2. **dtype** columna ingresos de tipo flotante. El valor determinado es None, quiere decir que panda asignara el tipo que mas le convenga
3. **header** donde esta la cabecera, y cuales quieres utilizar
4. **skiperows** saltar filas 
5. **index_col** cambiar la columna del index
6. **skipe_blank** saltar lineas en blancos
7. **na_filter** elimina NaN

In [10]:
mainpath = "datasets/" #carpeta global
filename = "customer-churn-model/Customer Churn Model.csv" #dataset
fullpath = os.path.join(mainpath, filename)
data = pd.read_csv (fullpath)
data2 = pd.read_csv(mainpath + "/" + "customer-churn-model/Customer Churn Model.txt")
data2.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [9]:
data2.columns.values

array(['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan",
       'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls',
       'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins',
       'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls',
       'Intl Charge', 'CustServ Calls', 'Churn?'], dtype=object)

## Cambiar la cabecera a data2

In [10]:
# Leer un archivo con cabecera
data_cols = pd.read_csv(mainpath + "/" + "customer-churn-model/Customer Churn Columns.csv")
# Convertirlo a una lista
data_col_list = data_cols["Column_Names"].tolist()
# Quitar la cabecera original (header=None) y añadir la lista como cabecra (names=data_col_list)
data2 = pd.read_csv(mainpath + "/" + "customer-churn-model/Customer Churn Model.txt", header=None, names=data_col_list)
#data2.columns.values
data2.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,L,M,N,O,P,Q,R,S,T,U
0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
1,KS,128,415,382-4657,no,yes,25,265.100000,110,45.070000,...,99,16.780000,244.700000,91,11.010000,10.000000,3,2.700000,1,False.
2,OH,107,415,371-7191,no,yes,26,161.600000,123,27.470000,...,103,16.620000,254.400000,103,11.450000,13.700000,3,3.700000,1,False.
3,NJ,137,415,358-1921,no,no,0,243.400000,114,41.380000,...,110,10.300000,162.600000,104,7.320000,12.200000,5,3.290000,0,False.
4,OH,84,408,375-9999,yes,no,0,299.400000,71,50.900000,...,88,5.260000,196.900000,89,8.860000,6.600000,7,1.780000,2,False.


## Carga de datos a través de la función **open**

Leer el archivo de datos fila por fila en un loop, almacenar parte del conjunto de datos y cerrar el archivo

1. **r** solo lectura ("a" (append), posicionará el cursor al final y no borrará el contenido del mismo.)
2. **strip** se utiliza para eliminar los espacios en blanco al inicio y al final de la linea
3. **split** divide la linea de texto
4. **len(cols) es el número de columnas



In [11]:
data3 = open(mainpath + "/" + "customer-churn-model/Customer Churn Model.txt",'r')
cols = data3.readline().strip().split(",")
n_cols = len(cols)
counter = 0 #inicializar contador
main_dict = {} #definir diccionario con nombre de las columnas
for col in cols:
    main_dict[col] = [] #vacio
main_dict

{'State': [],
 'Account Length': [],
 'Area Code': [],
 'Phone': [],
 "Int'l Plan": [],
 'VMail Plan': [],
 'VMail Message': [],
 'Day Mins': [],
 'Day Calls': [],
 'Day Charge': [],
 'Eve Mins': [],
 'Eve Calls': [],
 'Eve Charge': [],
 'Night Mins': [],
 'Night Calls': [],
 'Night Charge': [],
 'Intl Mins': [],
 'Intl Calls': [],
 'Intl Charge': [],
 'CustServ Calls': [],
 'Churn?': []}

In [12]:
for line in data3: # interación sobre las filas de los datos
    values = line.strip().split(",") #quita espacios sobrantes y divide por comas
    for i in range(n_cols): #intera sobre las columnas
        main_dict[cols[i]].append(values[i]) #agrega las filas a cada columna 
    counter += 1 #cuenta cuantas filas hay
   
print("El conjunto de datos tiene %d filas y %d columnas"%(counter-1 , n_cols))

El conjunto de datos tiene 3332 filas y 21 columnas


In [13]:
df3 = pd.DataFrame(main_dict) #crear un dat frame
df3.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


## Lectura y escritura archivos: tabuladores

In [14]:
infile = mainpath + "/" + "customer-churn-model/Customer Churn Model.txt"
outfile = mainpath + "/" + "customer-churn-model/Table Customer Churn Model.txt"
with open(infile, "r") as infile1:
    with open(outfile, "w") as outfile1:
        for line in infile1:
            fields = line.strip().split(",")
            outfile1.write("\t".join(fields)) #tabulador
            outfile1.write("\n") #salto de linea

In [15]:
df4 = pd.read_csv(outfile, sep = "\t")
df4.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


## Leer datos desde una URL

In [16]:
medals_url = "http://winterolympicsmedals.com/medals.csv"
medals_data = pd.read_csv(medals_url)
medals_data.head()

Unnamed: 0,Year,City,Sport,Discipline,NOC,Event,Event gender,Medal
0,1924,Chamonix,Skating,Figure skating,AUT,individual,M,Silver
1,1924,Chamonix,Skating,Figure skating,AUT,individual,W,Gold
2,1924,Chamonix,Skating,Figure skating,AUT,pairs,X,Gold
3,1924,Chamonix,Bobsleigh,Bobsleigh,BEL,four-man,M,Bronze
4,1924,Chamonix,Ice Hockey,Ice Hockey,CAN,ice hockey,M,Gold


#### Usando las bibliotecas de python **csv** y **urllib3**

https://docs.python.org/3/library/csv.html

https://urllib3.readthedocs.io/en/stable/

$ conda install -c conda-forge urllib3


Usando la librería urllib3 para leer los datos desde una URL externa, procesarlos y convertirlos a un data frame de *python* antes de guardarlos en un CSV local.

In [18]:
import csv
import urllib3

In [19]:
http = urllib3.PoolManager()
r = http.request('GET', medals_url)
r.status
response = r.data

In [21]:
cr = csv.reader(response)
#El objeto reponse contiene un string binario, así que lo convertimos a un string descodificándolo en UTF-8
str_data = response.decode("utf-8")
#Dividimos el string en un array de filas, separándolo por intros
lines = str_data.split("\n")
#La primera línea contiene la cabecera, así que la extraemos
col_names = lines[0].split(",")
n_cols = len(col_names)
counter = 0 #inicializar contador
main_dict = {} #definir diccionario con nombre de las columnas
for col in col_names:
    main_dict[col] = [] #vacio
main_dict
for line in lines:
    if(counter > 0):
        values = line.strip().split(",")
        for i in range(len(col_names)):
            main_dict[col_names[i]].append(values[i]) #agrega las filas a cada columna 
    counter += 1 #cuenta cuantas filas hay
   
print("El conjunto de datos tiene %d filas y %d columnas"%(counter-1, n_cols))
df5 = pd.DataFrame(main_dict) #crear un dat frame
#df5.head()
#Elegimos donde guardarlo (en la carpeta athletes es donde tiene más sentido por el contexto del análisis)
mainpath = "/home/isadoji/Storage/Work/ml2021/mludemy/datasets/" #carpeta global
filename = "athlets/athlets" #dataset
fullpath = os.path.join(mainpath, filename)

#Lo guardamos en CSV, en JSON o en Excel según queramos
df5.to_csv(fullpath+".csv")
df5.to_json(fullpath+".json")
#df5.to_excel(fullpath+".xls")
print("Los ficheros se han guardado correctamente en: "+fullpath)
   

El conjunto de datos tiene 2311 filas y 8 columnas
Los ficheros se han guardado correctamente en: /home/isadoji/Storage/Work/ml2021/mludemy/datasets/athlets/athlets


## Archivos XLS y XLSX

In [11]:
filename = "athlets/athlets.xls" #dataset
fullpath = os.path.join(mainpath, filename)
data = pd.read_excel(fullpath)
data

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.