In [1]:
import pandas as pd
import numpy as np

### Clean data 
Estas primeras celdas se encargan de limpiar el dataset. Las variables que vamos a arreglar o quitar son las siguientes:

    - Longitude
    - Latitude
    - Central Duration
    - Path Width (km)
    - Calendar Date
    - Eclipse Time
    - Catalog Number
    
#### Solar eclipses

In [2]:
# Folder that holds the eclipse CSV files (relative path).
data_folder = "./data"
solar_csv_path = data_folder + "/solar.csv"
solar_eclipses = pd.read_csv(solar_csv_path)

In [3]:
## 'Catalog Number' is only an identifier and adds no information to the
## dataset, so the column is removed from the frame in place.
solar_eclipses.drop(columns=['Catalog Number'], inplace=True)

In [4]:
def convert_longitude_to_numerical(longitude):
    """
    Convert a longitude such as "33.3W" to signed decimal degrees.

    Western longitudes become negative and every other hemisphere letter is
    treated as positive, so a valid longitude maps into [-180°, 180°].

    Parameters
    ----------
    longitude : str or number
        Degrees followed by a hemisphere letter ("E" or "W", any case,
        surrounding whitespace allowed). A string without a hemisphere
        letter is parsed as an already signed value. Non-string values
        (numbers, NaN) are returned as ``float`` unchanged, which keeps the
        conversion safe to apply to an already converted column.

    Returns
    -------
    float
        Signed decimal degrees.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    IndexError
        If the string is empty.
    """
    # Already numeric (e.g. the cell was re-run, or the value is NaN).
    if not isinstance(longitude, str):
        return float(longitude)

    text = longitude.strip()
    hemisphere = text[-1].upper()
    if not hemisphere.isalpha():
        # No hemisphere suffix: slicing off the last character would silently
        # drop a digit, so parse the whole string instead.
        return float(text)

    degrees = float(text[:-1])
    if hemisphere == "W":
        return degrees*-1
    return degrees

# Replace the textual longitudes with signed decimal degrees.
solar_eclipses['Longitude'] = solar_eclipses['Longitude'].apply(convert_longitude_to_numerical)

In [5]:
def convert_latitude_to_numerical(latitude):
    """
    Convert a latitude such as "32.9S" to signed decimal degrees.

    Southern latitudes become negative and every other hemisphere letter is
    treated as positive, so a valid latitude maps into [-90°, 90°].

    Parameters
    ----------
    latitude : str or number
        Degrees followed by a hemisphere letter ("N" or "S", any case,
        surrounding whitespace allowed). A string without a hemisphere
        letter is parsed as an already signed value. Non-string values
        (numbers, NaN) are returned as ``float`` unchanged, which keeps the
        conversion safe to apply to an already converted column.

    Returns
    -------
    float
        Signed decimal degrees.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    IndexError
        If the string is empty.
    """
    # Already numeric (e.g. the cell was re-run, or the value is NaN).
    if not isinstance(latitude, str):
        return float(latitude)

    text = latitude.strip()
    hemisphere = text[-1].upper()
    if not hemisphere.isalpha():
        # No hemisphere suffix: slicing off the last character would silently
        # drop a digit, so parse the whole string instead.
        return float(text)

    degrees = float(text[:-1])
    if hemisphere == "S":
        return degrees*-1
    return degrees

# Replace the textual latitudes with signed decimal degrees.
solar_eclipses['Latitude'] = solar_eclipses['Latitude'].apply(convert_latitude_to_numerical)

In [6]:
def convert_central_duration_to_seconds(central_duration):
    """
    Convert a central duration such as "06m37s" to a number of seconds.

    Parameters
    ----------
    central_duration : str, float or None
        Duration written as "<minutes>m<seconds>s". The minutes part, the
        seconds part and the trailing "s" are each optional, and surrounding
        whitespace is ignored. "-" or an empty string marks a missing value.

    Returns
    -------
    int or None
        Total seconds, or None when the value is missing (None, any float
        such as NaN, "-" or an empty string).

    Raises
    ------
    ValueError
        If the minutes or seconds part is not an integer.
    """
    # Any float is treated as a missing value (this is how NaN arrives).
    if central_duration is None or isinstance(central_duration, float):
        return None

    text = central_duration.strip().lower()
    if text in ("", "-"):
        return None

    minutes_text, separator, seconds_text = text.partition("m")
    if not separator:
        # No minutes component at all, e.g. "45s".
        minutes_text, seconds_text = "0", text

    seconds_text = seconds_text.rstrip("s").strip()
    minutes = int(minutes_text)
    seconds = int(seconds_text) if seconds_text else 0
    return (minutes*60) + seconds

# Replace the textual central durations with a number of seconds.
solar_eclipses['Central Duration'] = solar_eclipses['Central Duration'].apply(convert_central_duration_to_seconds)

In [7]:
def fix_path_width(path_width):
    """
    Convert a path width value to a float, or None when it is missing.

    Parameters
    ----------
    path_width : str, number or None
        Width as text or as a number. None, "-" and an empty string (with
        or without surrounding whitespace) mark a missing value.

    Returns
    -------
    float or None
        The width as a float, or None for a missing value. A NaN input is
        returned as NaN.

    Raises
    ------
    ValueError
        If the text is not a valid number.
    """
    if path_width is None:
        return None
    if isinstance(path_width, str):
        # Normalise padding so that values like " - " are still recognised
        # as the missing-value marker.
        path_width = path_width.strip()
        if path_width in ("", "-"):
            return None
    return float(path_width)

# Replace the textual path widths with floats (missing values become empty).
solar_eclipses['Path Width (km)'] = solar_eclipses['Path Width (km)'].apply(fix_path_width)

In [8]:
# Preview the first five rows of the frame after the conversions above.
solar_eclipses.head(n=5)

Unnamed: 0,Calendar Date,Eclipse Time,Delta T (s),Lunation Number,Saros Number,Eclipse Type,Gamma,Eclipse Magnitude,Latitude,Longitude,Sun Altitude,Sun Azimuth,Path Width (km),Central Duration
0,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0,-33.3,74,344,247.0,397.0
1,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,-32.9,10.8,76,21,236.0,404.0
2,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2,83.4,60,151,111.0,135.0
3,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,-67.8,-143.8,25,74,162.0,74.0
4,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.467,0.1611,-60.6,-106.4,0,281,,


In [9]:
# Write the frame to a new CSV file, leaving out the index column.
solar_clean_csv_path = data_folder + "/solar_clean.csv"
solar_eclipses.to_csv(solar_clean_csv_path, index=False)