In [34]:
# Importaciones necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = None

In [35]:
# Echemos un vistazo preliminar al fichero
!cat "Ficheros/Manchas/SN_d_tot_V2.0.csv" | head

1818;01;01;1818.001;  -1; -1.0;   0;1
1818;01;02;1818.004;  -1; -1.0;   0;1
1818;01;03;1818.007;  -1; -1.0;   0;1
1818;01;04;1818.010;  -1; -1.0;   0;1
1818;01;05;1818.012;  -1; -1.0;   0;1
1818;01;06;1818.015;  -1; -1.0;   0;1
1818;01;07;1818.018;  -1; -1.0;   0;1
1818;01;08;1818.021;  65; 10.2;   1;1
1818;01;09;1818.023;  -1; -1.0;   0;1
1818;01;10;1818.026;  -1; -1.0;   0;1
cat: write error: Broken pipe


In [36]:
# Con la descripción de los datos que tenemos (ver "SN_d_tot_V2.0_description.txt") y 
# sabiendo que los NaN se coresponden con el valor -1 podemos ya cargarlo:
columnas = ['Year', 'Month', 'Day', 'Fraction', 'Spots', 'Stdev', 'Observ', 'Def/Prov']
df = pd.read_csv('Ficheros/Manchas/SN_d_tot_V2.0.csv', sep = ';', header = None, names = columnas, na_values = [-1])
df.head(10)

Unnamed: 0,Year,Month,Day,Fraction,Spots,Stdev,Observ,Def/Prov
0,1818,1,1,1818.001,-1,,0,1
1,1818,1,2,1818.004,-1,,0,1
2,1818,1,3,1818.007,-1,,0,1
3,1818,1,4,1818.01,-1,,0,1
4,1818,1,5,1818.012,-1,,0,1
5,1818,1,6,1818.015,-1,,0,1
6,1818,1,7,1818.018,-1,,0,1
7,1818,1,8,1818.021,65,10.2,1,1
8,1818,1,9,1818.023,-1,,0,1
9,1818,1,10,1818.026,-1,,0,1


In [37]:
# No parece que nos haya transformado en NaN los datos de 'Spots', debe haber espacios en esa columna
columnas = ['Year', 'Month', 'Day', 'Fraction', 'Spots', 'Stdev', 'Observ', 'Def/Prov']
df = pd.read_csv('Ficheros/Manchas/SN_d_tot_V2.0.csv', sep = r'\s*;\s*', header = None, names = columnas, na_values =[-1],
                engine = 'python')
df.head(10)

Unnamed: 0,Year,Month,Day,Fraction,Spots,Stdev,Observ,Def/Prov
0,1818,1,1,1818.001,,,0,1
1,1818,1,2,1818.004,,,0,1
2,1818,1,3,1818.007,,,0,1
3,1818,1,4,1818.01,,,0,1
4,1818,1,5,1818.012,,,0,1
5,1818,1,6,1818.015,,,0,1
6,1818,1,7,1818.018,,,0,1
7,1818,1,8,1818.021,65.0,10.2,1,1
8,1818,1,9,1818.023,,,0,1
9,1818,1,10,1818.026,,,0,1


In [38]:
# Ahora si... Contemos registros
df.count

<bound method DataFrame.count of        Year  Month  Day  Fraction  Spots  Stdev  Observ  Def/Prov
0      1818      1    1  1818.001    NaN    NaN       0         1
1      1818      1    2  1818.004    NaN    NaN       0         1
2      1818      1    3  1818.007    NaN    NaN       0         1
3      1818      1    4  1818.010    NaN    NaN       0         1
4      1818      1    5  1818.012    NaN    NaN       0         1
...     ...    ...  ...       ...    ...    ...     ...       ...
73834  2020      2   25  2020.152    0.0    0.0      25         0
73835  2020      2   26  2020.154    0.0    0.0      30         0
73836  2020      2   27  2020.157    0.0    0.0      28         0
73837  2020      2   28  2020.160    0.0    0.0      32         0
73838  2020      2   29  2020.163    0.0    0.0      23         0

[73839 rows x 8 columns]>

In [39]:
# Vemos el número de NaN y hasta donde llegan
df.isna().sum()
df[df.isnull().any(1)]

Unnamed: 0,Year,Month,Day,Fraction,Spots,Stdev,Observ,Def/Prov
0,1818,1,1,1818.001,,,0,1
1,1818,1,2,1818.004,,,0,1
2,1818,1,3,1818.007,,,0,1
3,1818,1,4,1818.010,,,0,1
4,1818,1,5,1818.012,,,0,1
...,...,...,...,...,...,...,...,...
11296,1848,12,5,1848.928,,,0,1
11306,1848,12,15,1848.955,,,0,1
11307,1848,12,16,1848.958,,,0,1
11310,1848,12,19,1848.966,,,0,1


In [18]:
# Son relativamente pocas filas y corresponden a años lejanos, las eliminamos sin más
df.dropna(inplace = True)

In [19]:
# Solo nos interesan las columnas de fecha (Year, Month, Day) y el número de manchas (Spots). Eliminamos el resto
df.drop(['Fraction', 'Stdev', 'Observ', 'Def/Prov'], axis = 'columns', inplace = True)

In [20]:
# Crearemos una columna en formato fecha para operaciones posteriores
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])

In [21]:
# Listo! Lo guardamos para más tarde
df.to_csv('Spots_new.csv', index = False)