In [1]:
# Importaciones necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [2]:
# Echemos un vistazo al fichero
!cat "Ficheros/Manchas/SN_d_tot_V2.0.csv" | head

1818;01;01;1818.001;  -1; -1.0;   0;1
1818;01;02;1818.004;  -1; -1.0;   0;1
1818;01;03;1818.007;  -1; -1.0;   0;1
1818;01;04;1818.010;  -1; -1.0;   0;1
1818;01;05;1818.012;  -1; -1.0;   0;1
1818;01;06;1818.015;  -1; -1.0;   0;1
1818;01;07;1818.018;  -1; -1.0;   0;1
1818;01;08;1818.021;  65; 10.2;   1;1
1818;01;09;1818.023;  -1; -1.0;   0;1
1818;01;10;1818.026;  -1; -1.0;   0;1
cat: write error: Broken pipe


In [3]:
# Using the data description (see "SN_d_tot_V2.0_description.txt") and knowing that
# missing values are encoded as -1 (0 is treated as missing here too), we can load the file.
# The file has no header row, so the column names are supplied explicitly.
columns = [
    'Year', 'Month', 'Day', 'Fraction',
    'Spots', 'Stdev', 'Observ', 'Def/Prov',
]
df = pd.read_csv(
    'Ficheros/Manchas/SN_d_tot_V2.0.csv',
    sep=';',
    header=None,
    names=columns,
    na_values=[0, -1],
)
df

Unnamed: 0,Year,Month,Day,Fraction,Spots,Stdev,Observ,Def/Prov
0,1818,1,1,1818.001,-1,,0,1.0
1,1818,1,2,1818.004,-1,,0,1.0
2,1818,1,3,1818.007,-1,,0,1.0
3,1818,1,4,1818.010,-1,,0,1.0
4,1818,1,5,1818.012,-1,,0,1.0
...,...,...,...,...,...,...,...,...
73834,2020,2,25,2020.152,0,,25,
73835,2020,2,26,2020.154,0,,30,
73836,2020,2,27,2020.157,0,,28,
73837,2020,2,28,2020.160,0,,32,


In [4]:
# The 'Spots' values were not converted to NaN by the plain ';' separator: the fields are
# padded with spaces. Reload using a regex separator that swallows the whitespace around
# each ';' (regex separators require the Python parsing engine).
# NOTE(review): na_values=[0, -1] applies to every column, so any 0 also becomes NaN and
# rows whose 'Spots' value is 0 are later dropped together with the -1 rows. Confirm that
# discarding zero-spot days is intended.
columns = [
    'Year', 'Month', 'Day', 'Fraction',
    'Spots', 'Stdev', 'Observ', 'Def/Prov',
]
df = pd.read_csv(
    'Ficheros/Manchas/SN_d_tot_V2.0.csv',
    sep=r'\s*;\s*',
    engine='python',
    header=None,
    names=columns,
    na_values=[0, -1],
)
df

Unnamed: 0,Year,Month,Day,Fraction,Spots,Stdev,Observ,Def/Prov
0,1818,1,1,1818.001,,,,1.0
1,1818,1,2,1818.004,,,,1.0
2,1818,1,3,1818.007,,,,1.0
3,1818,1,4,1818.010,,,,1.0
4,1818,1,5,1818.012,,,,1.0
...,...,...,...,...,...,...,...,...
73834,2020,2,25,2020.152,,,25.0,
73835,2020,2,26,2020.154,,,30.0,
73836,2020,2,27,2020.157,,,28.0,
73837,2020,2,28,2020.160,,,32.0,


In [5]:
# Some basic information: dtypes and non-null counts, then summary statistics.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73839 entries, 0 to 73838
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      73839 non-null  int64  
 1   Month     73839 non-null  int64  
 2   Day       73839 non-null  int64  
 3   Fraction  73839 non-null  float64
 4   Spots     59396 non-null  float64
 5   Stdev     67975 non-null  float64
 6   Observ    70592 non-null  float64
 7   Def/Prov  73687 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 4.5 MB
None
               Year         Month           Day      Fraction         Spots  \
count  73839.000000  73839.000000  73839.000000  73839.000000  59396.000000   
mean    1918.582389      6.518926     15.729195   1919.082049     98.416880   
std       58.360030      3.450319      8.799946     58.360155     74.550474   
min     1818.000000      1.000000      1.000000   1818.001000      1.000000   
25%     1868.000000      4.000000      8.000000   18

In [6]:
# Check the missing values: number of NaN entries per column
df.isnull().sum()


Year            0
Month           0
Day             0
Fraction        0
Spots       14443
Stdev        5864
Observ       3247
Def/Prov      152
dtype: int64

In [7]:
# The column of interest is 'Spots': keep only the rows where it has a value
has_spots = df['Spots'].notna()
df = df[has_spots]

In [8]:
# Build a proper datetime column from Year/Month/Day for later operations.
# assign() returns a new frame instead of writing into `df`, which is itself the
# result of a previous row filter; item assignment on such a frame can raise
# SettingWithCopyWarning (hidden here by the global warnings filter).
df = df.assign(Date=pd.to_datetime(df[['Year', 'Month', 'Day']]))

In [9]:
# Outliers are taken to be values above mean + 3 standard deviations (~325).
# Count how many rows exceed that threshold.
is_outlier = df['Spots'] > 325
int(is_outlier.sum())

571

In [10]:
# Remove the outliers: keep every row that is not above the threshold
above_threshold = df['Spots'] > 325
df = df[~above_threshold]

In [11]:
# Only the new date column and the sunspot number ('Spots') are needed; drop everything else
unused_columns = ['Year', 'Month', 'Day', 'Fraction', 'Stdev', 'Observ', 'Def/Prov']
df = df.drop(columns=unused_columns)

In [12]:
# Reorder the columns so the date comes first
df = df.loc[:, ['Date', 'Spots']]

In [13]:
# Done! Save the cleaned data, without writing the index as a column
df.to_csv(path_or_buf='Spots_new.csv', index=False)