# Importing and Exporting Data

This notebook shows how to import data from a file into a pandas DataFrame (DF), clean it up, and export it again.

## Reading Files

In [1]:
import pandas as pd

# Relative location of the sunspot data file.
filepath = 'csv_files/sunspots.csv'

# First attempt: only the field separator is given, every other
# read_csv option is left at its default.
df_sunspots = pd.read_csv(
    filepath,
    sep=';',
)

# Summarise column labels, non-null counts and dtypes of the loaded frame.
df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73199 entries, 0 to 73198
Data columns (total 8 columns):
1818        73199 non-null int64
01          73199 non-null int64
01.1        73199 non-null int64
1818.001    73199 non-null float64
  -1        73199 non-null int64
 -1.0       73199 non-null float64
   0        73199 non-null int64
1           73199 non-null int64
dtypes: float64(2), int64(6)
memory usage: 4.5 MB


In [15]:
# Peek at ten rows (positions 10 through 19) with all columns.
df_sunspots.iloc[10:20]

Unnamed: 0,1818,01,01.1,1818.001,-1,-1.0,0,1
10,1818,1,12,1818.032,-1,-1.0,0,1
11,1818,1,13,1818.034,37,7.7,1,1
12,1818,1,14,1818.037,-1,-1.0,0,1
13,1818,1,15,1818.04,-1,-1.0,0,1
14,1818,1,16,1818.042,-1,-1.0,0,1
15,1818,1,17,1818.045,77,11.1,1,1
16,1818,1,18,1818.048,98,12.6,1,1
17,1818,1,19,1818.051,105,13.0,1,1
18,1818,1,20,1818.053,-1,-1.0,0,1
19,1818,1,21,1818.056,-1,-1.0,0,1


The sunspot DF has some problems: the column names make no sense (the first data row was used as the header), and there are negative values (-1) in certain columns.

In [31]:
# Column labels to apply, in the order the fields appear in the file.
col_names = [
    'year', 'month', 'day', 'frac_date',
    'sunspots', 'daily_stdv', 'no_of_obs', 'def_prov_indic',
]

# NaN markers can be declared per column: each key is a column label and
# each value lists the raw strings that should be read as missing.
na_markers = {
    'sunspots': ['  -1'],
    'daily_stdv': ['-1.0'],
}

# header=None     -> the first line of the file is data, so labels come from `names`.
# parse_dates=[[0, 1, 2]] -> columns 0, 1 and 2 are combined into a single date column.
# NOTE(review): nested-list parse_dates is reportedly deprecated in recent pandas
# releases (2.2+) -- confirm against the installed version before upgrading.
df_sunspots = pd.read_csv(
    filepath,
    sep=';',
    header=None,
    names=col_names,
    na_values=na_markers,
    parse_dates=[[0, 1, 2]],
)

df_sunspots.iloc[10:20, :]

Unnamed: 0,year_month_day,frac_date,sunspots,daily_stdv,no_of_obs,def_prov_indic
10,1818-01-11,1818.029,,,0,1
11,1818-01-12,1818.032,,,0,1
12,1818-01-13,1818.034,37.0,7.7,1,1
13,1818-01-14,1818.037,,,0,1
14,1818-01-15,1818.04,,,0,1
15,1818-01-16,1818.042,,,0,1
16,1818-01-17,1818.045,77.0,11.1,1,1
17,1818-01-18,1818.048,98.0,12.6,1,1
18,1818-01-19,1818.051,105.0,13.0,1,1
19,1818-01-20,1818.053,,,0,1


In [34]:
df_sunspots.info()

# The 'year', 'month' and 'day' columns were merged into a single 'year_month_day' column by the
# parse_dates argument of read_csv: passing a list of lists combines the listed columns into one
# column and parses the result as datetime64.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73200 entries, 0 to 73199
Data columns (total 6 columns):
year_month_day    73200 non-null datetime64[ns]
frac_date         73200 non-null float64
sunspots          69953 non-null float64
daily_stdv        69953 non-null float64
no_of_obs         73200 non-null int64
def_prov_indic    73200 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 3.4 MB


We can change the index of the DF by using the date column as the index:

In [35]:
# Use the parsed dates as the row index and label that index 'date'.
# drop=False keeps 'year_month_day' as a regular column as well.
df_sunspots = (
    df_sunspots
    .set_index('year_month_day', drop=False)
    .rename_axis('date')
)

df_sunspots.iloc[10:20, :]

Unnamed: 0_level_0,year_month_day,frac_date,sunspots,daily_stdv,no_of_obs,def_prov_indic
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1818-01-11,1818-01-11,1818.029,,,0,1
1818-01-12,1818-01-12,1818.032,,,0,1
1818-01-13,1818-01-13,1818.034,37.0,7.7,1,1
1818-01-14,1818-01-14,1818.037,,,0,1
1818-01-15,1818-01-15,1818.04,,,0,1
1818-01-16,1818-01-16,1818.042,,,0,1
1818-01-17,1818-01-17,1818.045,77.0,11.1,1,1
1818-01-18,1818-01-18,1818.048,98.0,12.6,1,1
1818-01-19,1818-01-19,1818.051,105.0,13.0,1,1
1818-01-20,1818-01-20,1818.053,,,0,1


In [36]:
# Print the frame summary again to inspect the index type and column dtypes.
df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 73200 entries, 1818-01-01 to 2018-05-31
Data columns (total 6 columns):
year_month_day    73200 non-null datetime64[ns]
frac_date         73200 non-null float64
sunspots          69953 non-null float64
daily_stdv        69953 non-null float64
no_of_obs         73200 non-null int64
def_prov_indic    73200 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 3.9 MB


We have too many columns with dates; let's extract only the meaningful columns:

In [37]:
# Columns to keep; every other column is left out of the new frame.
cols = ['sunspots', 'def_prov_indic']

# Select all rows and only the listed columns.
df_sunspots = df_sunspots.loc[:, cols]

df_sunspots.iloc[10:20, :]

Unnamed: 0_level_0,sunspots,def_prov_indic
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1818-01-11,,1
1818-01-12,,1
1818-01-13,37.0,1
1818-01-14,,1
1818-01-15,,1
1818-01-16,,1
1818-01-17,77.0,1
1818-01-18,98.0,1
1818-01-19,105.0,1
1818-01-20,,1


## Writing Files

In [None]:
# Output file names, one per export format.
out_csv = 'sunspots_cleaned.csv'
out_tsv = 'sunspots_cleaned.tsv'
out_xlsx = 'sunspots_cleaned.xlsx'

# Delimited text with the writer's default separator.
df_sunspots.to_csv(out_csv)

# Same writer, but with a tab as the field separator.
df_sunspots.to_csv(out_tsv, sep='\t')

# Excel workbook.
df_sunspots.to_excel(out_xlsx)