# Meteorite Landings Exercise
### Import Data

In [1]:
import pandas as pd

meteorite_data = pd.read_csv('datasets/Meteorite_Landings.csv', encoding = 'utf-8')

### Remove nametype field

In [2]:
meteorite_data = meteorite_data[[_ for _ in meteorite_data.columns if _ != 'nametype']]
meteorite_data.head(1)

Unnamed: 0,name,id,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,States,Counties
0,Aachen,1,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775, 6.08333)",,


### Clean the mass field so that there's a default value of 0 where there is no mass listed

In [3]:
meteorite_data['mass (g)'] = meteorite_data['mass (g)'].fillna(0)

### Retain only data with no missing years
- I made this choice because there were only a few hundred with missing years.

In [4]:
meteorite_data = meteorite_data[(~meteorite_data.year.isna())]

### Format Date Time Information

In [5]:
import datetime as dt

def format_date(year):
    year = str(round(year))
    return dt.datetime.strptime(year.zfill(4), '%Y').year
    #return dt.datetime.strftime(new_date, '%b-%d-%Y') # I don't want any greater time detail

# Using dt.datetime
meteorite_data['new_year'] = meteorite_data.year.apply(lambda x: format_date(x))

# Using pandas datetime
meteorite_data['new_year_using_as_type'] = meteorite_data.year.astype('datetime64[ns]')

In [6]:
meteorite_data[[_ for _ in meteorite_data.columns if 'year' in _]]

Unnamed: 0,year,new_year,new_year_using_as_type
0,1880.0,1880,1970-01-01 00:00:00.000001880
1,1951.0,1951,1970-01-01 00:00:00.000001951
2,1952.0,1952,1970-01-01 00:00:00.000001952
3,1976.0,1976,1970-01-01 00:00:00.000001976
4,1902.0,1902,1970-01-01 00:00:00.000001902
...,...,...,...
45711,1990.0,1990,1970-01-01 00:00:00.000001990
45712,1999.0,1999,1970-01-01 00:00:00.000001999
45713,1939.0,1939,1970-01-01 00:00:00.000001939
45714,2003.0,2003,1970-01-01 00:00:00.000002003


### Ensure that special characters are correctly present.

In [7]:
meteorite_data.iloc[11]['name']

'Aïr'

### Store each decades data in its own excel sheet.
 - I'm storing each 100 years so I don't have 255 sheets.

In [8]:
with pd.ExcelWriter('datasets/meteorites_by_decade.xlsx') as new_file:
    years = meteorite_data.new_year.unique().tolist()
    for decade in range(round(min(years)-60), max(years), 100):
        sheet_name = f'{decade}-{decade+100}'
        subset = meteorite_data[(meteorite_data.year < decade + 100)]
        meteorite_data = meteorite_data[(meteorite_data.year >= decade + 100)]
        subset.to_excel(new_file, sheet_name = sheet_name)
    new_file.close()
    

  warn("Calling close() on already closed file.")


In [9]:
print(len(years))

265
