# DAY0 - Looking for Dataset + Problem 

In [2]:
# needed to make web requests
import requests

#store the data we get as a dataframe
import pandas as pd

#convert the response as a strcuctured json
import json

#mathematical operations on lists
import numpy as np

#parse the datetimes we get from NOAA
from datetime import datetime

#add the access token you got from NOAA
Token = 'xKIlBHakeOEdyBfhPkKcDKyLzjofRpNY'

#MIAMI INTERNATIONAL AIRPORT, FL US station
station_id = 'GHCND:USW00012839'

# https://www.ncdc.noaa.gov/cdo-web/datatools/findstation

In [3]:
#initialize lists to store data
dates_temp = []
dates_prcp = []
temps = []
prcp = []

#for each year from 2015-2019 ...
for year in range(2015, 2020):
    year = str(year)
    print('working on year '+year)
    
    #make the api call
    r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00023129&startdate='+year+'-01-01&enddate='+year+'-12-31', headers={'token':Token})
    #load the api response as a json
    d = json.loads(r.text)
    #get all items in the response which are average temperature readings
    avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']
    #get the date field from all average temperature readings
    dates_temp += [item['date'] for item in avg_temps]
    #get the actual average temperature from all average temperature readings
    temps += [item['value'] for item in avg_temps]

working on year 2015
working on year 2016
working on year 2017
working on year 2018
working on year 2019


In [4]:
#initialize dataframe
df_temp = pd.DataFrame()

#populate date and average temperature fields (cast string date to datetime and convert temperature from tenths of Celsius to Fahrenheit)
df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0*1.8 + 32 for v in temps]

In [10]:
df_temp['date'].head()

0   2015-01-01
1   2015-01-02
2   2015-01-03
3   2015-01-04
4   2015-01-05
Name: date, dtype: datetime64[ns]

In [11]:
df_temp['avgTemp'].head()

0    46.58
1    48.02
2    49.82
3    52.16
4    58.64
Name: avgTemp, dtype: float64

# DAY1 - Brainstorming & Data Preparation

    Idea generation & planning
    Data gathering & cleaning
    Data storage

# Let's start with the Solar Dataset


In [35]:
import pandas as pd

solar = pd.read_csv('/Users/gracemartinez/Downloads/solar.csv')
solar.head()

Unnamed: 0,Catalog Number,Calendar Date,Eclipse Time,Delta T (s),Lunation Number,Saros Number,Eclipse Type,Gamma,Eclipse Magnitude,Latitude,Longitude,Sun Altitude,Sun Azimuth,Path Width (km),Central Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247.0,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236.0,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111.0,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162.0,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.467,0.1611,60.6S,106.4W,0,281,,


In [36]:
solar.shape

(11898, 15)

In [37]:
solar.columns

Index(['Catalog Number', 'Calendar Date', 'Eclipse Time', 'Delta T (s)',
       'Lunation Number', 'Saros Number', 'Eclipse Type', 'Gamma',
       'Eclipse Magnitude', 'Latitude', 'Longitude', 'Sun Altitude',
       'Sun Azimuth', 'Path Width (km)', 'Central Duration'],
      dtype='object')

In [38]:
len(solar.columns)

15

In [39]:
solar.dtypes

Catalog Number         int64
Calendar Date         object
Eclipse Time          object
Delta T (s)            int64
Lunation Number        int64
Saros Number           int64
Eclipse Type          object
Gamma                float64
Eclipse Magnitude    float64
Latitude              object
Longitude             object
Sun Altitude           int64
Sun Azimuth            int64
Path Width (km)       object
Central Duration      object
dtype: object

In [None]:
# need to know what each column mean/represents to know if they're a correct type

In [40]:
solar['Eclipse Type'].value_counts()

P     3875
A     3755
T     3049
H      502
Pb     163
Pe     162
Am      72
Tm      72
An      36
A-      34
A+      34
H3      26
As      25
H2      24
T-      17
Hm      17
Tn      14
Ts      12
T+       9
Name: Eclipse Type, dtype: int64

In [32]:
# can categorize from 'H' to 'Tm' as 'Other' due to their lower frequency
# we will drop the remaining low valued types from 'An' to 'T+'

In [43]:
solar.loc[solar['Eclipse Type'].str.contains('H'), 'Eclipse Type'] = 'Other'

In [44]:
solar.loc[solar['Eclipse Type'].str.contains('Pb'), 'Eclipse Type'] = 'Other'

In [45]:
solar.loc[solar['Eclipse Type'].str.contains('Pe'), 'Eclipse Type'] = 'Other'

In [46]:
solar.loc[solar['Eclipse Type'].str.contains('Am'), 'Eclipse Type'] = 'Other'

In [47]:
solar.loc[solar['Eclipse Type'].str.contains('Tm'), 'Eclipse Type'] = 'Other'

In [48]:
solar['Eclipse Type'].value_counts()

P        3875
A        3755
T        3049
Other    1038
An         36
A+         34
A-         34
As         25
T-         17
Tn         14
Ts         12
T+          9
Name: Eclipse Type, dtype: int64

In [66]:
solar.columns = solar.columns.str.replace(' ','_')

In [67]:
solar.columns

Index(['Catalog_Number', 'Calendar_Date', 'Eclipse_Time', 'Delta_T_(s)',
       'Lunation_Number', 'Saros_Number', 'Eclipse_Type', 'Gamma',
       'Eclipse_Magnitude', 'Latitude', 'Longitude', 'Sun_Altitude',
       'Sun_Azimuth', 'Path_Width_(km)', 'Central_Duration'],
      dtype='object')

In [70]:
# 'An', 'A-', 'A+', 'H3', 'As', 'H2', 'T-', 'Hm', 'Tn', 'Ts', 'T+'

solar[solar.Eclipse_Type != 'An']

# need to remove rows containing these category types in 'Eclipse Type'

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [87]:
Remove_Types = ['An', 'A-', 'A+', 'H3', 'As', 'H2', 'T-', 'Hm', 'Tn', 'Ts', 'T+']

Desired_Types = [i for i in len(solar.Eclipse_Type) if i not in Remove_Types]
desired_df = solar.iloc[Desired_Types]
desired_df 

TypeError: 'int' object is not iterable

In [71]:
solar[solar.Eclipse_Type != 'A-']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11892,11893,2998 June 15,14:49:27,4410,12349,182,P,-1.0158,0.9792,66.5S,32.5W,0,9,,
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [72]:
solar[solar.Eclipse_Type != 'A+']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [74]:
solar[solar.Eclipse_Type != 'H3']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [75]:
solar[solar.Eclipse_Type != 'As']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [76]:

solar[solar.Eclipse_Type != 'H2']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [77]:

solar[solar.Eclipse_Type != 'T-']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [78]:

solar[solar.Eclipse_Type != 'Hm']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [79]:
solar[solar.Eclipse_Type != 'Tn']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [80]:
solar[solar.Eclipse_Type != 'Ts']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [81]:
solar[solar.Eclipse_Type != 'T+']

Unnamed: 0,Catalog_Number,Calendar_Date,Eclipse_Time,Delta_T_(s),Lunation_Number,Saros_Number,Eclipse_Type,Gamma,Eclipse_Magnitude,Latitude,Longitude,Sun_Altitude,Sun_Azimuth,Path_Width_(km),Central_Duration
0,1,-1999 June 12,03:14:51,46438,-49456,5,T,-0.2701,1.0733,6.0N,33.3W,74,344,247,06m37s
1,2,-1999 December 5,23:45:23,46426,-49450,10,A,-0.2317,0.9382,32.9S,10.8E,76,21,236,06m44s
2,3,-1998 June 1,18:09:16,46415,-49444,15,T,0.4994,1.0284,46.2N,83.4E,60,151,111,02m15s
3,4,-1998 November 25,05:57:03,46403,-49438,20,A,-0.9045,0.9806,67.8S,143.8W,25,74,162,01m14s
4,5,-1997 April 22,13:19:56,46393,-49433,-13,P,-1.4670,0.1611,60.6S,106.4W,0,281,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,11894,2998 December 10,03:18:31,4414,12355,187,P,1.2838,0.4773,67.2N,145.0E,0,179,,
11894,11895,2999 May 6,23:23:57,4417,12360,154,T,0.8388,1.0566,71.5N,177.3E,33,146,345,03m25s
11895,11896,2999 October 30,09:34:33,4420,12366,159,A-,-1.0023,0.9586,70.9S,84.7W,0,137,-,-
11896,11897,3000 April 26,14:18:06,4424,12372,164,T,0.1310,1.0222,21.1N,18.4W,82,166,76,02m11s


In [83]:
solar['Eclipse_Type'].value_counts()

P        3875
A        3755
T        3049
Other    1038
An         36
A+         34
A-         34
As         25
T-         17
Tn         14
Ts         12
T+          9
Name: Eclipse_Type, dtype: int64

In [None]:
solar['Eclipse Type'].value_counts()

In [10]:
len(solar['Eclipse Type'].value_counts())

19

In [None]:
# so there would be ____ different categories of 'Eclipse Type'

In [27]:
solar.isnull().sum()

Catalog Number          0
Calendar Date           0
Eclipse Time            0
Delta T (s)             0
Lunation Number         0
Saros Number            0
Eclipse Type            0
Gamma                   0
Eclipse Magnitude       0
Latitude                0
Longitude               0
Sun Altitude            0
Sun Azimuth             0
Path Width (km)      4200
Central Duration     4200
dtype: int64

In [29]:
solar.isnull().sum() / solar.shape[0]

Catalog Number       0.000000
Calendar Date        0.000000
Eclipse Time         0.000000
Delta T (s)          0.000000
Lunation Number      0.000000
Saros Number         0.000000
Eclipse Type         0.000000
Gamma                0.000000
Eclipse Magnitude    0.000000
Latitude             0.000000
Longitude            0.000000
Sun Altitude         0.000000
Sun Azimuth          0.000000
Path Width (km)      0.353001
Central Duration     0.353001
dtype: float64

# NOW LUNAR DATASET 

In [94]:
lunar = pd.read_csv('/Users/gracemartinez/Downloads/lunar.csv')
lunar.head()

Unnamed: 0,Catalog Number,Calendar Date,Eclipse Time,Delta T (s),Lunation Number,Saros Number,Eclipse Type,Quincena Solar Eclipse,Gamma,Penumbral Magnitude,Umbral Magnitude,Latitude,Longitude,Penumbral Eclipse Duration (m),Partial Eclipse Duration (m),Total Eclipse Duration (m)
0,1,-1999 June 26,14:13:28,46437,-49456,17,N,t-,-1.0981,0.8791,-0.1922,24S,22W,268.8,-,-
1,2,-1999 November 21,20:23:49,46427,-49451,-16,N,-a,-1.1155,0.8143,-0.1921,15N,98W,233.4,-,-
2,3,-1998 May 17,05:47:36,46416,-49445,-11,P,-t,0.8988,1.2105,0.2069,13S,89E,281.7,102.7,-
3,4,-1998 November 11,05:15:58,46404,-49439,-6,P,-a,-0.4644,2.0382,0.974,12N,113E,343.4,200.8,-
4,5,-1997 May 6,18:57:01,46392,-49433,-1,T+,pp,0.1003,2.6513,1.6963,11S,92W,322.8,213.5,98.2


In [19]:
lunar.shape

(12064, 16)

In [20]:
lunar.columns

Index(['Catalog Number', 'Calendar Date', 'Eclipse Time', 'Delta T (s)',
       'Lunation Number', 'Saros Number', 'Eclipse Type',
       'Quincena Solar Eclipse', 'Gamma', 'Penumbral Magnitude',
       'Umbral Magnitude', 'Latitude', 'Longitude',
       'Penumbral Eclipse Duration (m)', 'Partial Eclipse Duration (m)',
       'Total Eclipse Duration (m)'],
      dtype='object')

In [21]:
len(lunar.columns)

16

In [22]:
lunar.dtypes

Catalog Number                      int64
Calendar Date                      object
Eclipse Time                       object
Delta T (s)                         int64
Lunation Number                     int64
Saros Number                        int64
Eclipse Type                       object
Quincena Solar Eclipse             object
Gamma                             float64
Penumbral Magnitude               float64
Umbral Magnitude                  float64
Latitude                           object
Longitude                          object
Penumbral Eclipse Duration (m)    float64
Partial Eclipse Duration (m)       object
Total Eclipse Duration (m)         object
dtype: object

In [None]:
# need to know what each column mean/represents to know if they're a correct type

In [23]:
lunar['Eclipse Type'].value_counts()

P     4207
N     4020
T     1405
T+    1042
T-    1032
Nx     141
Ne     115
Nb     102
Name: Eclipse Type, dtype: int64

In [95]:
len(lunar['Eclipse Type'].value_counts())

8

In [None]:
# can categorize 'Eclipse Type' T, T+, T- as 'Other' while removing last 3 - Nx(141), Ne(115), Nb(102) due to their low frequency

In [96]:
lunar.loc[lunar['Eclipse Type'].str.contains('T'), 'Eclipse Type'] = 'Other'

In [97]:
lunar.loc[lunar['Eclipse Type'].str.contains('T+'), 'Eclipse Type'] = 'Other'

In [98]:
lunar.loc[lunar['Eclipse Type'].str.contains('T-'), 'Eclipse Type'] = 'Other'

In [99]:
lunar['Eclipse Type'].value_counts()

P        4207
N        4020
Other    3479
Nx        141
Ne        115
Nb        102
Name: Eclipse Type, dtype: int64

In [59]:
lunar = lunar.applymap(lambda x: x if not '' in str(x) else x.replace('', 'Other'))

AttributeError: ("'int' object has no attribute 'replace'", 'occurred at index Catalog Number')

In [100]:
len(lunar['Eclipse Type'].value_counts())

6

In [None]:
# so there would be 6 different categories of 'Eclipse Type'

In [25]:
lunar.isnull().sum()

# there are no missing values

Catalog Number                    0
Calendar Date                     0
Eclipse Time                      0
Delta T (s)                       0
Lunation Number                   0
Saros Number                      0
Eclipse Type                      0
Quincena Solar Eclipse            0
Gamma                             0
Penumbral Magnitude               0
Umbral Magnitude                  0
Latitude                          0
Longitude                         0
Penumbral Eclipse Duration (m)    0
Partial Eclipse Duration (m)      0
Total Eclipse Duration (m)        0
dtype: int64

In [None]:
date = solar['Calendar Date'].str[1:].str.split()
year, month, day = date.str[0], date.str[1], date.str[2]
timeline = [pd.datetime(year=y, month=m, day=d) for y, m, d in zip(year, month, day)]