In [216]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/noaa-erddap-north-pacific-high-1967-2014/erdNph_40ac_ef6e_e617.csv
/kaggle/input/groundhog-day/archive.csv


In [217]:
# Load the NOAA ERDDAP North-Pacific High pressure-zone table from the working
# directory and show the columns it provides.
# NOTE(review): these file names differ from the /kaggle/input paths listed by the
# first cell -- confirm both CSVs exist in the working directory on a fresh kernel.
nph_csv_path = './north_pacific_high.csv'
north_pacific_high = pd.read_csv(nph_csv_path)
print(north_pacific_high.columns)

# Load the Groundhog Day forecasts / weather table the same way.
groundhogs_csv_path = './groundhogs.csv'
groundhogs = pd.read_csv(groundhogs_csv_path)
print(groundhogs.columns)

Index(['Unnamed: 0', 'time', 'year', 'month', 'longitude', 'latitude', 'area',
       'maxSLP'],
      dtype='object')
Index(['Unnamed: 0', 'Year', 'Punxsutawney Phil',
       'February Average Temperature',
       'February Average Temperature (Northeast)',
       'February Average Temperature (Midwest)',
       'February Average Temperature (Pennsylvania)',
       'March Average Temperature', 'March Average Temperature (Northeast)',
       'March Average Temperature (Midwest)',
       'March Average Temperature (Pennsylvania)'],
      dtype='object')


In [218]:
# Rename columns in groundhogs to match nph_months ('Year' -> 'year') and to give
# the temperature columns short snake_case labels.
groundhog_column_names = {
    'Year': 'year',
    'Punxsutawney Phil': 'punx_obs',
    'February Average Temperature': 'temp_avg_feb',
    'February Average Temperature (Northeast)': 'temp_avg_ne_feb',
    'February Average Temperature (Midwest)': 'temp_avg_mw_feb',
    'February Average Temperature (Pennsylvania)': 'temp_avg_pen_feb',
    'March Average Temperature': 'temp_avg_mar',
    'March Average Temperature (Northeast)': 'temp_avg_ne_mar',
    'March Average Temperature (Midwest)': 'temp_avg_mw_mar',
    'March Average Temperature (Pennsylvania)': 'temp_avg_pen_mar',
}
groundhogs = groundhogs.rename(columns=groundhog_column_names)

# Some minor feature editing.
# Add an empty placeholder column at position 2. DataFrame.insert raises ValueError
# if the column already exists, so guard it to keep this cell safe to re-run.
if 'punx_preds' not in groundhogs.columns:
    groundhogs.insert(loc=2, column='punx_preds', value='')

# Missing observations are labelled explicitly instead of being left as NaN.
groundhogs['punx_obs'] = groundhogs['punx_obs'].fillna('No Record')

print(groundhogs['punx_obs'].unique())
print(groundhogs.count())


['No Record' 'Full Shadow' 'No Shadow' 'Partial Shadow']
Unnamed: 0          132
year                132
punx_preds          132
punx_obs            132
temp_avg_feb        123
temp_avg_ne_feb     123
temp_avg_mw_feb     123
temp_avg_pen_feb    123
temp_avg_mar        123
temp_avg_ne_mar     123
temp_avg_mw_mar     123
temp_avg_pen_mar    123
dtype: int64


In [219]:
# Drop the leftover CSV index column(s) (any label containing 'unnamed',
# case-insensitive) and the final row of the table.
# .copy() makes the result an independent frame, so the column assignments made to
# `groundhogs` afterwards do not write to a slice (SettingWithCopyWarning).
# NOTE(review): the iloc[:-1] trim removes one more row each time this cell is
# re-run on the same frame -- run it once per fresh load of the CSV.
is_leftover_index = groundhogs.columns.str.contains('unnamed', case=False)
groundhogs = groundhogs.loc[:, ~is_leftover_index].iloc[:-1, :].copy()

In [225]:
# Integer-encode the observation labels in 'punx_obs' and store the codes in
# 'punx_preds' (the displayed output shows 'Full Shadow' -> 0, 'No Shadow' -> 2).
label_encoder = LabelEncoder()
groundhogs['punx_preds'] = label_encoder.fit_transform(groundhogs['punx_obs'])

# Keep 'year' as a plain integer: it is the key used when this frame is merged with
# the NPH data. The former `pd.to_datetime(groundhogs.year, format='%Y')` call
# discarded its result and therefore had no effect, so it has been removed.
groundhogs['year'] = groundhogs['year'].astype(int)

groundhogs.tail(40)
        

Unnamed: 0,year,punx_preds,punx_obs,temp_avg_feb,temp_avg_ne_feb,temp_avg_mw_feb,temp_avg_pen_feb,temp_avg_mar,temp_avg_ne_mar,temp_avg_mw_mar,temp_avg_pen_mar
91,1977,0,Full Shadow,36.03,23.2,31.5,26.1,43.18,38.4,47.4,41.2
92,1978,0,Full Shadow,27.99,15.6,20.3,17.2,41.31,28.8,37.3,31.9
93,1979,0,Full Shadow,28.13,13.9,22.9,16.4,42.06,37.0,43.7,39.4
94,1980,0,Full Shadow,32.85,20.0,26.6,23.0,39.36,31.0,38.5,33.3
95,1981,0,Full Shadow,36.59,30.3,34.1,30.7,42.85,33.0,41.7,34.1
96,1982,0,Full Shadow,32.59,22.9,30.4,26.9,42.03,31.5,43.5,34.9
97,1983,2,No Shadow,36.86,26.2,35.9,29.8,42.4,35.7,43.5,38.7
98,1984,0,Full Shadow,37.38,31.2,38.6,33.8,40.08,26.6,36.8,29.5
99,1985,0,Full Shadow,30.87,24.8,28.4,27.1,43.48,34.5,46.3,38.3
100,1986,2,No Shadow,35.85,22.3,34.3,26.7,46.18,34.7,45.0,38.0


In [221]:
def _nph_for_month(frame, month):
    """Return the rows of `frame` whose 'month' equals `month`.

    The 'time' and 'month' columns are dropped and the index is reset to 0..n-1.
    The input frame is not modified.
    """
    return (
        frame[frame['month'] == month]
        .drop(columns=['time', 'month'])
        .reset_index(drop=True)
    )


# Pull January NPH data into a DataFrame; drop 2014 (only exists in Jan).
# Filtering on the year value replaces the previous positional drop of row 47,
# which only worked while 2014 happened to sit at that index.
only_Jan = _nph_for_month(north_pacific_high, 1)
only_Jan = only_Jan[only_Jan['year'] != 2014]

# Pull February and March NPH data into DataFrames
only_Feb = _nph_for_month(north_pacific_high, 2)
only_Mar = _nph_for_month(north_pacific_high, 3)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line after each report.
only_Jan.info()
print('-'*10)
only_Feb.info()
print('-'*10)
only_Mar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47 entries, 0 to 46
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  47 non-null     int64  
 1   year        47 non-null     float64
 2   longitude   47 non-null     object 
 3   latitude    47 non-null     object 
 4   area        47 non-null     object 
 5   maxSLP      47 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 2.6+ KB
None
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  47 non-null     int64  
 1   year        47 non-null     float64
 2   longitude   47 non-null     object 
 3   latitude    47 non-null     object 
 4   area        47 non-null     object 
 5   maxSLP      47 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 2.3+ KB


In [222]:
# Merge, arrange, and clean up NPH DataFrame: one row per year with the January,
# February and March measurements side by side.
nph_months = only_Jan.merge(only_Feb, on='year', suffixes=('_jan', '_feb'))
nph_months = nph_months.merge(only_Mar, on='year')

# Rename column labels joined from only_Mar: they did not clash with any existing
# label in the second merge, so pandas left them without a suffix.
nph_months = nph_months.rename(columns={
    'longitude': 'longitude_mar',
    'latitude': 'latitude_mar',
    'area': 'area_mar',
    'maxSLP': 'maxSLP_mar',
})

# Drop the leftover CSV index columns. .copy() gives an independent frame so the
# 'year' assignment below does not write to a slice (SettingWithCopyWarning).
is_leftover_index = nph_months.columns.str.contains('unnamed', case=False)
nph_months = nph_months.loc[:, ~is_leftover_index].copy()

# Keep 'year' as a plain integer. The former
# `pd.to_datetime(nph_months.year, format='%Y')` call discarded its result and
# therefore had no effect, so it has been removed.
nph_months['year'] = nph_months['year'].astype(int)

print(nph_months)

    year longitude_jan latitude_jan   area_jan maxSLP_jan longitude_feb  \
0   1967         221.9         30.7  3249600.0     1022.6         219.1   
1   1968         228.7         32.9   916510.0     1021.4         233.3   
2   1969         233.1         29.7    10749.0     1016.7         210.5   
3   1970         232.3         29.1   163000.0     1020.9         233.5   
4   1971         225.7         32.7  3637500.0     1026.1         222.9   
5   1972         223.1         34.7  3903400.0     1025.8         230.1   
6   1973         226.7         27.1  1566000.0     1022.2         207.1   
7   1974         229.7         31.5   285210.0     1020.2         219.1   
8   1975         221.7         32.7  5325700.0     1027.5         227.5   
9   1976         225.1         32.9  3293800.0     1026.1         219.1   
10  1977         228.9         33.5  1015400.0     1021.1         223.5   
11  1978         233.5         25.9        0.0     1019.1         232.7   
12  1979         211.1   

In [229]:
# Combine the monthly NPH measurements with the groundhog table by joining on 'year'
# (period 1967-2013), then print up to the first 60 rows of the result.
nph_groundhogs = pd.merge(nph_months, groundhogs, on='year')
print(nph_groundhogs.head(60))

    year longitude_jan latitude_jan   area_jan maxSLP_jan longitude_feb  \
0   1967         221.9         30.7  3249600.0     1022.6         219.1   
1   1968         228.7         32.9   916510.0     1021.4         233.3   
2   1969         233.1         29.7    10749.0     1016.7         210.5   
3   1970         232.3         29.1   163000.0     1020.9         233.5   
4   1971         225.7         32.7  3637500.0     1026.1         222.9   
5   1972         223.1         34.7  3903400.0     1025.8         230.1   
6   1973         226.7         27.1  1566000.0     1022.2         207.1   
7   1974         229.7         31.5   285210.0     1020.2         219.1   
8   1975         221.7         32.7  5325700.0     1027.5         227.5   
9   1976         225.1         32.9  3293800.0     1026.1         219.1   
10  1977         228.9         33.5  1015400.0     1021.1         223.5   
11  1978         233.5         25.9        0.0     1019.1         232.7   
12  1979         211.1   