In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load S&P 500 df
df_quake_sp500 = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                             "S&P%20500%20Price%20Change%20by%20Earthquake.csv", index_col=0)
df_quake_sp500.shape

(1870, 9)

In [3]:
df_quake_sp500.head()

Unnamed: 0,Date,Mag,Price_Day_0,Price_Day_7,Price_Day_14,Price_Day_30,Appr_Day_7,Appr_Day_14,Appr_Day_30
0,1950-01-30,6.8,17.02,17.32,17.059999,17.24,1.762632,0.235012,1.292597
1,1950-02-02,6.9,17.23,17.280001,16.99,17.32,0.290197,-1.392919,0.522345
2,1950-02-03,6.7,17.290001,17.24,17.15,17.32,-0.28919,-0.809722,0.173505
3,1950-02-28,7.7,17.219999,17.200001,17.25,17.299999,-0.116132,0.174222,0.464576
4,1950-03-07,6.7,17.200001,17.25,17.450001,17.780001,0.290692,1.453488,3.372093


In [0]:
dates = []
for i in df_quake_sp500.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_sp500["magg"] = (df_quake_sp500["Mag"] * 10).astype(int)

In [0]:
df_quake_sp500["dates"] = dates

In [7]:
df_quake_sp500.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1870 entries, 0 to 1869
Data columns (total 11 columns):
Date            1870 non-null object
Mag             1870 non-null float64
Price_Day_0     1870 non-null float64
Price_Day_7     1870 non-null float64
Price_Day_14    1870 non-null float64
Price_Day_30    1870 non-null float64
Appr_Day_7      1870 non-null float64
Appr_Day_14     1870 non-null float64
Appr_Day_30     1870 non-null float64
magg            1870 non-null int64
dates           1870 non-null int64
dtypes: float64(8), int64(2), object(1)
memory usage: 175.3+ KB


In [17]:
y = df_quake_sp500['Appr_Day_30']
X = df_quake_sp500[['dates', 'Mag']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (1870, 2) 

X_train shape: (1402, 2)
X_test shape: (468, 2)
y_train shape: (1402,)
y_test shape: (468,)


In [18]:
X_train.sample()

Unnamed: 0,dates,Mag
1286,20000328,7.6


In [0]:
# Instantiate model with 100 decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)

In [20]:
# Train model on training data
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [0]:
# Use forest's predict method on test data
predictions = rf.predict(X_test)

In [0]:
# Calculate absolute errors
errors = abs(predictions - y_test)

In [23]:
# Print out mean absolute error
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 2.69 degrees.


In [66]:
# Calculate and display accuracy
accuracy = errors.sum() / y_test.sum()
print('Predictive Accuracy:', round(accuracy, 2), '%.')

Predictive Accuracy: 4.42 %.
