In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load Gold df
df_quake_gold = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                            "Gold%20Price%20Change%20by%20Earthquake(5.5+).csv", index_col=0)
#df_quake_gold = pd.read_csv("Gold Price Change by Earthquake(5.5+).csv", index_col=0)

df_quake_gold.shape

(23510, 17)

In [0]:
dates = []
for i in df_quake_gold.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_gold["magg"] = (df_quake_gold["Mag"] * 10).astype(int)

In [0]:
df_quake_gold["dates"] = dates

In [7]:
df_quake_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23510 entries, 0 to 23509
Data columns (total 19 columns):
Date              23510 non-null object
Mag               23510 non-null float64
Lat               23510 non-null float64
Long              23510 non-null float64
Depth             23510 non-null float64
magType           23510 non-null object
Place             23510 non-null object
Type              23510 non-null object
locationSource    23510 non-null object
magSource         23510 non-null object
Price_Day_0       23510 non-null float64
Price_Day_7       23510 non-null float64
Price_Day_14      23510 non-null float64
Price_Day_30      23510 non-null float64
Appr_Day_7        23510 non-null float64
Appr_Day_14       23510 non-null float64
Appr_Day_30       23510 non-null float64
magg              23510 non-null int64
dates             23510 non-null int64
dtypes: float64(11), int64(2), object(6)
memory usage: 3.6+ MB


##Linear Regression

In [0]:
X = df_quake_gold[['dates', 'Mag', 'Lat', 'Long', 'Depth']]
y = df_quake_gold['Appr_Day_30']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (23510, 5) 

X_train shape: (17632, 5)
X_test shape: (5878, 5)
y_train shape: (17632,)
y_test shape: (5878,)


In [12]:
model = LinearRegression()
linear_reg = model.fit(X_train, y_train)
print("Linear Regression Model score:", linear_reg.score(X_train, y_train))

Linear Regression Model score: 0.0028840265748341087


In [0]:
beta_0 = model.intercept_
beta_i = model.coef_[0]

print("Slope Coefficient: ", beta_i)
print("\nIntercept Value: ", beta_0)

print("\nCoefficients:")
for i in range(X.shape[1]):
  print(X.columns[i], '\t', model.coef_[i])

Slope Coefficient:  -1.8608298291577952e-06

Intercept Value:  37.77244218920035

Coefficients:
dates 	 -1.8608298291577952e-06
Mag 	 0.01878581001535119
Lat 	 -0.00426396411219617
Long 	 0.0005224057629747581
Depth 	 -0.0006418296858584673


In [0]:
y_test_predict = model.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_test_predict))
R2= r2_score(y_test, y_test_predict)

print('\nRMSE is {}'.format(RMSE))
print('R^2 is {}'.format(R2))


RMSE is 5.604158503260323
R^2 is 0.0009993130243650672


##Logistic Regression

In [0]:
df = df_quake_gold

In [0]:
#encode object columns
object_columns = list(df.select_dtypes(include=['object']))
df[object_columns] = df[object_columns].apply(LabelEncoder().fit_transform)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23510 entries, 0 to 23509
Data columns (total 19 columns):
Date              23510 non-null int64
Mag               23510 non-null float64
Lat               23510 non-null float64
Long              23510 non-null float64
Depth             23510 non-null float64
magType           23510 non-null int64
Place             23510 non-null int64
Type              23510 non-null int64
locationSource    23510 non-null int64
magSource         23510 non-null int64
Price_Day_0       23510 non-null float64
Price_Day_7       23510 non-null float64
Price_Day_14      23510 non-null float64
Price_Day_30      23510 non-null float64
Appr_Day_7        23510 non-null float64
Appr_Day_14       23510 non-null float64
Appr_Day_30       23510 non-null float64
magg              23510 non-null int64
dates             23510 non-null int64
dtypes: float64(11), int64(8)
memory usage: 3.6 MB
None


In [0]:
y = df['Appr_Day_30'].astype(str)
X = df[['dates', 'Mag', 'Lat', 'Long', 'Depth', 'magType', 'Place', 'Type', 'locationSource', 'magSource']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18808, 10), (4702, 10), (18808,), (4702,))

In [0]:
X_train.sample()

Unnamed: 0,dates,Mag,Lat,Long,Depth,magType,Place,Type,locationSource,magSource
19681,20110311,6.0,35.712,140.875,47.5,1,2916,0,44,20


In [0]:
%%time
log_reg = LogisticRegression(multi_class='ovr',
                             solver='liblinear',
                             max_iter=100)
log_reg_fit = log_reg.fit(X_train, y_train)
log_reg



CPU times: user 12min 48s, sys: 129 ms, total: 12min 48s
Wall time: 12min 49s


In [0]:
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
print("Logistic Regression Model score:", log_reg_fit.score(X_train, y_train))

Logistic Regression Model score: 0.006008081667375585


In [0]:
predictions = log_reg.predict(X_test)
print("Logistic Regression prediction accuracy:", accuracy_score(y_test, predictions))

Logistic Regression prediction accuracy: 0.005316886431305828


In [0]:
log_reg.coef_[0]

array([-4.58264125e-07, -1.31850011e-13,  1.14137698e-13,  2.88959501e-14,
       -1.89110602e-12, -1.11473208e-13, -6.67284248e-11, -4.78596057e-16,
       -9.42069102e-13, -2.90070036e-13])