## Earthquake Prediction Model

In [299]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime as dt
import time


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [313]:
#load the dataset

df = pd.read_csv(r"C:\Everything On This PC\Udacity\Git_hseju\turkey-chapter-predicting-earthquakes\src\data/Omdena_Turkey__final_rev1.csv", index_col=0)

In [314]:
#check the dataframe
df.head()

Unnamed: 0,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location,City,AREA_,geometry,Date_Time,Year,Month,Hour
0,37.8,29.1,5.0,5.0,5.0,0.0,,0.0,0.0,1,DENIZLI (DENIZLI) [North East 2.3 km],DENIZLI,4621.875,POINT (29.1 37.8),1900-09-20 00:00:01.000,1900,9,0
1,37.8,29.1,20.0,4.9,4.8,4.8,4.9,4.8,4.9,1,DENIZLI (DENIZLI) [North East 2.3 km],DENIZLI,4621.875,POINT (29.1 37.8),1904-01-01 11:38:00.000,1904,1,11
2,37.8,28.7,30.0,5.5,5.2,5.2,5.5,5.3,5.2,1,HACIHIDIRLAR-KARACASU (AYDIN) [North East 2.9...,AYDIN,2866.059,POINT (28.7 37.8),1910-08-07 21:45:00.000,1910,8,21
3,37.5,29.0,15.0,4.5,4.5,0.0,,0.0,0.0,1,MEDET-TAVAS (DENIZLI) [South West 1.7 km],DENIZLI,4621.875,POINT (29 37.5),1920-07-02 14:13:01.000,1920,7,14
4,37.5,29.0,15.0,5.3,5.0,4.9,5.3,5.0,5.0,1,MEDET-TAVAS (DENIZLI) [South West 1.7 km],DENIZLI,4621.875,POINT (29 37.5),1920-07-04 12:17:58.000,1920,7,12


We would be looking at those attributes that can define the prediction of earthquake. Some of these features are Date, Time, Location and Depth. We would separate time and date into different columns to consider them as independednt features.

In [315]:
#lets check the datatypes of features
df.dtypes

Latitude     float64
Longitude    float64
Depth(km)    float64
xM           float64
MD           float64
ML           float64
Mw           float64
Ms           float64
Mb           float64
Type           int64
Location      object
City          object
AREA_        float64
geometry      object
Date_Time     object
Year           int64
Month          int64
Hour           int64
dtype: object

In [316]:
#change the datatype of Date_Time from object to datetime
df['Date_Time'] = pd.to_datetime(df['Date_Time'])

#checking the datatype
print(df['Date_Time'].dtype)

datetime64[ns]


A Unix timestamp can be more useful as it will represent the time in seconds from 1st January, 1970 to when the earthquake occured. Thus bringing everyday date and time values to a value of seconds to represent in a common scale. Now Unix timestamp only started after 1970 and will run till 2038 hence we will have to discard all the earthquake data before 1970.

In [317]:
#remove all the earthquake data from before 1970

df = df[df['Date_Time'].dt.year >=1970]

In [318]:
#reset the index to start from 0

df = df.reset_index(drop=True)

#### Convert the datetime object to unix timestamp

In [319]:
%%capture
#creat an empty list of timestamp
timestamp= []

for i in range(len(df)):
    try:
        presentDate = df['Date_Time'][i]
        print(presentDate)
        unix_timestamp = dt.datetime.timestamp(presentDate)*1000
        timestamp.append(unix_timestamp)
    except ValueError:
        # print('ValueError')
        timestamp.append('ValueError')

In [320]:
#add the timestamp column to the dataframe
df['timestamp'] = timestamp

In [321]:
df.head()

Unnamed: 0,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location,City,AREA_,geometry,Date_Time,Year,Month,Hour,timestamp
0,36.9,29.1,30.0,4.6,4.4,4.4,4.6,4.3,4.5,1,KARABAYIR-CAMELI (DENIZLI) [South West 5.0 km],DENIZLI,4621.875,POINT (29.1 36.9),1970-03-01 12:54:33.000,1970,3,12,5124273000.0
1,37.2,29.0,5.0,4.3,4.3,0.0,,0.0,0.0,1,ALPA-TAVAS (DENIZLI) [South West 4.9 km],DENIZLI,4621.875,POINT (29 37.2),1970-03-28 20:08:02.300,1970,3,20,7483082000.0
2,38.1,29.2,33.0,5.0,4.7,4.7,5.0,4.7,4.7,1,DAGMARMARA-CAL (DENIZLI) [South West 1.7 km],DENIZLI,4621.875,POINT (29.2 38.1),1970-03-28 21:23:28.000,1970,3,21,7487608000.0
3,38.2,29.9,5.0,4.4,4.4,0.0,,0.0,0.0,1,BEYKOY-CIVRIL (DENIZLI) [North East 2.5 km],DENIZLI,4621.875,POINT (29.9 38.2),1970-04-19 14:22:01.300,1970,4,14,9363121000.0
4,37.01,29.01,11.0,4.7,4.5,4.5,4.7,4.4,4.6,1,KARACAM-KOYCEGIZ (MUGLA) [South West 2.6 km],MUGLA,3786.527,POINT (29.01 37.01),1970-10-19 01:32:25.000,1970,10,1,25128140000.0


In [322]:
df.shape

(42460, 19)

In [326]:
df_final = df[df['City']=='DENIZLI']

#Now only choose the parameters to be used for modeling
df_final = df_final[['Latitude','Longitude','Depth(km)','xM','timestamp']]



#get all the columns names 
cols=[col_names for col_names in df_final.columns.tolist()]


#rearrange columns
cols = cols[0:3] + cols[4:5] + cols[3:4]

df_final = df_final[cols]
df_final.head()

Unnamed: 0,Latitude,Longitude,Depth(km),timestamp,xM
0,36.9,29.1,30.0,5124273000.0,4.6
1,37.2,29.0,5.0,7483082000.0,4.3
2,38.1,29.2,33.0,7487608000.0,5.0
3,38.2,29.9,5.0,9363121000.0,4.4
7,37.11,29.0,38.0,31467570000.0,4.3


In [327]:
df_final.shape

(2595, 5)

In [328]:
df_final.drop('timestamp', axis=1, inplace=True)

#### Scale the data before splitting into data sets

In [329]:
#create a standardscaler object
#scaler = StandardScaler()

#fit and transform the data
#df_scaled = scaler.fit_transform(df_final.to_numpy())


#### Split the data into training and testing set



In [330]:
#create feature array
#X = df_scaled[:, 0:4]
X = df_final[['Latitude','Longitude','Depth(km)']]
#create prediction label array
#y = df_scaled[:, 4]
y = df_final['xM']


In [331]:
#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1816, 3) (779, 3) (1816,) (779,)


In [332]:
X_train

Unnamed: 0,Latitude,Longitude,Depth(km)
23634,37.7988,29.6787,5.0
34,37.9000,29.4000,5.0
195,37.7800,29.2700,10.0
1981,37.0263,29.1757,5.0
384,37.2400,29.4100,1.0
...,...,...,...
1891,36.9888,29.2108,2.8
1286,37.6600,29.2500,19.0
1324,38.0400,28.9500,7.0
1502,37.8270,29.1135,18.7


## Building a model

In [350]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [368]:
#create the model object
model=DecisionTreeRegressor(random_state=4, max_depth=28)

In [369]:
#fit and predict the data
predictions =model.fit(X_train, y_train).predict(X_train)

In [370]:
#print predicitons
predictions[0:10]

array([3.2, 3.3, 3.1, 3.1, 3.4, 3. , 3.1, 3. , 3.2, 3.5])

In [371]:
y_test[0:10]

255      3.0
2410     3.0
1996     3.2
29258    3.3
1926     4.0
955      3.2
1261     3.6
29904    3.1
958      3.1
1564     3.4
Name: xM, dtype: float64

In [372]:
#calculate the r2 score
r2_score(y_train, predictions)

0.9837773437977249

In [373]:
y_pred_test = model.predict(X_test)

mean_absolute_error(y_test, y_pred_test)

0.29625588361146765

#### Random Forest Regressor

In [365]:
clf = RandomForestRegressor(n_estimators=30)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)

-0.16528773245112816

In [366]:
mean_absolute_error(y_test, y_pred)

0.24801428367667128

In [367]:
y_pred_test[0]

3.1