# Model Iteration 1 -- Earthquake Dataset

Keenan and James

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.grid_search import GridSearchCV
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
%matplotlib inline

quakes = pd.read_csv("./downloaded/data.csv")

print quakes.head()

                       time  latitude  longitude  depth  mag magType  nst  \
0  1901-08-09T18:33:00.000Z        40        144      0  7.5      mw  NaN   
1  1901-08-09T13:01:00.000Z       -22        170      0  7.9      mw  NaN   
2  1902-04-19T02:23:00.000Z        14        -91      0  7.5      mw  NaN   
3  1902-09-23T20:18:00.000Z        16        -93      0  7.8      mw  NaN   
4  1902-09-22T01:46:00.000Z        18        146      0  7.5      mw  NaN   

   gap  dmin  rms    ...                      updated  \
0  NaN   NaN  NaN    ...     2015-05-13T18:52:41.000Z   
1  NaN   NaN  NaN    ...     2015-05-13T18:52:41.000Z   
2  NaN   NaN  NaN    ...     2015-05-13T18:52:41.000Z   
3  NaN   NaN  NaN    ...     2015-05-13T18:52:41.000Z   
4  NaN   NaN  NaN    ...     2015-05-13T18:52:41.000Z   

                                       place        type horizontalError  \
0        off the east coast of Honshu, Japan  earthquake             NaN   
1           southeast of the Loyalty Islan



In [11]:
from sklearn.cross_validation import train_test_split

def cross_validate(model, X, y, cv=3):
    return cross_validation.cross_val_score(model, X, y, cv=cv).mean()

def train_test_splitter(model, X, y, train_size=0.5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)
    model.fit(X_train, y_train)
    return X_train, X_test, y_train, y_test, model

In [4]:
def magclassify(mag):
    if mag < 4:
        return 0
    elif mag < 5:
        return 1
    elif mag < 6:
        return 2
    elif mag < 7:
        return 3
    elif mag < 8:
        return 4
    elif mag >= 8:
        return 5
# "minor" "light" "moderate" "strong" "major" "great"

In [5]:
quakes["magClassified"] = quakes["mag"].apply(magclassify)
quakes["magClassified"]

0         4
1         4
2         4
3         4
4         4
5         4
6         4
7         4
8         4
9         4
10        4
11        3
12        4
13        5
14        5
15        4
16        3
17        5
18        4
19        2
20        3
21        5
22        3
23        5
24        5
25        4
26        4
27        4
28        3
29        3
         ..
686938    0
686939    1
686940    1
686941    0
686942    1
686943    0
686944    1
686945    1
686946    0
686947    0
686948    1
686949    0
686950    0
686951    0
686952    0
686953    1
686954    0
686955    1
686956    1
686957    1
686958    2
686959    1
686960    0
686961    1
686962    1
686963    1
686964    1
686965    0
686966    1
686967    2
Name: magClassified, dtype: int64

In [6]:
quakes.isnull().sum()

time                    0
latitude                0
longitude               0
depth                   9
mag                     0
magType               132
nst                318203
gap                321998
dmin               605675
rms                203437
net                     0
id                      0
updated                 0
place                  13
type                    0
horizontalError    609417
depthError         459109
magError           607459
magNst             304558
status                  0
locationSource          0
magSource               0
magClassified           0
dtype: int64

In [7]:
quakes["depth"] = quakes["depth"].fillna(quakes["depth"].median())

In [47]:
logistic = LogisticRegression()
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=4, min_samples_leaf=8)
predictors = ["latitude", "longitude", "depth"]
X_train, X_test, y_train, y_test = train_test_split(quakes[predictors], quakes.magClassified, train_size=.5)
alg.fit(X_train, y_train)
alg.score(X_test, y_test)

0.7834717192067171

In [15]:
predictors = ["latitude", "longitude", "depth"]
cross_validate(alg, quakes[predictors], quakes["magClassified"])

0.70110986538118569

In [17]:
quakes.magType.unique()

array(['mw', 'mh', 'ml', 'ms', 'mb', 'lg', nan, 'md', 'mc', 'mun', 'mblg',
       'me', 'm', 'mwc', 'mwb', 'mwr', 'Mw', 'mww', 'Ml', 'H', 'mlg', 'Md',
       'Mb', 'mb_lg', 'MbLg', 'ms_20', 'mlr'], dtype=object)

In [29]:
def classifyMagType(magType):
    duration = ["Md", "md"]
    local = ["ml"]
    shortPS = ["mb_lg", "lg", "mblg"]
    shortPeriodWave = ["mb"]
    twenty = ["ms", "ms_20"]
    moment = ["mi", "mwp"]
    energy = ["me"]
    
    mw = ["mw", "mwb", "mwc","mwr","mww"]
    magType = str(magType).lower()
    if magType in duration:
        return 0
    elif magType in local:
        return 1
    elif magType in shortPS:
        return 2
    elif magType in shortPeriodWave:
        return 3
    elif magType in twenty:
        return 4
    elif magType in moment:
        return 5
    elif magType in energy:
        return 6
    elif magType in mw:
        return 7
    else:
        return 8
    

In [30]:
quakes["magTypeClassified"] =  quakes["magType"].apply(classifyMagType)
quakes["magTypeClassified"].value_counts()

3    337271
1    156982
0     94543
7     48549
8     37633
2      7218
4      4756
6        16
Name: magTypeClassified, dtype: int64

In [38]:
predictors = ["latitude", "longitude", "depth", "magTypeClassified"]
X_train, X_test, y_train, y_test = train_test_split(quakes[predictors], quakes.magClassified, train_size=.5)
alg.fit(X_train, y_train)
alg.score(X_test, y_test)

0.8248768501589594

In [39]:
def parse_date(Dates):
    return int(Dates[0:4]), int(Dates[5:7]), int(Dates[8:10]), int(Dates[11:13])

In [44]:
quakes["year"] = quakes.time.apply(lambda x: parse_date(x)[0])
quakes["month"] = quakes.time.apply(lambda x: parse_date(x)[1])
quakes["hour"] = quakes.time.apply(lambda x: parse_date(x)[3])

In [48]:
predictors = ["latitude", "longitude", "depth", "magTypeClassified", "hour", "month","hour"]
X_train, X_test, y_train, y_test = train_test_split(quakes[predictors], quakes.magClassified, train_size=.5)
alg.fit(X_train, y_train)
alg.score(X_test, y_test)

0.84566966729163517

In [49]:
logistic = LogisticRegression(multi_class='multinomial', solver='newton-cg')
logistic.fit(X_train, y_train)
logistic.score(X_test, y_test)

0.72623761223230199

In [60]:
print quakes["type"].value_counts()
quakes["earthquake"] = quakes["type"].apply(lambda x: 1 if x == "earthquake" else 0)
quakes["earthquake"]

earthquake             682696
quarry blast             2355
nuclear explosion        1075
mining explosion          507
rock burst                163
explosion                 104
quarry                     35
mine collapse              20
sonic boom                 10
acoustic noise              1
anthropogenic event         1
landslide                   1
Name: type, dtype: int64


0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        1
25        1
26        1
27        1
28        1
29        1
         ..
686938    1
686939    1
686940    1
686941    1
686942    1
686943    1
686944    1
686945    1
686946    1
686947    1
686948    1
686949    1
686950    1
686951    1
686952    1
686953    1
686954    1
686955    1
686956    1
686957    1
686958    1
686959    1
686960    1
686961    1
686962    1
686963    1
686964    1
686965    1
686966    1
686967    1
Name: earthquake, dtype: int64

In [61]:
predictors = ["latitude", "longitude", "depth", "magTypeClassified", "hour", "month","hour"]
X_train, X_test, y_train, y_test = train_test_split(quakes[predictors], quakes.earthquake, train_size=.5)
alg.fit(X_train, y_train)
alg.score(X_test, y_test)

0.99876267890207404

In [63]:
alg.predict([3,4,4,5,6,7,8])



array([1])