# Model Iteration 1 -- Earthquake Dataset

Keenan and James

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV

%matplotlib inline

# Load the earthquake records from the CSV file that sits next to the notebook.
quakes = pd.read_csv("./data.csv")

# Preview the first rows. A parenthesised single argument prints the same text
# with the Python 2 print statement and the Python 3 print function.
print(quakes.head())

IOError: File ./downloaded/data.csv does not exist

We wrote some helper functions so that we can quickly test how much progress our models have made.

In [None]:
from sklearn.cross_validation import train_test_split

def cross_validate(model, X, y, cv=3):
    """Return the mean cross-validation score of ``model`` on ``(X, y)``.

    Scoring is delegated to ``cross_validation.cross_val_score``; ``cv`` is
    forwarded to it unchanged and defaults to 3.
    """
    fold_scores = cross_validation.cross_val_score(model, X, y, cv=cv)
    return fold_scores.mean()

def train_test_splitter(model, X, y, train_size=0.5, random_state=None):
    """Split ``(X, y)`` into train/test parts and fit ``model`` on the train part.

    Args:
        model: estimator exposing ``fit(X, y)``.
        X: features, passed to ``train_test_split``.
        y: targets, passed to ``train_test_split`` together with ``X``.
        train_size: forwarded to ``train_test_split`` (default 0.5).
        random_state: forwarded to ``train_test_split``. The default ``None``
            keeps the previous unseeded behaviour; pass an int to obtain the
            same split on every call so that scores can be compared.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test, model)``, where
        ``model`` is the object passed in, after ``fit`` was called on it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    return X_train, X_test, y_train, y_test, model

### Feature engineering

Because magnitude is a continuous variable, it is hard to predict exactly. We decided to group the magnitudes into the standard classes provided by USGS [here](http://www.geo.mtu.edu/UPSeis/magnitude.html). We thought this categorization would simplify the problem, and it makes sense for prediction because giving an exact magnitude would be very hard.

In [None]:
def magclassify(mag):
    """Map a magnitude to an integer class between 0 and 5.

    Values below 4 give 0; each further whole unit up to 8 raises the class
    by one; 8 and above give 5. A value for which every comparison is false
    (for example NaN) gives ``None``.
    """
    # Exclusive upper bounds of classes 0..4, smallest first.
    for label, upper_bound in enumerate((4, 5, 6, 7, 8)):
        if mag < upper_bound:
            return label
    if mag >= 8:
        return 5
    # Reached only when no comparison succeeded (e.g. NaN).
    return None
# "minor" "light" "moderate" "strong" "major" "great"

In [None]:
# Compute the integer magnitude class of every row, store it as a new column,
# then show that column as the cell output.
mag_classes = quakes["mag"].apply(magclassify)
quakes["magClassified"] = mag_classes
quakes["magClassified"]

In [None]:
# Count the missing values in each column.
missing_mask = quakes.isnull()
missing_mask.sum()

We found that there were some null values in the depth column, so we decided to fill them with the median, since only 9 points are missing.

In [None]:
# Fill the missing depth values with the median of the depth column.
depth_median = quakes["depth"].median()
quakes["depth"] = quakes["depth"].fillna(depth_median)

We started our training with two simple models: LogisticRegression and RandomForestClassifier.

In [None]:
# Baseline estimators. NOTE(review): `logistic` is created here but is not
# fitted anywhere in this cell.
logistic = LogisticRegression()
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=4, min_samples_leaf=8)
predictors = ["latitude", "longitude", "depth"]
# Fixed seed: the same rows land in the train and test halves on every run,
# so this hold-out score is reproducible and can be compared with scores
# obtained from other feature sets.
X_train, X_test, y_train, y_test = train_test_split(
    quakes[predictors], quakes.magClassified, train_size=.5, random_state=1
)
alg.fit(X_train, y_train)
# Score on the held-out half (shown as the cell output).
alg.score(X_test, y_test)

In [None]:
# Mean cross-validated score of the random forest on the location features.
predictors = ["latitude", "longitude", "depth"]
features, target = quakes[predictors], quakes["magClassified"]
cross_validate(alg, features, target)

Another feature we thought would be interesting was the magnitude type. We encoded it using the lookup table [here](http://earthquake.usgs.gov/earthquakes/eventpage/terms.php).

In [None]:
# List the distinct magnitude-type labels present in the data.
quakes["magType"].unique()

In [None]:
def classifyMagType(magType):
    """Encode a magnitude-type label as an integer category from 0 to 8.

    The argument is converted with ``str`` and lower-cased before the lookup,
    so matching is case-insensitive. Any value whose lower-cased string form
    is not listed below maps to 8.
    """
    # Keys are lower-case only: the input is lower-cased first, so a
    # mixed-case key could never match.
    code_by_label = {
        # 0: duration
        "md": 0,
        # 1: local
        "ml": 1,
        # 2: shortPS
        "mb_lg": 2,
        "lg": 2,
        "mblg": 2,
        # 3: shortPeriodWave
        "mb": 3,
        # 4: twenty
        "ms": 4,
        "ms_20": 4,
        # 5: moment
        "mi": 5,
        "mwp": 5,
        # 6: energy
        "me": 6,
        # 7: mw
        "mw": 7,
        "mwb": 7,
        "mwc": 7,
        "mwr": 7,
        "mww": 7,
    }
    normalized = str(magType).lower()
    return code_by_label.get(normalized, 8)
    

In [None]:
# Encode every magnitude-type label as an integer category, store it as a new
# column, then show how many rows fall into each category.
mag_type_codes = quakes["magType"].apply(classifyMagType)
quakes["magTypeClassified"] = mag_type_codes
quakes["magTypeClassified"].value_counts()

We were able to increase our score by 4 percent by adding `magTypeClassified`.

In [None]:
# Refit the random forest with the encoded magnitude type added as a feature.
predictors = ["latitude", "longitude", "depth", "magTypeClassified"]
# Fixed seed: the same rows land in the train and test halves on every run,
# so this hold-out score is reproducible and can be compared with scores
# obtained from other feature sets.
X_train, X_test, y_train, y_test = train_test_split(
    quakes[predictors], quakes.magClassified, train_size=.5, random_state=1
)
alg.fit(X_train, y_train)
# Score on the held-out half (shown as the cell output).
alg.score(X_test, y_test)

Time is definitely an important variable for our model, so we decided to add year, month, and hour columns to see whether they help our model predict better.

In [None]:
def parse_date(Dates):
    """Extract four integer fields from fixed positions of a timestamp string.

    Reads the character ranges [0:4], [5:7], [8:10] and [11:13] of ``Dates``
    and converts each with ``int``. The caller in this file uses them as
    year, month, (unused) and hour.

    Returns:
        A tuple of four ints, in the order of the ranges above.
    """
    field_slices = (slice(0, 4), slice(5, 7), slice(8, 10), slice(11, 13))
    return tuple(int(Dates[field]) for field in field_slices)

In [None]:
# Add one column per time field of interest. Each pair names the new column
# and the position of that field in the tuple returned by parse_date.
for column, position in (("year", 0), ("month", 1), ("hour", 3)):
    quakes[column] = quakes.time.apply(
        lambda x, position=position: parse_date(x)[position]
    )

Adding the time variables increased our performance by 3 percent.

In [None]:
# Refit the random forest with the year, month and hour columns added.
predictors = ["latitude", "longitude", "depth", "magTypeClassified", "year", "month", "hour"]
# Fixed seed: the same rows land in the train and test halves on every run,
# so this hold-out score is reproducible and can be compared with scores
# obtained from other feature sets.
X_train, X_test, y_train, y_test = train_test_split(
    quakes[predictors], quakes.magClassified, train_size=.5, random_state=1
)
alg.fit(X_train, y_train)
# Score on the held-out half (shown as the cell output).
alg.score(X_test, y_test)

In [None]:
from geopy.geocoders import Nominatim

# Reverse-geocode one hard-coded "latitude,longitude" string and print the
# address that comes back.
geolocator = Nominatim()
# Alternative coordinate, left disabled:
# location = geolocator.reverse("-6.0326, 103.7117")
location = geolocator.reverse("42.2926850,-71.2644100")
# A parenthesised single argument prints the same text with the Python 2
# print statement and the Python 3 print function.
print(location.address)

In [None]:
# Flag the rows whose place text contains "Ridge" as a whitespace-separated
# word, then show how many rows are flagged and how many are not.
quakes["fault"] = quakes["place"].apply(lambda place: "Ridge" in str(place).split())
quakes["fault"].value_counts()
