In [1]:
import numpy as np
import pandas as pd
from sklearn import svm, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# функция, считающая среднюю точность, полноту и F-меру на основе усреднения n_iters циклов обучения

def get_metrics(df,models,model_names,n_iters):
    ans = pd.DataFrame(data=[[0,0,0] for i in range(len(models))],columns=['precision','recall','f-score'])
    ans.index = model_names
    
    for i in range(n_iters):
        X_train, X_test, y_train, y_test = train_test_split(df.drop("is_earthquake",axis=1), df["is_earthquake"],
                                                                        test_size=0.2)
        metrics = []
        for mdl in models:
                mdl.fit(X_train,y_train)
                prediction = mdl.predict(X_test).round()
                metrics.append([precision_score(prediction,y_test),recall_score(prediction,y_test),f1_score(prediction,y_test)])
        for i in range(len(models)):
            ans.iloc[i,:] += metrics[i]
            
            
    return round(ans/n_iters,2)

# Загрузка и обработка данных

In [3]:
# исходный набор данных
tec_data = pd.read_csv("../data/tec_dataframe.csv")
tec_data.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1528,1529,1530,1531,1532,1533,1534,1535,region,is_earthquake
0,28.2807,28.881,29.8803,29.5125,29.5493,29.4172,29.1814,29.0805,28.93,28.8869,...,17.5615,18.4293,18.486,18.1833,18.1794,18.7324,19.4986,19.5637,South-East Asia,1
1,21.795,21.3918,22.1461,22.2006,20.3903,20.6495,20.2798,21.2442,21.2271,20.8781,...,17.031,17.1853,18.0342,18.3596,18.6613,18.0067,18.4613,18.3256,South-East Asia,1
2,4.1026,4.181,4.1546,4.2572,4.4572,4.5542,4.7896,5.2043,6.1887,8.456,...,6.029,5.9293,5.8263,5.7003,5.961,5.7943,5.7537,5.5522,South-East Asia,1
3,64.0863,62.8963,63.9086,64.2774,64.9282,64.6949,64.8394,65.5889,66.5817,65.9062,...,58.7466,59.359,59.8167,60.5706,61.3144,59.7931,58.0222,55.8211,South-East Asia,1


In [4]:
#подмножество, состоящее из данных за 48 часов до землетрясения + информации о регионе
tec_2days = pd.DataFrame(tec_data[[str(i) for i in range (768,960)] + ['region','is_earthquake']])

In [5]:
#создаем дамми-переменные из регионов
tec_2days = pd.concat([tec_2days,pd.get_dummies(tec_2days['region'])],axis=1).drop('region',axis=1)
#стандартизация данных
tec_2days = pd.concat([pd.DataFrame(preprocessing.scale(tec_2days.iloc[:,:-9],axis=1),
                                    columns=tec_2days.columns[:-9]),tec_2days.iloc[:,-9:]],axis=1)

In [12]:
# тестируемые модели с оптимальными гиперпараметрами

models = [KNeighborsRegressor(n_neighbors=15,weights='distance'),
         DecisionTreeClassifier(criterion='gini',min_samples_split=4,max_depth=12,random_state=30),
         LogisticRegression(solver='newton-cg',random_state=2),
         svm.SVC(kernel='poly',degree=10,C=3.55)]
model_names = ['knn','random forest','logreg','svm']

# Подсчитанные метрики качества

In [14]:
get_metrics(df=tec_2days,models=models,model_names=model_names,n_iters=20)

Unnamed: 0,precision,recall,f-score
knn,0.73,0.61,0.67
random forest,0.65,0.65,0.65
logreg,0.58,0.58,0.58
svm,0.74,0.76,0.75
