In [128]:
import os
import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn import model_selection, linear_model, metrics
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

plt.rcParams.update({'figure.max_open_warning': 0})
%matplotlib inline

path_to_data_dir = os.getcwd() + '/' + 'data'

for dirname, _, filenames in os.walk(path_to_data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/home/irina/PycharmProjects/ML_SERVICE/data/train_data.csv
/home/irina/PycharmProjects/ML_SERVICE/data/test_data.csv
/home/irina/PycharmProjects/ML_SERVICE/data/sample_submission.csv


In [129]:
path_to_train_data = path_to_data_dir + '/train_data.csv'
path_to_test_data = path_to_data_dir + '/test_data.csv'

In [130]:
train = pd.read_csv(path_to_train_data, index_col = "datetime")
test = pd.read_csv(path_to_test_data, index_col = "datetime")

In [131]:
train

Unnamed: 0_level_0,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS,anomaly,changepoint
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-02-08 13:30:47,0.202394,0.275154,2.16975,0.382638,90.6454,26.8508,238.852,122.6640,0.0,0.0
2020-02-08 13:30:48,0.203153,0.277857,2.07999,-0.273216,90.7978,26.8639,227.943,122.3380,0.0,0.0
2020-02-08 13:30:50,0.202054,0.275790,2.52577,0.382638,90.7730,26.8603,223.486,121.3380,0.0,0.0
2020-02-08 13:30:51,0.203595,0.278101,2.49742,0.054711,90.8424,26.8616,244.904,121.6640,0.0,0.0
2020-02-08 13:30:52,0.201889,0.276363,2.29194,0.710565,90.6664,26.8603,239.196,122.0000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2020-03-09 12:01:19,0.027307,0.041533,1.22669,0.054711,70.4256,25.1891,225.232,32.9913,0.0,0.0
2020-03-09 12:01:20,0.026487,0.040581,1.09368,0.054711,70.3100,25.1926,232.083,32.0000,0.0,0.0
2020-03-09 12:01:21,0.027776,0.040970,1.32421,0.382638,70.8821,25.1826,246.925,32.0000,0.0,0.0
2020-03-09 12:01:22,0.027350,0.041147,1.19984,0.054711,70.7591,25.1899,233.400,31.9913,0.0,0.0


In [132]:
train.shape

(29283, 10)

In [133]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29283 entries, 2020-02-08 13:30:47 to 2020-03-09 12:01:23
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Accelerometer1RMS    29283 non-null  float64
 1   Accelerometer2RMS    29283 non-null  float64
 2   Current              29283 non-null  float64
 3   Pressure             29283 non-null  float64
 4   Temperature          29283 non-null  float64
 5   Thermocouple         29283 non-null  float64
 6   Voltage              29283 non-null  float64
 7   Volume Flow RateRMS  29283 non-null  float64
 8   anomaly              29283 non-null  float64
 9   changepoint          29283 non-null  float64
dtypes: float64(10)
memory usage: 2.5+ MB


In [134]:
train.describe()

Unnamed: 0,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS,anomaly,changepoint
count,29283.0,29283.0,29283.0,29283.0,29283.0,29283.0,29283.0,29283.0,29283.0,29283.0
mean,0.184486,0.229066,2.018234,0.091778,84.987274,27.594279,229.001562,99.157951,0.243623,0.002356
std,0.128585,0.142528,0.736483,0.257505,6.555196,2.638584,10.965779,39.629194,0.429275,0.048486
min,0.015752,0.015505,0.149842,-1.257,67.9737,22.0209,200.744,0.555512,0.0,0.0
25%,0.080676,0.131005,1.32709,0.054711,85.13615,25.65815,222.9735,74.9789,0.0,0.0
50%,0.21268,0.265831,2.16485,0.054711,87.9532,28.6975,229.179,125.329,0.0,0.0
75%,0.22759,0.275667,2.653845,0.382638,89.0589,29.4038,235.317,126.681,0.0,0.0
max,0.722747,0.800498,3.31837,1.36642,95.0114,33.4151,255.324,133.688,1.0,1.0


In [135]:
test

Unnamed: 0_level_0,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-09 14:09:56,0.027808,0.039929,1.211780,0.054711,68.7436,24.7123,235.085,27.9770
2020-03-09 14:09:57,0.027856,0.040478,0.705363,0.054711,68.7862,24.7063,233.554,27.0239
2020-03-09 14:09:58,0.027570,0.040893,0.720498,-0.601143,68.8452,24.7131,212.624,27.9770
2020-03-09 14:09:59,0.027512,0.039550,1.098810,0.054711,68.5868,24.7123,213.072,27.0239
2020-03-09 14:10:00,0.027432,0.039926,1.365020,-0.273216,68.7589,24.7075,243.961,27.9770
...,...,...,...,...,...,...,...,...
2020-03-09 17:14:04,0.026853,0.038926,0.740614,0.054711,69.6371,24.1045,237.276,32.0451
2020-03-09 17:14:05,0.027067,0.038430,0.988875,0.054711,69.6731,24.1046,230.729,32.9562
2020-03-09 17:14:07,0.027582,0.038836,0.588439,0.054711,69.6959,24.1020,233.443,32.0000
2020-03-09 17:14:08,0.027406,0.038133,0.989732,-0.273216,69.6293,24.1020,238.930,32.0000


In [136]:
X_train,y_train = train.iloc[:,:-2], train.iloc[:,-2]

In [240]:
X_train['Volume Flow RateRMS nean'] = X_train['Volume Flow RateRMS'].rolling(window = 10, min_periods = 0).mean()
test['Volume Flow RateRMS nean'] = test['Volume Flow RateRMS'].rolling(window = 10, min_periods = 0).mean()
X_train['Volume Flow RateRMS nean 20'] = X_train['Volume Flow RateRMS'].rolling(window = 20, min_periods = 0).mean()
test['Volume Flow RateRMS nean 20'] = test['Volume Flow RateRMS'].rolling(window = 20, min_periods = 0).mean()
X_train['Volume Flow RateRMS nean 30'] = X_train['Volume Flow RateRMS'].rolling(window = 30, min_periods = 0).mean()
test['Volume Flow RateRMS nean 30'] = test['Volume Flow RateRMS'].rolling(window = 30, min_periods = 0).mean()

X_train['Volume Flow RateRMS shift'] = X_train['Volume Flow RateRMS'].shift(-10).rolling(window = 20, min_periods = 0).mean()
test['Volume Flow RateRMS shift'] = test['Volume Flow RateRMS'].shift(-10).rolling(window = 20, min_periods = 0).mean()


In [266]:
model = LGBMClassifier(random_state=1, max_depth=3, n_estimators = 20000, num_leaves=20, learning_rate=0.01)
model.fit(X_train, y_train)
y_pred=model.predict(X_train)
print(f'Качество на тренировочной: {f1_score(y_true=y_train, y_pred=y_pred)}')

[LightGBM] [Info] Number of positive: 7134, number of negative: 22149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4808
[LightGBM] [Info] Number of data points in the train set: 29283, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243623 -> initscore=-1.132920
[LightGBM] [Info] Start training from score -1.132920
Качество на тренировочной: 0.9992991309223437


In [274]:
y_pred=pd.Series(model.predict(test), name = "anomaly")

In [275]:
y_pred.index = test.index
y_pred.index = np.arange(0,9150)
y_pred = y_pred.reset_index()
y_pred.columns = ["id", "anomaly"]

In [276]:
y_pred

Unnamed: 0,id,anomaly
0,0,1.0
1,1,1.0
2,2,1.0
3,3,0.0
4,4,0.0
...,...,...
9145,9145,0.0
9146,9146,0.0
9147,9147,0.0
9148,9148,0.0


In [277]:
y_pred["anomaly"].value_counts()

anomaly
0.0    6967
1.0    2183
Name: count, dtype: int64

In [278]:
y_pred.to_csv("submission.csv", index = False)

In [279]:
import joblib

In [280]:
joblib.dump(model, "model.pkl")

['model.pkl']