## Data processing

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

In [3]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

In [4]:
df=pd.read_pickle('/home/centos/data/typed_all_v1_2.pkl')
del df['licenseInfo.drmSystem']

### remove data 'timestamp' == 0

In [5]:
df = df[df['timestamp']!=0]

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.index = df['timestamp']
del df['timestamp']

In [7]:
df = df.sort_index()

### error code 종류와 빈도수 

In [8]:
df['errorCode'].value_counts()

 0.000000e+00    17861
-1.007020e+05      440
-3.800000e+01      238
-2.005000e+03       38
-1.004000e+03       37
-1.002000e+03       22
-1.008700e+05       14
-1.008740e+05        7
-1.008590e+05        3
-1.200000e+01        1
-1.014160e+05        1
-2.147484e+09        1
Name: errorCode, dtype: int64

### errorcode==0,networkErrors == 0 , mnc == 10 제거 -> 시간별 최빈값으로 resampling하려고할때 이 값들이 모두 대표값이 되어버림

In [9]:
df_errors = df[df['errorCode']!=0.0]
df_networkErrors = df[df['networkErrors']!=0.0]
df_mnc = df[df['networkInfo.carrier.mnc']!= '10']

In [10]:
df_mnc['networkInfo.carrier.mnc']

timestamp
2018-11-06 02:14:03    51
2018-11-06 02:15:04    50
2018-11-06 02:19:40    51
2018-11-06 02:19:54    51
2018-11-06 02:20:25    51
2018-11-06 02:20:35    51
2018-11-06 02:20:39    51
2018-11-06 02:21:19    51
2018-11-06 02:24:28    51
2018-11-06 02:24:45    51
2018-11-06 02:25:46    50
2018-11-06 02:28:25    51
2018-11-06 02:29:36    20
2018-11-06 02:30:01    51
2018-11-06 02:30:46    51
2018-11-06 02:31:17    20
2018-11-06 02:31:59     0
2018-11-06 02:35:30    51
2018-11-06 02:36:12    51
2018-11-06 02:36:38    51
2018-11-06 02:38:20    51
2018-11-06 02:41:24    51
2018-11-06 02:43:20    51
2018-11-06 02:49:02    51
2018-11-06 02:49:21    20
2018-11-06 02:50:17    20
2018-11-06 02:50:20    20
2018-11-06 02:50:27    20
2018-11-06 02:51:10    51
2018-11-06 02:52:20    20
                       ..
2018-11-07 23:21:46    20
2018-11-07 23:23:28    20
2018-11-07 23:27:26    51
2018-11-07 23:28:06    51
2018-11-07 23:29:47    51
2018-11-07 23:30:01    20
2018-11-07 23:34:03    51
20

### 15분 단위로 grouping 하고 그 시간의 mean값을 대표값으로 저장(numeric value)

In [11]:
numeric_mean_value=df.groupby(pd.Grouper(freq='15T') ).mean()

### 15분 단위로 grouping 하고 그 시간의 최빈값을 대표값으로 저장( categorical value)

In [12]:
top_errorCode=df_errors['errorCode'].groupby(pd.Grouper(freq='15T') ).describe().top
top_networkErrors=df_networkErrors['networkErrors'].groupby(pd.Grouper(freq='15T') ).describe().top
top_mnc=df_mnc['networkInfo.carrier.mnc'].groupby(pd.Grouper(freq='15T') ).describe().top
top_networkType=df['networkInfo.type'].groupby(pd.Grouper(freq='15T') ).describe().top
top_contentType=df['content_type'].groupby(pd.Grouper(freq='15T') ).describe().top
top_device=df['device'].groupby(pd.Grouper(freq='15T') ).describe().top


### 15분치 데이터 대표값으로 resampling

In [13]:
result=pd.concat([numeric_mean_value, top_contentType, top_device, top_errorCode, top_networkErrors, top_mnc, top_networkType], axis=1, sort=False)

In [14]:
result.columns = ['estimatedBandwidth', 'fragmentSum.bitrate', 'fragmentSum.downloadTime',
       'fragmentSum.duration', 'fragmentSum.fragmentIndex', 'fragmentSum.size',
       'frameDropped', 'licenseInfo.elapsedTime', 'maxDecodingTime',
       'position', 'qualityChangedCount', 'bufferingTime', 'avgDecodingTime',
       'bufferedDuration', 'currentPosition', 'top_contentType', 'top_device', 'top_errorCode', 'top_networkErrors', 'top_mnc', 'top_networkType']

### 15분동안 가장 많이 발생한 한 값(e.g. errorCode == 0) 들을 원래값 (0)으로 채운다 -> 이래도 되나?

In [15]:
result.top_errorCode = result.top_errorCode.fillna(0)
result.top_networkErrors=result.top_networkErrors.fillna(0)

In [16]:
result.dtypes

estimatedBandwidth           float64
fragmentSum.bitrate          float64
fragmentSum.downloadTime     float64
fragmentSum.duration         float64
fragmentSum.fragmentIndex    float64
fragmentSum.size             float64
frameDropped                 float64
licenseInfo.elapsedTime      float64
maxDecodingTime              float64
position                     float64
qualityChangedCount          float64
bufferingTime                float64
avgDecodingTime              float64
bufferedDuration             float64
currentPosition              float64
top_contentType                int64
top_device                     int64
top_errorCode                float64
top_networkErrors            float64
top_mnc                       object
top_networkType                int64
dtype: object

In [17]:
result.top_contentType = result.top_contentType.astype('float64')
result.top_device = result.top_device.astype('float64')
result.top_networkType = result.top_networkType.astype('float64')
result.top_mnc = result.top_mnc.astype('float64')

### x, y data 할당

In [18]:
y_data=result.top_errorCode.values
del result['top_errorCode']
x_data = result.values

In [19]:
x_data.shape

(185, 20)

In [20]:
y_data = y_data.reshape(185,1)

In [21]:
result.dtypes

estimatedBandwidth           float64
fragmentSum.bitrate          float64
fragmentSum.downloadTime     float64
fragmentSum.duration         float64
fragmentSum.fragmentIndex    float64
fragmentSum.size             float64
frameDropped                 float64
licenseInfo.elapsedTime      float64
maxDecodingTime              float64
position                     float64
qualityChangedCount          float64
bufferingTime                float64
avgDecodingTime              float64
bufferedDuration             float64
currentPosition              float64
top_contentType              float64
top_device                   float64
top_networkErrors            float64
top_mnc                      float64
top_networkType              float64
dtype: object

In [22]:
from sklearn import preprocessing # Min-Max Standardzation

min_max_scaler = preprocessing.MinMaxScaler()
x_data = min_max_scaler.fit_transform(x_data)


training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

x_train, x_test = x_data[training_idx,:], x_data[test_idx,:]


In [23]:
y_train, y_test = y_data[training_idx,:], y_data[test_idx,:]

x_train.shape, x_test.shape

((148, 20), (37, 20))

In [24]:
from sklearn import tree
from sklearn import metrics
result_tree = tree.DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=11)
result_tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')

In [25]:
y_pred_tr =result_tree.predict(x_test)
print('Accuracy: %.3f' % metrics.accuracy_score(y_test, y_pred_tr))

Accuracy: 0.676


In [26]:
print(metrics.classification_report(y_test, y_pred_tr))

              precision    recall  f1-score   support

   -100874.0       0.00      0.00      0.00         1
   -100870.0       0.00      0.00      0.00         1
   -100702.0       0.82      0.88      0.85        16
     -1004.0       1.00      0.60      0.75         5
       -38.0       0.40      0.86      0.55         7
         0.0       1.00      0.29      0.44         7

   micro avg       0.68      0.68      0.68        37
   macro avg       0.54      0.44      0.43        37
weighted avg       0.76      0.68      0.66        37



  'precision', 'predicted', average, warn_for)


In [27]:

from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
np.save('x_train.npy', x_train)

In [None]:
np.save('x_test.npy', x_test)

In [None]:
np.save('y_train.npy', y_train)

In [None]:
np.save('y_test.npy', y_test)

In [None]:
x_data

In [None]:
result

In [None]:
type(y_data)

In [None]:
result.to_pickle('dataframe.pkl')

In [32]:
clf = RandomForestClassifier(n_estimators=100, max_depth=11,
                             random_state=0)
clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
print(metrics.classification_report(y_test, y_pred))
print('Accuracy: %.2f' % metrics.accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

   -100874.0       0.00      0.00      0.00         1
   -100870.0       0.00      0.00      0.00         1
   -100702.0       0.76      0.81      0.79        16
     -1004.0       1.00      0.60      0.75         5
       -38.0       0.33      0.57      0.42         7
         0.0       1.00      0.71      0.83         7

   micro avg       0.68      0.68      0.68        37
   macro avg       0.52      0.45      0.47        37
weighted avg       0.72      0.68      0.68        37

Accuracy: 0.68


  This is separate from the ipykernel package so we can avoid doing imports until
  'precision', 'predicted', average, warn_for)
