The business objective was to build a predictive model to predict service disruptions based on the data generated by multiple devices.

## Importing Modules and Datasets 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

In [2]:
event = pd.read_csv('event_type.csv')
log = pd.read_csv('log_feature.csv')
resource = pd.read_csv('resource_type.csv')
severity = pd.read_csv('severity_type.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31170 entries, 0 to 31169
Data columns (total 2 columns):
id            31170 non-null int64
event_type    31170 non-null object
dtypes: int64(1), object(1)
memory usage: 487.1+ KB


In [4]:
event.head()

Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11


In [5]:
log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58671 entries, 0 to 58670
Data columns (total 3 columns):
id             58671 non-null int64
log_feature    58671 non-null object
volume         58671 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


In [6]:
log.head()

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [7]:
resource.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21076 entries, 0 to 21075
Data columns (total 2 columns):
id               21076 non-null int64
resource_type    21076 non-null object
dtypes: int64(1), object(1)
memory usage: 329.4+ KB


In [8]:
resource.head()

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [9]:
severity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18552 entries, 0 to 18551
Data columns (total 2 columns):
id               18552 non-null int64
severity_type    18552 non-null object
dtypes: int64(1), object(1)
memory usage: 290.0+ KB


In [10]:
severity.head()

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7381 entries, 0 to 7380
Data columns (total 3 columns):
id                7381 non-null int64
location          7381 non-null object
fault_severity    7381 non-null int64
dtypes: int64(2), object(1)
memory usage: 173.1+ KB


In [12]:
train.head()

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


The datasets contain various information such as ID, event_type, log_feature, volume, resource_type, severity_type, location, and fault_severity.

## Data Preparation

In [13]:
#Preparing for testing the prediction model 
fault_severity = train['fault_severity']

In [14]:
del train['fault_severity']

In [15]:
df1 = train.merge(event, on='id')
df1 = df1.merge(resource, on='id')
df1 = df1.merge(severity, on='id')
df1 = df1.merge(log, on='id')

All the datasets are merged into a single DataFrame by merging them together by ID. 

In [16]:
df1.head(5)

Unnamed: 0,id,location,event_type,resource_type,severity_type,log_feature,volume
0,14121,location 118,event_type 34,resource_type 2,severity_type 2,feature 312,19
1,14121,location 118,event_type 34,resource_type 2,severity_type 2,feature 232,19
2,14121,location 118,event_type 35,resource_type 2,severity_type 2,feature 312,19
3,14121,location 118,event_type 35,resource_type 2,severity_type 2,feature 232,19
4,9320,location 91,event_type 34,resource_type 2,severity_type 2,feature 315,200


In [17]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Data columns (total 7 columns):
id               61839 non-null int64
location         61839 non-null object
event_type       61839 non-null object
resource_type    61839 non-null object
severity_type    61839 non-null object
log_feature      61839 non-null object
volume           61839 non-null int64
dtypes: int64(2), object(5)
memory usage: 3.8+ MB


In [18]:
df1['event_type'] = df1['event_type'].str.split(" ", expand=True)[1]
df1['log_feature'] = df1['log_feature'].str.split(" ", expand=True)[1]
df1['resource_type'] = df1['resource_type'].str.split(" ", expand=True)[1]
df1['severity_type'] = df1['severity_type'].str.split(" ", expand=True)[1]
df1['location'] = df1['location'].str.split(" ", expand=True)[1]

To prepare for the the prediction model, the text is removed from variables.

In [19]:
df1.head()

Unnamed: 0,id,location,event_type,resource_type,severity_type,log_feature,volume
0,14121,118,34,2,2,312,19
1,14121,118,34,2,2,232,19
2,14121,118,35,2,2,312,19
3,14121,118,35,2,2,232,19
4,9320,91,34,2,2,315,200


In [20]:
df1['severity_type'].value_counts()

1    36571
2    24260
4      920
5       55
3       33
Name: severity_type, dtype: int64

In [21]:
#Create dummy variables 
df_dummies = pd.get_dummies(df1, drop_first=True)

In [22]:
#Using a groupby to calculate the sums of the features which are sorted by ID 
df_group = df_dummies.groupby(['id'],sort=False).sum()
df_group.head()

Unnamed: 0_level_0,volume,location_10,location_100,location_1000,location_1002,location_1005,location_1006,location_1007,location_1008,location_1009,...,log_feature_9,log_feature_90,log_feature_91,log_feature_92,log_feature_94,log_feature_95,log_feature_96,log_feature_97,log_feature_98,log_feature_99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14121,76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9320,632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14394,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8218,44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14804,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df_group.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7381 entries, 14121 to 17067
Columns: 1320 entries, volume to log_feature_99
dtypes: float64(1319), int64(1)
memory usage: 74.4 MB


In [24]:
#Prepare for for prediction
X = df_group
y  = fault_severity

In [25]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [26]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((5904, 1320), (1477, 1320), (5904,), (1477,))

## Modeling for Prediction 

In [27]:
#Import Libraries for Modeling 
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
from sklearn.model_selection import cross_val_score as cvs
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [28]:
gbc.fit(x_train,y_train)
#Examining the base base parameters of Gradient Boosting Classifier 

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [29]:
param_test1 = {'n_estimators': range(50,201,50)}

Using Gradient Boosting Classifier to help model our prediction, I wanted to optimize the model by examining various parameters to strengthen the prediction model. 

In [30]:
gbc1 = GridSearchCV(estimator = gbc, param_grid = param_test1, cv = 5)

In [31]:
gbc1.fit(x_train, y_train)
gbc1

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(50, 201, 50)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
y_prediction = gbc1.predict(x_test)
y_prediction

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [33]:
gbc1.cv_results_, gbc1.best_params_, gbc1.best_score_



({'mean_fit_time': array([ 9.28454757, 15.90244832, 21.68318119, 27.40917988]),
  'std_fit_time': array([0.29414807, 0.31724356, 0.32423323, 0.29410035]),
  'mean_score_time': array([0.01634941, 0.02113047, 0.02452431, 0.02864366]),
  'std_score_time': array([0.00135995, 0.00117608, 0.00510987, 0.00337314]),
  'param_n_estimators': masked_array(data=[50, 100, 150, 200],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'n_estimators': 50},
   {'n_estimators': 100},
   {'n_estimators': 150},
   {'n_estimators': 200}],
  'split0_test_score': array([0.74365482, 0.74111675, 0.74450085, 0.74957699]),
  'split1_test_score': array([0.71658206, 0.71489002, 0.71404399, 0.71319797]),
  'split2_test_score': array([0.71634208, 0.7188823 , 0.723116  , 0.72904318]),
  'split3_test_score': array([0.73474576, 0.73644068, 0.74067797, 0.73728814]),
  'split4_test_score': array([0.7336726 , 0.73706531, 0.7336726 , 0.73706531]),
  'mean_t

The best score of Gradient Boosting Classifier was when the n_estimators = 200.

In [34]:
#Create a Classifier Prediction of Fault Severity 
y_prediction = gbc1.best_estimator_.predict(x_test)
y_prediction

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [35]:
#Creating Various Probabilities of Fault Severity  
y_predproba = gbc1.best_estimator_.predict_proba(x_test)
y_predproba

array([[0.53678503, 0.39888393, 0.06433104],
       [0.34455574, 0.63334398, 0.02210028],
       [0.71165125, 0.24802978, 0.04031897],
       ...,
       [0.91960681, 0.07357762, 0.00681558],
       [0.94705829, 0.04517766, 0.00776405],
       [0.22332362, 0.46837942, 0.30829696]])

In [36]:
#DataFrame of the predictions by ID 
result = pd.DataFrame({
        "id": x_test.index,
        "Predicted_fault_severity": y_prediction,
        "Prediction_probability_0": y_predproba[:,0],
        "Prediction_probability_1": y_predproba[:,1],
        "Prediction_probability_2": y_predproba[:,2],
        }, columns = ['id','Predicted_fault_severity','Prediction_probability_0','Prediction_probability_1','Prediction_probability_2'])

In [37]:
result.head()

Unnamed: 0,id,Predicted_fault_severity,Prediction_probability_0,Prediction_probability_1,Prediction_probability_2
0,11694,0,0.536785,0.398884,0.064331
1,17267,1,0.344556,0.633344,0.0221
2,15697,0,0.711651,0.24803,0.040319
3,13556,0,0.547739,0.445065,0.007196
4,7208,0,0.854967,0.12787,0.017164


In [38]:
#Convert DataFrame to CSV File 
result.to_csv('fault_severity_final.csv', index=False)