# Telstra Network Disruptions

Predicting service faults on Australia's largest telecommunications network

Prepared by Lee Eyler

### Importing Data / Exploring the Datasets

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [3]:
pd.set_option('display.max_columns', 400)
pd.set_option('display.max_rows',200)

In [4]:
# read in data
train = pd.read_csv('train.csv')
test = pd.read_csv('test2.csv')
severity_type = pd.read_csv('severity_type.csv')
resource_type = pd.read_csv('resource_type.csv')
log_feature = pd.read_csv('log_feature.csv')
event_type = pd.read_csv('event_type.csv')

In [231]:
# lengths of datasets
len(train), len(test)

(7381, 11171)

In [232]:
# we will need the lists of ids for filtering purposes thoughout the notebook
train_ids = train.id.tolist()
test_ids = test.id.tolist()

In [10]:
# based on the training data, a fault_severity of 0 is the most likely to occur
train.fault_severity.value_counts()

0    4784
1    1871
2     726
Name: fault_severity, dtype: int64

In [11]:
# unique values for the various data sets
len(train.location.unique()),len(severity_type.severity_type.unique()),len(resource_type.resource_type.unique()),len(log_feature.log_feature.unique()),len(event_type.event_type.unique())

(929, 5, 10, 386, 53)

In [12]:
# ensuring that all ids are accounted for in each data set
len(severity_type.id.unique()),len(resource_type.id.unique()),len(log_feature.id.unique()),len(event_type.id.unique())

(18552, 18552, 18552, 18552)

In [13]:
# exploring the distribution of severity types
severity_type.severity_type.value_counts()

severity_type 2    8737
severity_type 1    8728
severity_type 4    1014
severity_type 5      65
severity_type 3       8
Name: severity_type, dtype: int64

In [14]:
# exploring the distribution of resource types
resource_type.resource_type.value_counts()

resource_type 8     10268
resource_type 2      8918
resource_type 6       582
resource_type 7       498
resource_type 4       330
resource_type 9       190
resource_type 3       145
resource_type 10       73
resource_type 1        58
resource_type 5        14
Name: resource_type, dtype: int64

In [15]:
# exploring the distribution of feature types
log_feature.log_feature.value_counts()

feature 312    5267
feature 232    4754
feature 82     3472
feature 203    2823
feature 313    2145
feature 233    1901
feature 307    1597
feature 54     1573
feature 170    1526
feature 71     1514
feature 315    1495
feature 134    1419
feature 80     1336
feature 235    1294
feature 193    1160
feature 219    1152
feature 68     1093
feature 227    1080
feature 314     950
feature 201     902
feature 234     882
feature 73      868
feature 195     783
feature 301     707
feature 309     627
feature 55      564
feature 229     560
feature 273     491
feature 308     484
feature 368     462
feature 376     460
feature 171     459
feature 228     438
feature 283     420
feature 306     412
feature 291     401
feature 310     399
feature 230     367
feature 70      364
feature 345     321
feature 81      305
feature 191     301
feature 375     295
feature 202     277
feature 56      244
feature 221     235
feature 172     214
feature 209     209
feature 179     207
feature 87      207


In [16]:
# exploring the distribution of event types
event_type.event_type.value_counts()

event_type 11    7888
event_type 35    6615
event_type 34    5927
event_type 15    4395
event_type 20    1458
event_type 54     684
event_type 13     582
event_type 42     478
event_type 44     466
event_type 23     429
event_type 14     330
event_type 43     306
event_type 22     223
event_type 50     154
event_type 10     145
event_type 21     136
event_type 18      73
event_type 47      69
event_type 26      65
event_type 32      63
event_type 30      60
event_type 45      53
event_type 24      46
event_type 27      44
event_type 29      42
event_type 40      40
event_type 46      38
event_type 2       37
event_type 28      32
event_type 8       29
event_type 6       28
event_type 5       26
event_type 7       24
event_type 3       19
event_type 38      19
event_type 36      18
event_type 39      18
event_type 49      17
event_type 53      17
event_type 9       14
event_type 19      14
event_type 31      10
event_type 37      10
event_type 12       6
event_type 25       5
event_type

##### Main Takeaways from Brief Data Exploration

The distribution of class labels is a bit skewed.  

All categorical variables will need to be tranformed into dummy variables.

Most of the feature data sets seem to have edge cases.  If there was time to experiment with feature engineering, I would recommend trying to group together some of the smaller volume categories in order to reduce dimensionality. 

It would also be interesting to try another dimensionality technique such as PCA or LDA.


## Creating Dummy Variables for Categorical Data

###### Severity Type One Hot Encoding

In [18]:
severity_type_ohe = pd.get_dummies(severity_type[['id','severity_type']])
severity_type_ohe.head(3)

Unnamed: 0,id,severity_type_severity_type 1,severity_type_severity_type 2,severity_type_severity_type 3,severity_type_severity_type 4,severity_type_severity_type 5
0,6597,0.0,1.0,0.0,0.0,0.0
1,8011,0.0,1.0,0.0,0.0,0.0
2,2597,0.0,1.0,0.0,0.0,0.0


In [19]:
len(severity_type_ohe)

18552

##### Resource Type One Hot Encoding

In [20]:
resource_type_ohe = pd.get_dummies(resource_type[['id','resource_type']])
resource_type_ohe.head(3)

Unnamed: 0,id,resource_type_resource_type 1,resource_type_resource_type 10,resource_type_resource_type 2,resource_type_resource_type 3,resource_type_resource_type 4,resource_type_resource_type 5,resource_type_resource_type 6,resource_type_resource_type 7,resource_type_resource_type 8,resource_type_resource_type 9
0,6597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,8011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
len(resource_type_ohe)

21076

In [22]:
resource_type_ohe_grp = resource_type_ohe.groupby('id', as_index=False).count()
resource_type_ohe_grp.head(3)

Unnamed: 0,id,resource_type_resource_type 1,resource_type_resource_type 10,resource_type_resource_type 2,resource_type_resource_type 3,resource_type_resource_type 4,resource_type_resource_type 5,resource_type_resource_type 6,resource_type_resource_type 7,resource_type_resource_type 8,resource_type_resource_type 9
0,1,2,2,2,2,2,2,2,2,2,2
1,2,1,1,1,1,1,1,1,1,1,1
2,3,1,1,1,1,1,1,1,1,1,1


In [23]:
len(resource_type_ohe_grp)

18552

##### Event Type One Hot Encoding

In [25]:
event_type_ohe = pd.get_dummies(event_type[['id','event_type']])
event_type_ohe.head(3)

Unnamed: 0,id,event_type_event_type 1,event_type_event_type 10,event_type_event_type 11,event_type_event_type 12,event_type_event_type 13,event_type_event_type 14,event_type_event_type 15,event_type_event_type 17,event_type_event_type 18,event_type_event_type 19,event_type_event_type 2,event_type_event_type 20,event_type_event_type 21,event_type_event_type 22,event_type_event_type 23,event_type_event_type 24,event_type_event_type 25,event_type_event_type 26,event_type_event_type 27,event_type_event_type 28,event_type_event_type 29,event_type_event_type 3,event_type_event_type 30,event_type_event_type 31,event_type_event_type 32,event_type_event_type 33,event_type_event_type 34,event_type_event_type 35,event_type_event_type 36,event_type_event_type 37,event_type_event_type 38,event_type_event_type 39,event_type_event_type 4,event_type_event_type 40,event_type_event_type 41,event_type_event_type 42,event_type_event_type 43,event_type_event_type 44,event_type_event_type 45,event_type_event_type 46,event_type_event_type 47,event_type_event_type 48,event_type_event_type 49,event_type_event_type 5,event_type_event_type 50,event_type_event_type 51,event_type_event_type 52,event_type_event_type 53,event_type_event_type 54,event_type_event_type 6,event_type_event_type 7,event_type_event_type 8,event_type_event_type 9
0,6597,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8011,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2597,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
len(event_type_ohe)

31170

In [27]:
event_type_ohe_grp = event_type_ohe.groupby('id', as_index=False).count()
event_type_ohe_grp.head(3)

Unnamed: 0,id,event_type_event_type 1,event_type_event_type 10,event_type_event_type 11,event_type_event_type 12,event_type_event_type 13,event_type_event_type 14,event_type_event_type 15,event_type_event_type 17,event_type_event_type 18,event_type_event_type 19,event_type_event_type 2,event_type_event_type 20,event_type_event_type 21,event_type_event_type 22,event_type_event_type 23,event_type_event_type 24,event_type_event_type 25,event_type_event_type 26,event_type_event_type 27,event_type_event_type 28,event_type_event_type 29,event_type_event_type 3,event_type_event_type 30,event_type_event_type 31,event_type_event_type 32,event_type_event_type 33,event_type_event_type 34,event_type_event_type 35,event_type_event_type 36,event_type_event_type 37,event_type_event_type 38,event_type_event_type 39,event_type_event_type 4,event_type_event_type 40,event_type_event_type 41,event_type_event_type 42,event_type_event_type 43,event_type_event_type 44,event_type_event_type 45,event_type_event_type 46,event_type_event_type 47,event_type_event_type 48,event_type_event_type 49,event_type_event_type 5,event_type_event_type 50,event_type_event_type 51,event_type_event_type 52,event_type_event_type 53,event_type_event_type 54,event_type_event_type 6,event_type_event_type 7,event_type_event_type 8,event_type_event_type 9
0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [28]:
len(event_type_ohe_grp)

18552

##### Log Feature One Hot Encoding

In [234]:
# log_feature is a bit different than the other datasets
# it essentially has a count of how many times an id has been labeled with
# a particular log feature
# therefore, the volume associated with the id had to be expanded into multiple rows
log_feature.head(3)

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1


In [31]:
#this would be my prefered method, but my computer kept running out of memory
#log_feature_raw = pd.DataFrame(np.repeat(log_feature.values, log_feature['volume'].values, axis=0), columns=['id','log_feature','volume'])
#log_feature_raw = log_feature_raw.drop('volume',axis=1)
#log_feature_raw.head(3)

In [32]:
# instead, I borrowed this implementation from StackOverflow
log_feature_raw = pd.DataFrame([log_feature.ix[idx] 
                       for idx in log_feature.index 
                       for _ in range(log_feature.ix[idx]['volume'])]).reset_index(drop=True)

In [33]:
len(log_feature_raw)

568246

In [34]:
log_feature_ohe = pd.get_dummies(log_feature_raw[['id','log_feature']])

In [35]:
log_feature_ohe_grp = log_feature_ohe.groupby('id', as_index=False).sum()

In [46]:
len(log_feature_ohe_grp)

18552

## Combining Training and Test Sets to Create a Master DF w/ All Features

In [36]:
full_df = pd.concat([train,test], axis=0, ignore_index=True)
full_df = full_df[['fault_severity','id','location']]
len(full_df), full_df.head(3)

(18552,    fault_severity     id      location
 0             1.0  14121  location 118
 1             0.0   9320   location 91
 2             1.0  14394  location 152)

In [37]:
full_df = full_df.merge(resource_type_ohe_grp,on='id').merge(event_type_ohe_grp,on='id').merge(log_feature_ohe_grp,on='id')
full_df.location = full_df.location.str.strip('location')
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 18551
Columns: 452 entries, fault_severity to log_feature_feature 99
dtypes: float64(387), int64(64), object(1)
memory usage: 64.1+ MB


In [38]:
full_df.head(3)

Unnamed: 0,fault_severity,id,location,resource_type_resource_type 1,resource_type_resource_type 10,resource_type_resource_type 2,resource_type_resource_type 3,resource_type_resource_type 4,resource_type_resource_type 5,resource_type_resource_type 6,resource_type_resource_type 7,resource_type_resource_type 8,resource_type_resource_type 9,event_type_event_type 1,event_type_event_type 10,event_type_event_type 11,event_type_event_type 12,event_type_event_type 13,event_type_event_type 14,event_type_event_type 15,event_type_event_type 17,event_type_event_type 18,event_type_event_type 19,event_type_event_type 2,event_type_event_type 20,event_type_event_type 21,event_type_event_type 22,event_type_event_type 23,event_type_event_type 24,event_type_event_type 25,event_type_event_type 26,event_type_event_type 27,event_type_event_type 28,event_type_event_type 29,event_type_event_type 3,event_type_event_type 30,event_type_event_type 31,event_type_event_type 32,event_type_event_type 33,event_type_event_type 34,event_type_event_type 35,event_type_event_type 36,event_type_event_type 37,event_type_event_type 38,event_type_event_type 39,event_type_event_type 4,event_type_event_type 40,event_type_event_type 41,event_type_event_type 42,event_type_event_type 43,event_type_event_type 44,event_type_event_type 45,event_type_event_type 46,event_type_event_type 47,event_type_event_type 48,event_type_event_type 49,event_type_event_type 5,event_type_event_type 50,event_type_event_type 51,event_type_event_type 52,event_type_event_type 53,event_type_event_type 54,event_type_event_type 6,event_type_event_type 7,event_type_event_type 8,event_type_event_type 9,log_feature_feature 1,log_feature_feature 10,log_feature_feature 100,log_feature_feature 101,log_feature_feature 102,log_feature_feature 103,log_feature_feature 104,log_feature_feature 105,log_feature_feature 106,log_feature_feature 107,log_feature_feature 108,log_feature_feature 109,log_feature_feature 11,log_feature_feature 110,log_feature_feature 111,log_feature_feature 112,log_feature_feature 113,log_feature_feature 114,log_feature_feature 115,log_feature_feature 116,log_feature_feature 117,log_feature_feature 118,log_feature_feature 119,log_feature_feature 12,log_feature_feature 120,log_feature_feature 121,log_feature_feature 122,log_feature_feature 123,log_feature_feature 124,log_feature_feature 125,log_feature_feature 126,log_feature_feature 127,log_feature_feature 128,log_feature_feature 129,log_feature_feature 13,log_feature_feature 130,log_feature_feature 131,log_feature_feature 132,log_feature_feature 133,log_feature_feature 134,log_feature_feature 135,log_feature_feature 136,log_feature_feature 137,log_feature_feature 138,log_feature_feature 139,log_feature_feature 14,log_feature_feature 140,log_feature_feature 141,log_feature_feature 142,log_feature_feature 143,log_feature_feature 144,log_feature_feature 145,log_feature_feature 146,log_feature_feature 147,log_feature_feature 148,log_feature_feature 149,log_feature_feature 15,log_feature_feature 150,log_feature_feature 151,log_feature_feature 152,log_feature_feature 153,log_feature_feature 154,log_feature_feature 155,log_feature_feature 156,log_feature_feature 157,log_feature_feature 158,log_feature_feature 159,log_feature_feature 16,log_feature_feature 160,log_feature_feature 161,log_feature_feature 162,log_feature_feature 163,log_feature_feature 164,log_feature_feature 165,log_feature_feature 166,log_feature_feature 167,log_feature_feature 168,log_feature_feature 169,log_feature_feature 17,log_feature_feature 170,log_feature_feature 171,log_feature_feature 172,log_feature_feature 173,log_feature_feature 174,log_feature_feature 175,log_feature_feature 176,log_feature_feature 177,log_feature_feature 178,log_feature_feature 179,log_feature_feature 18,log_feature_feature 180,log_feature_feature 181,log_feature_feature 182,log_feature_feature 183,log_feature_feature 184,log_feature_feature 185,log_feature_feature 186,log_feature_feature 187,log_feature_feature 188,log_feature_feature 189,log_feature_feature 19,log_feature_feature 190,log_feature_feature 191,log_feature_feature 192,log_feature_feature 193,log_feature_feature 194,log_feature_feature 195,log_feature_feature 196,log_feature_feature 197,log_feature_feature 198,log_feature_feature 199,log_feature_feature 2,log_feature_feature 20,log_feature_feature 200,log_feature_feature 201,log_feature_feature 202,log_feature_feature 203,log_feature_feature 204,log_feature_feature 205,log_feature_feature 206,log_feature_feature 207,log_feature_feature 208,log_feature_feature 209,log_feature_feature 21,log_feature_feature 210,log_feature_feature 211,log_feature_feature 212,log_feature_feature 213,log_feature_feature 214,log_feature_feature 215,log_feature_feature 216,log_feature_feature 217,log_feature_feature 218,log_feature_feature 219,...,log_feature_feature 267,log_feature_feature 268,log_feature_feature 269,log_feature_feature 27,log_feature_feature 270,log_feature_feature 271,log_feature_feature 272,log_feature_feature 273,log_feature_feature 274,log_feature_feature 275,log_feature_feature 276,log_feature_feature 277,log_feature_feature 278,log_feature_feature 279,log_feature_feature 28,log_feature_feature 280,log_feature_feature 281,log_feature_feature 282,log_feature_feature 283,log_feature_feature 284,log_feature_feature 285,log_feature_feature 286,log_feature_feature 287,log_feature_feature 288,log_feature_feature 289,log_feature_feature 29,log_feature_feature 290,log_feature_feature 291,log_feature_feature 292,log_feature_feature 293,log_feature_feature 294,log_feature_feature 295,log_feature_feature 296,log_feature_feature 297,log_feature_feature 298,log_feature_feature 299,log_feature_feature 3,log_feature_feature 30,log_feature_feature 300,log_feature_feature 301,log_feature_feature 302,log_feature_feature 303,log_feature_feature 304,log_feature_feature 305,log_feature_feature 306,log_feature_feature 307,log_feature_feature 308,log_feature_feature 309,log_feature_feature 31,log_feature_feature 310,log_feature_feature 311,log_feature_feature 312,log_feature_feature 313,log_feature_feature 314,log_feature_feature 315,log_feature_feature 316,log_feature_feature 317,log_feature_feature 318,log_feature_feature 319,log_feature_feature 32,log_feature_feature 320,log_feature_feature 321,log_feature_feature 322,log_feature_feature 323,log_feature_feature 324,log_feature_feature 325,log_feature_feature 326,log_feature_feature 327,log_feature_feature 328,log_feature_feature 329,log_feature_feature 33,log_feature_feature 330,log_feature_feature 331,log_feature_feature 332,log_feature_feature 333,log_feature_feature 334,log_feature_feature 335,log_feature_feature 336,log_feature_feature 337,log_feature_feature 338,log_feature_feature 339,log_feature_feature 34,log_feature_feature 340,log_feature_feature 341,log_feature_feature 342,log_feature_feature 343,log_feature_feature 344,log_feature_feature 345,log_feature_feature 346,log_feature_feature 347,log_feature_feature 348,log_feature_feature 349,log_feature_feature 35,log_feature_feature 350,log_feature_feature 351,log_feature_feature 352,log_feature_feature 353,log_feature_feature 354,log_feature_feature 355,log_feature_feature 356,log_feature_feature 357,log_feature_feature 358,log_feature_feature 359,log_feature_feature 36,log_feature_feature 360,log_feature_feature 361,log_feature_feature 362,log_feature_feature 363,log_feature_feature 364,log_feature_feature 365,log_feature_feature 366,log_feature_feature 367,log_feature_feature 368,log_feature_feature 369,log_feature_feature 37,log_feature_feature 370,log_feature_feature 371,log_feature_feature 372,log_feature_feature 373,log_feature_feature 374,log_feature_feature 375,log_feature_feature 376,log_feature_feature 377,log_feature_feature 378,log_feature_feature 379,log_feature_feature 38,log_feature_feature 380,log_feature_feature 381,log_feature_feature 382,log_feature_feature 383,log_feature_feature 384,log_feature_feature 385,log_feature_feature 386,log_feature_feature 39,log_feature_feature 4,log_feature_feature 40,log_feature_feature 41,log_feature_feature 42,log_feature_feature 43,log_feature_feature 44,log_feature_feature 45,log_feature_feature 46,log_feature_feature 47,log_feature_feature 48,log_feature_feature 49,log_feature_feature 5,log_feature_feature 50,log_feature_feature 51,log_feature_feature 52,log_feature_feature 53,log_feature_feature 54,log_feature_feature 55,log_feature_feature 56,log_feature_feature 57,log_feature_feature 58,log_feature_feature 59,log_feature_feature 6,log_feature_feature 60,log_feature_feature 61,log_feature_feature 62,log_feature_feature 63,log_feature_feature 64,log_feature_feature 65,log_feature_feature 66,log_feature_feature 67,log_feature_feature 68,log_feature_feature 69,log_feature_feature 7,log_feature_feature 70,log_feature_feature 71,log_feature_feature 72,log_feature_feature 73,log_feature_feature 74,log_feature_feature 75,log_feature_feature 76,log_feature_feature 77,log_feature_feature 78,log_feature_feature 79,log_feature_feature 8,log_feature_feature 80,log_feature_feature 81,log_feature_feature 82,log_feature_feature 83,log_feature_feature 84,log_feature_feature 85,log_feature_feature 86,log_feature_feature 87,log_feature_feature 88,log_feature_feature 89,log_feature_feature 9,log_feature_feature 90,log_feature_feature 91,log_feature_feature 92,log_feature_feature 93,log_feature_feature 94,log_feature_feature 95,log_feature_feature 96,log_feature_feature 97,log_feature_feature 98,log_feature_feature 99
0,1.0,14121,118,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,9320,91,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,14394,152,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Split back into Test and Train Sets

In [40]:
train_full = full_df[full_df.id.isin(train_ids)]
test_full = full_df[full_df.id.isin(test_ids)]
len(train_full),len(test_full)

(7381, 11171)

In [46]:
X_train = train_full.iloc[:, 2:].values #training set, all data, no labels
y_train = train_full.iloc[:, 0].values #training set, no data, only labels
test_class = test_full.iloc[:, 2:].values #test set, all data values, no labels
submission = test_full.iloc[:,1:3] #id, location

## Create Test and Train sets from Training Data for Cross Validation

In [115]:
from sklearn.model_selection import train_test_split

X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

## Random Forest Classifier Predictions

In [168]:
# the first model that I selected to experiment with is the RandomForestClassifer
# it is a flexible model that can be effective for high dimensional datasets and 
# non-linear problems.  
# additionally, the RandomForestClassifer can typically perform well without much tuning
# the trade-off is interpretability.

from sklearn.ensemble import RandomForestClassifier

# 50 trees, default parameters
forest_50 = RandomForestClassifier(criterion='gini',
                               n_estimators=50,
                               random_state=1,
                               n_jobs=2)

forest_50.fit(X_train_cv,y_train_cv)
y_predict = forest_50.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 578


In [190]:
# 100 trees, default parameters
forest_100 = RandomForestClassifier(criterion='gini',
                               n_estimators=100,
                               random_state=1,
                               n_jobs=2)

forest_100.fit(X_train_cv,y_train_cv)
y_predict = forest_100.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 570


In [170]:
# 150 trees, default parameters
forest_150 = RandomForestClassifier(criterion='gini',
                               n_estimators=150,
                               random_state=1,
                               n_jobs=2)

forest_150.fit(X_train_cv,y_train_cv)
y_predict = forest_150.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 579


In [171]:
# 200 trees, default parameters
forest_200 = RandomForestClassifier(criterion='gini',
                               n_estimators=200,
                               random_state=1,
                               n_jobs=2)

forest_200.fit(X_train_cv,y_train_cv)
y_predict = forest_200.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 573


In [172]:
# 250 trees, default parameters
# looks 250 does provide additional accuracy in comparison to 200
# let's keep the model at 200 and adjust the max_features parameter
forest_250 = RandomForestClassifier(criterion='gini',
                               n_estimators=250,
                               random_state=1,
                               n_jobs=2)

forest_250.fit(X_train_cv,y_train_cv)
y_predict = forest_250.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 573


In [175]:
# 200 trees, 30 max_features
# fyi - max_features is sqrt(total number of features)
# which equates to ~21 for this data set
forest_200 = RandomForestClassifier(criterion='gini',
                                    n_estimators=200,
                                    max_features= 30,
                                    random_state=1,
                                    n_jobs=2)

forest_200.fit(X_train_cv,y_train_cv)
y_predict = forest_200.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 567


In [176]:
# 200 trees, 40 max_features
# fyi - max_features is sqrt(total number of features)
# which equates to ~21 for this data set
forest_200 = RandomForestClassifier(criterion='gini',
                                    n_estimators=200,
                                    max_features= 40,
                                    random_state=1,
                                    n_jobs=2)

forest_200.fit(X_train_cv,y_train_cv)
y_predict = forest_200.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 566


In [178]:
# 200 trees, 50 max_features
# fyi - max_features is sqrt(total number of features)
# which equates to ~21 for this data set
forest_200 = RandomForestClassifier(criterion='gini',
                                    n_estimators=200,
                                    max_features= 50,
                                    random_state=1,
                                    n_jobs=2)

forest_200.fit(X_train_cv,y_train_cv)
y_predict = forest_200.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 571


In [191]:
# 200 trees, 10 max_features
# fyi - max_features is sqrt(total number of features)
# which equates to ~21 for this data set
forest_200 = RandomForestClassifier(criterion='gini',
                                    n_estimators=200,
                                    max_features= 15,
                                    random_state=1,
                                    n_jobs=2)

forest_200.fit(X_train_cv,y_train_cv)
y_predict = forest_200.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 577


##### RandomForestClassifier Takeaways

Increasing the number of trees up to 200 seems to be impactful, however after this point there appear to be diminishing returns.

At 200 trees with the default parameters, the classifer is at max using ~21 features from the training dataset.  Lowering the amount of features had a negative effect on performance.  Slightly increasing the amount of features above the default had a positive impact.  The risk by continuing to increase the max number of features is potential overfitting.

##### RandomForestClassifier Submission for Kaggle

In [200]:
# final RF model
forest = RandomForestClassifier(criterion='gini',
                                    n_estimators=200,
                                    max_features= 30,
                                    random_state=1,
                                    n_jobs=2)

forest.fit(X_train,y_train)
submission['predict_0'] = forest.predict_proba(test_class)[:,0]
submission['predict_1'] = forest.predict_proba(test_class)[:,1]
submission['predict_2'] = forest.predict_proba(test_class)[:,2]

In [201]:
submission.head()

Unnamed: 0,id,location,predict_0,predict_1,predict_2
7381,11066,481,1.0,0.0,0.0
7382,18000,962,0.075,0.065,0.86
7383,16964,491,1.0,0.0,0.0
7384,4795,532,0.745,0.25,0.005
7385,3392,600,0.635,0.345,0.02


In [65]:
len(submission)

11171

In [202]:
submission.drop('location', axis=1).to_csv('submission6.csv',index=False)

## AdaBoost Prediction

In [237]:
# AdaBoost is an ensemble method with the base implementation using DecisionTreeClassifier.
# The classification method focused on 'weak learners', where the base classifiers
# have only a slightly better performance than random guessing.
# after each round of boosting the misclassified samples receive a larger focus (and weight) 
# during the next round as to try to place a larger emphasis on correctly classifying them.
# AdaBoost is new to me but I thought it would be fun to try as it does not require substantial 
# parameter tuning.

In [206]:
from sklearn.ensemble import AdaBoostClassifier

In [236]:
# there is typically a trade-off between number of estimators and learning rate
# a lower learning rate has the effect of making less corrections for each tree
# added to the model, thereby typically needing more trees to correct for errors
# let's adjust a few parameters and see if it can outperform the random forest classifier

# 200 estimators, learning rate .10
ada = AdaBoostClassifier(n_estimators=200, learning_rate=.10, random_state=1)
ada.fit(X_train_cv,y_train_cv)
y_predict = ada.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 616


In [219]:
# 400 estimators, learning rate .10
ada = AdaBoostClassifier(n_estimators=400, learning_rate=.10, random_state=1)
ada.fit(X_train_cv,y_train_cv)
y_predict = ada.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 596


In [220]:
# 400 estimators, learning rate .15
ada = AdaBoostClassifier(n_estimators=400, learning_rate=.15, random_state=1)
ada.fit(X_train_cv,y_train_cv)
y_predict = ada.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 578


In [221]:
# 400 estimators, learning rate .30
ada = AdaBoostClassifier(n_estimators=400, learning_rate=.3, random_state=1)
ada.fit(X_train_cv,y_train_cv)
y_predict = ada.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 559


In [223]:
# 400 estimators, learning rate .30
ada = AdaBoostClassifier(n_estimators=600, learning_rate=.3, random_state=1)
ada.fit(X_train_cv,y_train_cv)
y_predict = ada.predict(X_test_cv)
print('Misclassification Samples: %d' % (sum(y_test_cv != y_predict)) )

Misclassification Samples: 558


##### AdaBoost Takeaways

The AdaBoost model seemed to respond well to increasing the number of trees up to 400 and then increasing the learning up to .30.

##### AdaBoost Classifier Submission for Kaggle

In [225]:
ada_submission = test_full.iloc[:,1:3] #id, location

In [226]:
# final adaboost model
ada = AdaBoostClassifier(n_estimators=400, learning_rate=.1, random_state=1)
ada.fit(X_train,y_train)

ada_submission['predict_0'] = ada.predict_proba(test_class)[:,0]
ada_submission['predict_1'] = ada.predict_proba(test_class)[:,1]
ada_submission['predict_2'] = ada.predict_proba(test_class)[:,2]

In [230]:
# interestingly it looks like the ada model is hesitant to predict too heavily towards any single class
# this is definitely something that I will look into further.
ada_submission.head(5)

Unnamed: 0,id,location,predict_0,predict_1,predict_2
7381,11066,481,0.3494,0.344181,0.306418
7382,18000,962,0.333382,0.330769,0.335849
7383,16964,491,0.349076,0.344734,0.30619
7384,4795,532,0.337433,0.33628,0.326287
7385,3392,600,0.334323,0.336469,0.329209


In [228]:
ada_submission.drop('location', axis=1).to_csv('submission7.csv',index=False)