In [1]:
import numpy as np
import pandas as pd
# Load the full feature table; later cells split it into train/dev/test and
# read its 'Class_Label' column as the prediction target.
df = pd.read_csv('bag_of_words.csv')

In [31]:
# Inspect the last 1000 column labels: n-gram features come first, and the tail
# of the frame holds the metadata columns followed by the 'Class_Label' target.
df.columns[-1000:]

Index(['draft revise special', 'preparation of motion', 'inc. re overdue',
       'correspondence to developers', 'notice of hearing',
       'preparation of draft', 'cease all work', 'counsel regarding upcoming',
       'responses to request', 'response to plaintiffs',
       ...
       'Line Item Rate', 'Timekeeper Level', 'UTBMS Activity Code',
       'UTBMS Phase Code', 'UTBMS Task Code', 'Work Area', 'Work Area Level 2',
       'Is_Prior_Approved', 'Is_Action', 'Class_Label'],
      dtype='object', length=1000)

In [5]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `sklearn.model_selection` exposes the same `train_test_split`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split, then 20% of the remainder as the
# dev split. The fixed random_state keeps both splits reproducible.
training_set_temp, test_set_temp = train_test_split(df, test_size=0.2, random_state=1)
training_set_temp, dev_set_temp = train_test_split(training_set_temp, test_size=0.2, random_state=1)



In [6]:
training_set_temp.shape, dev_set_temp.shape, test_set_temp.shape

((30192, 3556), (7548, 3556), (9436, 3556))

In [7]:
def fill_feature_using_training(train_set, other_set):
    """Return ``other_set`` aligned to the column layout of ``train_set``.

    * Columns that exist in ``train_set`` but not in ``other_set`` are added
      and filled with 0.
    * Columns that exist only in ``other_set`` are dropped.
    * The resulting columns follow the order of ``train_set.columns``.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Frame whose columns define the target layout.
    other_set : pandas.DataFrame
        Frame to align (for example a dev or test split).

    Returns
    -------
    pandas.DataFrame
        A copy of ``other_set`` with exactly the columns of ``train_set``.
    """
    # reindex adds the missing columns, drops the extra ones and reorders in a
    # single step; fill_value only applies to the newly created columns.
    return other_set.reindex(columns=train_set.columns, fill_value=0)

In [12]:
# Categorical metadata columns that are one-hot encoded in every split.
CATEGORICAL_COLUMNS = ["Timekeeper Level", "UTBMS Activity Code", "Work Area", "Work Area Level 2",
                       "UTBMS Phase Code", "UTBMS Task Code"]


def _encode_split(split_df):
    """One-hot encode the categorical columns and binarise ``Class_Label``.

    Returns a new frame in which ``Class_Label`` is 1 where the original value
    equals True and 0 otherwise. Encoding first means the label assignment
    lands on the fresh frame produced by ``get_dummies`` rather than on the
    slice handed back by ``train_test_split``.
    """
    encoded = pd.get_dummies(split_df, columns=CATEGORICAL_COLUMNS)
    encoded['Class_Label'] = np.where(encoded['Class_Label'] == True, 1, 0)
    return encoded


training_set_temp = _encode_split(training_set_temp)

# Each split is encoded independently, so dev/test can end up with a different
# set of dummy columns; align both to the training layout.
dev_set_temp = fill_feature_using_training(training_set_temp, _encode_split(dev_set_temp))
test_set_temp = fill_feature_using_training(training_set_temp, _encode_split(test_set_temp))

print(len(training_set_temp.columns), len(dev_set_temp.columns), len(test_set_temp.columns))

## Only for Bag of words features
# Every column that precedes 'Line Item Rate' is cast to uint8.
bow_features = []
for feature in training_set_temp.columns:
    if feature == 'Line Item Rate':
        break
    bow_features.append(feature)

# One astype call per frame instead of one column assignment per feature.
bow_dtypes = dict.fromkeys(bow_features, np.uint8)
training_set_temp = training_set_temp.astype(bow_dtypes)
dev_set_temp = dev_set_temp.astype(bow_dtypes)
test_set_temp = test_set_temp.astype(bow_dtypes)

3630 3630 3630


NameError: name 'code' is not defined

In [13]:
# Write each processed split to its own CSV file.
# quoting=1 is csv.QUOTE_ALL; index=None presumably suppresses the row index
# the same way index=False does — verify against the installed pandas.
for split_frame, csv_name in (
    (training_set_temp, 'training_final_V1.csv'),
    (dev_set_temp, 'dev_final_V1.csv'),
    (test_set_temp, 'test_final_V1.csv'),
):
    split_frame.to_csv(csv_name, quoting=1, index=None)

### Training classifier

In [14]:


import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
# `sklearn.cross_validation` was removed in scikit-learn 0.20; `train_test_split`
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc, roc_curve


In [15]:

def train_using_gini(X_train, y_train, max_depth_temp, min_sample_leaf_temp):
    """Fit a decision tree that splits on Gini impurity and return it.

    ``max_depth_temp`` and ``min_sample_leaf_temp`` are forwarded as the
    tree's ``max_depth`` and ``min_samples_leaf``; ``random_state`` is fixed
    at 100.
    """
    tree_settings = {
        "criterion": "gini",
        "random_state": 100,
        "max_depth": max_depth_temp,
        "min_samples_leaf": min_sample_leaf_temp,
    }
    gini_tree = DecisionTreeClassifier(**tree_settings)
    gini_tree.fit(X_train, y_train)
    return gini_tree


In [16]:
# Train a decision tree whose splits are chosen by entropy.
def train_using_entropy(X_train, y_train, max_depth_temp, min_sample_leaf_temp):
    """Fit an entropy-criterion decision tree on the given data and return it.

    ``max_depth_temp`` and ``min_sample_leaf_temp`` become the tree's
    ``max_depth`` and ``min_samples_leaf``; ``random_state`` is fixed at 100.
    """
    # fit() returns the estimator itself, so construction and training chain.
    return DecisionTreeClassifier(
        criterion="entropy",
        random_state=100,
        max_depth=max_depth_temp,
        min_samples_leaf=min_sample_leaf_temp,
    ).fit(X_train, y_train)


In [17]:

# Run a fitted classifier on a feature matrix and echo what it predicts.
def prediction(X_test, clf_object):
    """Return ``clf_object.predict(X_test)`` after printing the predictions."""
    predicted = clf_object.predict(X_test)
    # Header and values go on separate lines.
    print("Predicted values:", predicted, sep="\n")
    return predicted
      


In [18]:
# Function to report classification metrics
def cal_accuracy(y_test, y_pred, y_score=None):
    """Print confusion matrix, accuracy (in percent), classification report and ROC AUC.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted hard labels, used for the confusion matrix, accuracy and report.
    y_score : array-like, optional
        Continuous scores for the positive class (label 1), e.g. predicted
        probabilities. When given, the ROC curve is built from these scores.
        When omitted, ``y_pred`` is used for the ROC curve; with hard 0/1
        labels that curve has a single operating point, so the printed value
        equals (TPR + TNR) / 2 rather than a ranking-based AUC.

    Returns
    -------
    None
        Everything is printed.
    """
    print("Confusion Matrix: ",
          confusion_matrix(y_test, y_pred))

    print("Accuracy : ",
          accuracy_score(y_test, y_pred) * 100)

    print("Report : ",
          classification_report(y_test, y_pred))

    roc_input = y_pred if y_score is None else y_score
    fpr, tpr, _ = roc_curve(y_test, roc_input, pos_label=1)
    print('AUC curve: ', auc(fpr, tpr))


In [19]:

#dev_set_temp[dev_set_temp['Miles']=='.']

# Every column except the target is used as a model feature.
columns = [name for name in training_set_temp.columns if name != 'Class_Label']

# Plain arrays per split: X_* holds the feature columns, y_* the Class_Label target.
X_train, y_train = training_set_temp[columns].values, training_set_temp['Class_Label'].values
X_dev, y_dev = dev_set_temp[columns].values, dev_set_temp['Class_Label'].values
X_test, y_test = test_set_temp[columns].values, test_set_temp['Class_Label'].values


In [20]:

# Sweep the tree depth and report metrics on the dev split for each setting.
for depth in (2, 3, 4, 5, 10, 15, 20):
    print('Depth: ', depth)
    # min_samples_leaf is held at 2 for every depth.
    clf_gini = train_using_gini(X_train, y_train, max_depth_temp=depth, min_sample_leaf_temp=2)
    print("Results Using Gini Index:")
    # Predict on dev with the Gini tree, then print the metrics.
    y_pred_gini = prediction(X_dev, clf_gini)
    cal_accuracy(y_dev, y_pred_gini)
    # Two separator rules between consecutive depths.
    print('--------------------------------------------------------------------------------',
          '--------------------------------------------------------------------------------',
          sep='\n')


Depth:  2
Results Using Gini Index:
Predicted values:
[0 0 0 ... 0 0 0]
Confusion Matrix:  [[7060   11]
 [ 432   45]]
Accuracy :  94.13089560148383
Report :               precision    recall  f1-score   support

          0       0.94      1.00      0.97      7071
          1       0.80      0.09      0.17       477

avg / total       0.93      0.94      0.92      7548

AUC curve:  0.5463919864020728
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Depth:  3
Results Using Gini Index:
Predicted values:
[0 0 0 ... 0 0 0]
Confusion Matrix:  [[7067    4]
 [ 433   44]]
Accuracy :  94.21038685744568
Report :               precision    recall  f1-score   support

          0       0.94      1.00      0.97      7071
          1       0.92      0.09      0.17       477

avg / total       0.94      0.94      0.92      7548

AUC curve:  0.5458387478664294
------------------------------

In [21]:

# Same depth sweep as on the dev split, this time scored on the test split.
# NOTE(review): comparing depths on the test split makes it part of model
# selection — confirm this is only meant as a final report.
for depth in (2, 3, 4, 5, 10, 15, 20):
    print('Depth: ', depth)
    # min_samples_leaf is held at 2 for every depth.
    clf_gini = train_using_gini(X_train, y_train, max_depth_temp=depth, min_sample_leaf_temp=2)
    print("Results Using Gini Index:")
    # Predict on test with the Gini tree, then print the metrics.
    y_pred_gini = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred_gini)
    # Two separator rules between consecutive depths.
    print('--------------------------------------------------------------------------------',
          '--------------------------------------------------------------------------------',
          sep='\n')


Depth:  2
Results Using Gini Index:
Predicted values:
[0 0 0 ... 0 0 0]
Confusion Matrix:  [[8790   14]
 [ 568   64]]
Accuracy :  93.83213225943197
Report :               precision    recall  f1-score   support

          0       0.94      1.00      0.97      8804
          1       0.82      0.10      0.18       632

avg / total       0.93      0.94      0.92      9436

AUC curve:  0.5498378182529231
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Depth:  3
Results Using Gini Index:
Predicted values:
[0 0 0 ... 0 0 0]
Confusion Matrix:  [[8797    7]
 [ 570   62]]
Accuracy :  93.8851208139042
Report :               precision    recall  f1-score   support

          0       0.94      1.00      0.97      8804
          1       0.90      0.10      0.18       632

avg / total       0.94      0.94      0.92      9436

AUC curve:  0.5486530863416514
-------------------------------

### Training model using XGBoost

In [None]:
# Hyper-parameter grid for the search below:
# 4 subsample values (0.6-0.9) x 7 max_depth values (3-15, step 2)
# x 3 min_child_weight values (1, 3, 5).
param_test1 = {
 'subsample': [i/10.0 for i in range(6,10)],
 'max_depth':range(3,16,2),
 'min_child_weight':range(1,6,2),

}
# 5-fold cross-validated grid search scored by ROC AUC. The max_depth,
# min_child_weight and subsample values passed to the estimator are only
# starting values; the grid above overrides them during the search.
# NOTE(review): GridSearchCV, XGBClassifier, train, predictors and target are
# not defined in the cells visible here, and this cell has no execution count —
# confirm it is still meant to run.
# NOTE(review): `iid` and `grid_scores_` were removed in later scikit-learn
# releases (presumably `cv_results_` is the replacement) — verify against the
# installed version.
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [44]:
# Candidate values for the manual XGBoost sweep.
subsample = [tenths / 10.0 for tenths in range(6, 10)]  # 0.6, 0.7, 0.8, 0.9
max_depth = [*range(3, 16, 2)]                          # 3, 5, ..., 15
min_child_weight = [*range(1, 6, 2)]                    # 1, 3, 5

In [46]:
# Manual sweep over every (subsample, max_depth, min_child_weight) combination:
# train a booster for 50 rounds with dev as the evaluation set, then score the
# thresholded predictions on the test split.
for sample in subsample:
    for depth in max_depth:
        for child in min_child_weight:
            print('--------------------------------------------------------------------------------')
            print('--------------------------------------------------------------------------------')
            print('Sample: {}, Depth: {}, Child: {}'.format(sample, depth, child))
            xg_train = create_data_from_df(X_train, y_train)
            xg_dev   = create_data_from_df(X_dev , y_dev)
            param_new = {'subsample': sample, 'max_depth': depth, 'eta': 1, 'min_child_weight': child, 'objective': 'binary:logistic', 
                             'nthread' : 4, 'eval_metric' : ['auc']}
            model    = train_classifier(xg_train, xg_dev, num_round=50, param=param_new)

            xg_test   = create_data_from_df(X_test , y_test)
            print(len(y_test), len(X_test))
            y_pred = predict_test_data(model, xg_test)
            print(len(y_pred))
            # With the 'binary:logistic' objective the booster outputs the
            # probability of the positive class, so a score above the 0.8
            # cut-off must map to label 1. The previous mapping was inverted.
            # NOTE(review): assumes predict_test_data returns the booster's
            # raw predictions unchanged — confirm against its definition.
            y_pred = np.where(y_pred > 0.8, 1, 0)
            print(len(y_pred))

            cal_accuracy(y_test, y_pred)

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sample: 0.6, Depth: 3, Child: 1
[17:15:24] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[0]	eval-auc:0.555781	train-auc:0.55702
[17:15:26] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[1]	eval-auc:0.692218	train-auc:0.689709
[17:15:27] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[2]	eval-auc:0.710877	train-auc:0.729205
[17:15:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[3]	eval-auc:0.735814	train-auc:0.757614
[17:15:30] C:\Users\Administrator\Deskt

In [49]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline with default hyper-parameters and a fixed seed.
# fit() returns the estimator, so construction and training chain.
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# Hard-label predictions for the dev and test splits.
predictions_dev, predictions_test = rf.predict(X_dev), rf.predict(X_test)

# Report dev metrics first, then test metrics.
for true_labels, predicted_labels in ((y_dev, predictions_dev), (y_test, predictions_test)):
    cal_accuracy(true_labels, predicted_labels)

Confusion Matrix:  [[7013   58]
 [ 277  200]]
Accuracy :  95.56173820879704
Report :               precision    recall  f1-score   support

          0       0.96      0.99      0.98      7071
          1       0.78      0.42      0.54       477

avg / total       0.95      0.96      0.95      7548

AUC curve:  0.7055423472078799
Confusion Matrix:  [[8731   73]
 [ 358  274]]
Accuracy :  95.43238660449343
Report :               precision    recall  f1-score   support

          0       0.96      0.99      0.98      8804
          1       0.79      0.43      0.56       632

avg / total       0.95      0.95      0.95      9436

AUC curve:  0.7126263091000063
