In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
data_folder = 'att_hackathon/datasets'

In [2]:
#_file = 'ML_DATASET_Hackathon_Supervised.csv'
_file = 'version_2.csv'

In [3]:
def load_file(path):
  df = pd.read_csv(path)
  return df

def create_anomaly_group(df, cutoff):
  df.loc[data['percentile'] > cutoff,'Team'] = 'ANOMALIES'
  return df

def get_non_stratifiable(df, count):
  counts = pd.DataFrame(df.Team.value_counts()).reset_index()
  non_stratifiable = list(counts[counts['Team'] < count]['index'])
  data_to_strat = df[~df.Team.isin(non_stratifiable)]
  extra = df[df.Team.isin(non_stratifiable)]
  return extra, data_to_strat

def merge_non_stratified_with_train(x_train, y_train, extra):
    x_train = pd.concat([x_train,extra[['Problem_Abstract']]], axis=0)
    y_train = pd.concat([y_train,extra[['Team']]], axis=0)
    return x_train, y_train

In [5]:
data = load_file(os.path.join(data_folder, _file))
data = data.fillna('')
data = create_anomaly_group(data, 0.8)
#extra, data_to_strat = get_non_stratifiable(data, 2)
x_train, x_test, y_train, y_test = train_test_split(data[['Problem_Abstract']], 
                                                    data[['Team']], 
                                                    test_size=0.2, 
                                                    stratify=data[['Team']])
#x_train, y_train = merge_non_stratified_with_train(x_train, y_train, extra)

In [6]:
data.Team.value_counts()

ANOMALIES                                       968
SMARTS/GFP_CPE                                  684
GTAC                                            601
Unified Desktop (UD)                            524
GTAC - Account issues, password change          425
GTAC - Other                                    383
CISCO ISE - Network Device Update/Add/Delete    278
Global Delivery Data & Analytics                231
Cisco ISE                                       155
VitalNet Problems or Errors                     134
Cisco ISE - Other                                79
GTAC - Audit or GTAC report data                 77
TRUE (Ticket Rules Update Engine)                72
NagiosXI                                         66
Express Ticketing                                64
Name: Team, dtype: int64

In [None]:
import numpy as np
emb_path = 'att_hackathon/data/embeddings_mpnet.npy'
embedding = np.load(emb_path)

x_train = embedding[y_train.index]
x_test = embedding[y_test.index]

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
from catboost import CatBoostClassifier
#import xgboost as xgb
#classifier = xgb.XGBClassifier(n_jobs=8, n_estimators=100, verbosity=3)
classifier = CatBoostClassifier(task_type='GPU', iterations=500)
classifier.fit(x_train, y_train)

In [None]:
y_predicted = classifier.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_predicted, y_test))

In [7]:
### EXPERIMENTING WITH TFIDF
vect = TfidfVectorizer(strip_accents='unicode',
                      stop_words='english',
                      ngram_range=(1,5))
vect.fit(x_train.fillna('').Problem_Abstract)
x_train = vect.transform(x_train.fillna('').Problem_Abstract)
x_test = vect.transform(x_test.fillna('').Problem_Abstract)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [9]:
y_train

array([ 6, 11,  6, ...,  6,  9, 11])

In [31]:
x_train.shape[0] + x_test.shape[0]

4741

In [11]:
from catboost import CatBoostClassifier
#import xgboost as xgb
#classifier = xgb.XGBClassifier(n_jobs=8, n_estimators=100, verbosity=3)
classifier = CatBoostClassifier(task_type='GPU', iterations=1000)
classifier.fit(x_train, y_train)

Learning rate set to 0.085187
0:	learn: 2.5107151	total: 225ms	remaining: 3m 44s
1:	learn: 2.3791692	total: 326ms	remaining: 2m 42s
2:	learn: 2.2816991	total: 422ms	remaining: 2m 20s
3:	learn: 2.2054026	total: 511ms	remaining: 2m 7s
4:	learn: 2.1408646	total: 609ms	remaining: 2m 1s
5:	learn: 2.0896905	total: 716ms	remaining: 1m 58s
6:	learn: 2.0431365	total: 794ms	remaining: 1m 52s
7:	learn: 1.9999342	total: 890ms	remaining: 1m 50s
8:	learn: 1.9637380	total: 972ms	remaining: 1m 46s
9:	learn: 1.9296656	total: 1.06s	remaining: 1m 45s
10:	learn: 1.9004365	total: 1.15s	remaining: 1m 43s
11:	learn: 1.8766982	total: 1.23s	remaining: 1m 41s
12:	learn: 1.8541737	total: 1.32s	remaining: 1m 40s
13:	learn: 1.8352655	total: 1.39s	remaining: 1m 37s
14:	learn: 1.8185605	total: 1.46s	remaining: 1m 35s
15:	learn: 1.7983230	total: 1.56s	remaining: 1m 36s
16:	learn: 1.7822716	total: 1.64s	remaining: 1m 35s
17:	learn: 1.7688666	total: 1.73s	remaining: 1m 34s
18:	learn: 1.7547517	total: 1.83s	remaining: 1

<catboost.core.CatBoostClassifier at 0x7f8a4e321e50>

In [13]:
y_predicted = classifier.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_predicted, y_test))

              precision    recall  f1-score   support

           0       0.81      0.52      0.63       305
           1       0.82      0.82      0.82        56
           2       0.58      0.75      0.65        24
           3       0.25      0.80      0.38         5
           4       0.31      1.00      0.47         4
           5       0.73      0.56      0.64       157
           6       0.41      0.61      0.49        57
           7       0.40      1.00      0.57         6
           8       0.45      0.66      0.54        53
           9       0.70      0.94      0.80        34
          10       0.62      0.89      0.73         9
          11       0.69      0.78      0.73       120
          12       0.50      1.00      0.67         7
          13       0.78      0.81      0.80       101
          14       0.41      1.00      0.58        11

    accuracy                           0.66       949
   macro avg       0.56      0.81      0.63       949
weighted avg       0.71   