In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
data_folder = '../datasets'

In [2]:
#_file = 'ML_DATASET_Hackathon_Supervised.csv'
_file = 'version_2.csv'

In [3]:
def load_file(path):
  df = pd.read_csv(path)
  return df

def create_anomaly_group(df, cutoff):
  df.loc[data['percentile'] > cutoff,'Team'] = 'ANOMALIES'
  return df

def get_non_stratifiable(df, count):
  counts = pd.DataFrame(df.Team.value_counts()).reset_index()
  non_stratifiable = list(counts[counts['Team'] < count]['index'])
  data_to_strat = df[~df.Team.isin(non_stratifiable)]
  extra = df[df.Team.isin(non_stratifiable)]
  return extra, data_to_strat

def merge_non_stratified_with_train(x_train, y_train, extra):
    x_train = pd.concat([x_train,extra[['Problem_Abstract']]], axis=0)
    y_train = pd.concat([y_train,extra[['Team']]], axis=0)
    return x_train, y_train

In [4]:
data = load_file(os.path.join(data_folder, _file))
data = data.fillna('')
data = create_anomaly_group(data, 0.8)
#extra, data_to_strat = get_non_stratifiable(data, 2)
x_train, x_test, y_train, y_test = train_test_split(data[['Problem_Abstract']], 
                                                    data[['Team']], 
                                                    test_size=0.2, 
                                                    stratify=data[['Team']])
#x_train, y_train = merge_non_stratified_with_train(x_train, y_train, extra)

In [5]:
data.Team.value_counts()

ANOMALIES                                       968
SMARTS/GFP_CPE                                  684
GTAC                                            601
Unified Desktop (UD)                            524
GTAC - Account issues, password change          425
GTAC - Other                                    383
CISCO ISE - Network Device Update/Add/Delete    278
Global Delivery Data & Analytics                231
Cisco ISE                                       155
VitalNet Problems or Errors                     134
Cisco ISE - Other                                79
GTAC - Audit or GTAC report data                 77
TRUE (Ticket Rules Update Engine)                72
NagiosXI                                         66
Express Ticketing                                64
Name: Team, dtype: int64

In [6]:
### EXPERIMENTING WITH TFIDF
vect = TfidfVectorizer(strip_accents='unicode',
                      stop_words='english',
                      ngram_range=(1,5))
vect.fit(x_train.fillna('').Problem_Abstract)
x_train = vect.transform(x_train.fillna('').Problem_Abstract)
x_test = vect.transform(x_test.fillna('').Problem_Abstract)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
from catboost import CatBoostClassifier
#import xgboost as xgb
#classifier = xgb.XGBClassifier(n_jobs=8, n_estimators=100, verbosity=3)
classifier = CatBoostClassifier(task_type='GPU', iterations=500)
classifier.fit(x_train, y_train)

Learning rate set to 0.151751
0:	learn: 2.3885462	total: 358ms	remaining: 2m 58s
1:	learn: 2.2178224	total: 619ms	remaining: 2m 34s
2:	learn: 2.0976619	total: 780ms	remaining: 2m 9s
3:	learn: 2.0144240	total: 1.02s	remaining: 2m 7s
4:	learn: 1.9430293	total: 1.15s	remaining: 1m 54s
5:	learn: 1.8977370	total: 1.25s	remaining: 1m 42s
6:	learn: 1.8530776	total: 1.35s	remaining: 1m 35s
7:	learn: 1.8169623	total: 1.43s	remaining: 1m 27s
8:	learn: 1.7832002	total: 1.52s	remaining: 1m 23s
9:	learn: 1.7562547	total: 1.66s	remaining: 1m 21s
10:	learn: 1.7349771	total: 1.86s	remaining: 1m 22s
11:	learn: 1.7105458	total: 2.09s	remaining: 1m 24s
12:	learn: 1.6938744	total: 2.25s	remaining: 1m 24s
13:	learn: 1.6758172	total: 2.39s	remaining: 1m 23s
14:	learn: 1.6609677	total: 2.5s	remaining: 1m 20s
15:	learn: 1.6451891	total: 2.62s	remaining: 1m 19s
16:	learn: 1.6284875	total: 2.75s	remaining: 1m 18s
17:	learn: 1.6184977	total: 2.91s	remaining: 1m 17s
18:	learn: 1.6093154	total: 3.02s	remaining: 1m

In [None]:
classifier.save_model('../models/class_model.cbm')

In [None]:
y_predicted = classifier.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_predicted, y_test))

In [None]:
#possible extension with calibratedclassifier
from sklearn.calibration import CalibratedClassifierCV
base_clf = CatBoostClassifier(iterations=500)
calibrated_clf = CalibratedClassifierCV(base_clf, cv=3)
calibrated_clf.fit(x_train, y_train)

y_predicted = classifier.predict(x_test)
y_predicted_proba = classifier.predict_proba(x_test)


In [None]:
#alternative solution using language model all-mpnet-base-v2 (not as effective)
import numpy as np
emb_path = '../data/embeddings_mpnet_v2.npy'
embedding = np.load(emb_path)

x_train = embedding[y_train.index]
x_test = embedding[y_test.index]

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)