In [1]:
import pandas as pd
import seaborn as sns
import string
import warnings
import matplotlib.pyplot as plt
import pickle
import numpy as np
import spacy 
from spacy.lang.en import English
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

#### 1. Read in data

In [2]:
df = pd.read_csv('/home/jupyter/sb-entity-classification/data/data.csv')
df.columns = ['class','name']

# classes_list = pd.read_csv('/home/jupyter/sb-entity-classification/data/classes.txt', header = None)
# classes_list['class'] = classes_list.index
# classes_list.columns = ['class_name', 'class']
# classes_list['class'] = classes_list['class'] + 1  # based on information provided in the brief

# df = df.merge(classes_list, on = 'class', how = 'left')

####  2. Train, Validation, Test split

In [3]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42, shuffle = True)

In [4]:
train.shape

(439029, 2)

In [5]:
train = train.sample(frac = 0.1)

In [6]:
train.shape

(43903, 2)

In [14]:
### save the split train, valid, and test set, for fair comparison between models
# train.to_pickle('/home/jupyter/sb-entity-classification/data/train.pkl')
# test.to_pickle('/home/jupyter/sb-entity-classification/data/test.pkl')

#### 3. Create features

##### 3.1. Simple statistical features already explored in EDA

In [7]:
def create_statistical_features(df):
    df_features = df.copy()

    df_features['num_chars'] = df['name'].str.len()
    df_features['num_words'] = df_features['name'].apply(lambda x: len(x.split()))

    count = lambda l1,l2: sum([1 for x in l1 if x in l2]) 
    df_features['num_punctuations']= df_features['name'].apply(lambda x: count(x,set(string.punctuation)))
    df_features['num_ascii']= df_features['name'].apply(lambda x: count(x,set(string.ascii_letters)))
    df_features['num_whitespace']= df_features['name'].apply(lambda x: count(x,set(string.whitespace)))
    df_features['num_digits']= df_features['name'].apply(lambda x: count(x,set(string.digits)))
    df_features['num_nonascii'] = df_features['num_chars']-df_features['num_punctuations']-df_features['num_digits']-df_features['num_ascii']- df_features['num_whitespace']
    df_features = df_features.drop(['num_whitespace', 'num_ascii'], axis = 1)
    
    return df_features

In [8]:
train_statistical_features = create_statistical_features(train)
all_cols = train_statistical_features.columns.tolist()
statistical_feature_cols = [x for x in all_cols if x not in ['class', 'name']]
train_statistical_features = train_statistical_features[statistical_feature_cols]

##### 3.2 keyword in brackets also explored in EDA

In [8]:
# based on common sense -
hand_code_keywords = ['company']+['musician', 'singer', 'writer', 'artist', 'author']\
+ ['footballer', 'cricketer', 'football', 'baseball', 'rugby', 'hockey'] +\
['politician'] +['ship'] + ['crater', 'river'] + ['horse'] +['plant']\
+['album', 'EP', 'soundtrack'] +['film'] +['novel', 'magazine', 'book', 'journal', 'play', 'comics', 'newspaper', 'manga']

In [9]:
def create_keyword_features(df):
    df_features = df.copy()
    df_features['within_brackets'] = df['name'].str.extract('.*\((.*)\).*')

    df_features_with_brackets = df_features[['within_brackets']].dropna(axis = 0)
    for keyword in hand_code_keywords:
        df_features_with_brackets['has_{}'.format(keyword)] = df_features_with_brackets['within_brackets']\
                                                                .apply(lambda x: keyword in x.lower() )

    df_features = df_features.drop('within_brackets', axis = 1)
    df_features_with_brackets = df_features_with_brackets.drop('within_brackets', axis = 1)

    df_features_with_brackets.replace({True:1, False:0}, inplace = True)
    df_features = df_features.merge(df_features_with_brackets, left_index = True, right_index = True, how = 'left')
    df_features.fillna(0, inplace = True)
    
    return df_features

In [10]:
train_keyword_features = create_keyword_features(train)
all_cols = train_keyword_features.columns.tolist()
keyword_feature_cols = [x for x in all_cols if x not in ['class', 'name']]
train_keyword_features = train_keyword_features[keyword_feature_cols]

##### 3.3 NER explored in EDA

In [7]:
nlp = spacy.load("en_core_web_sm")

In [8]:
def create_ner_features(df, NER_labels = None):
    
    df_features = df.copy()

    df_features['NER_labels'] = df_features['name'].apply(lambda x: [i.label_ for i in nlp(x).ents])
    df_features_NER = df_features[df_features['NER_labels'].apply(lambda x: len(x) !=0)][['NER_labels']]
    
    if NER_labels == None:
        all_NER_labels = set(np.concatenate(df_features_NER['NER_labels'].tolist()))
    else:
        all_NER_labels = NER_labels
        
    for ner in all_NER_labels:
        df_features_NER['ner_{}'.format(ner)] = df_features_NER['NER_labels'].apply(lambda x: ner in x )

    df_features = df_features.drop('NER_labels', axis = 1)
    df_features_NER = df_features_NER.drop('NER_labels', axis = 1)

    df_features_NER.replace({True:1, False:0}, inplace = True)
    df_features = df_features.merge(df_features_NER, left_index = True, right_index = True, how = 'left')
    df_features.fillna(0, inplace = True)
    
    return df_features, all_NER_labels

In [9]:
%%time
train_ner_features, NER_labels_trained = create_ner_features(train)
all_cols = train_ner_features.columns.tolist()
ner_feature_cols = [x for x in all_cols if x not in ['class', 'name']]
train_ner_features = train_ner_features[ner_feature_cols]

CPU times: user 5min 1s, sys: 175 ms, total: 5min 1s
Wall time: 5min 4s


#### 4. Prepare feature set

In [10]:
train = train.merge(train_ner_features,left_index = True, right_index = True, how = 'left')
#     train_statistical_features,left_index = True, right_index = True, how = 'left')\
# merge(train_keyword_features,left_index = True, right_index = True, how = 'left')
    
# .merge(train_ner_features,left_index = True, right_index = True, how = 'left')

In [11]:
X = train.drop(['name','class'], axis = 1)
y = train['class']

#### 5. Train xgboost

In [12]:
model = XGBClassifier(nthread = 3)
model.fit(X, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=3, nthread=3, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

#### 6. Evaluate model

In [13]:
test = test.sample(frac = 0.1)

# test_statistical_features = create_statistical_features(test)[statistical_feature_cols]
# test_keyword_features = create_keyword_features(test)[keyword_feature_cols]

test_ner_features, _ = create_ner_features(test, NER_labels_trained)
test_ner_features = test_ner_features[ner_feature_cols]

In [14]:
# test = test.merge(test_statistical_features,left_index = True, right_index = True, how = 'left')\
#             .merge(test_keyword_features,left_index = True, right_index = True, how = 'left')\
#             .merge(test_ner_features,left_index = True, right_index = True, how = 'left')

test = test.merge(test_ner_features,left_index = True, right_index = True, how = 'left')

In [15]:
test.fillna(0, inplace = True)

In [16]:
X_test = test.drop(['name','class'], axis = 1)
y_test = test['class']

In [17]:
X_test

Unnamed: 0,ner_WORK_OF_ART,ner_QUANTITY,ner_EVENT,ner_ORG,ner_CARDINAL,ner_FAC,ner_NORP,ner_ORDINAL,ner_LANGUAGE,ner_PERSON,ner_PRODUCT,ner_MONEY,ner_TIME,ner_LOC,ner_LAW,ner_DATE,ner_GPE
285419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
516713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
329433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
483564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
519958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
y_pred = model.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 22.18%


In [20]:
from sklearn.metrics import classification_report

In [21]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           1      0.000     0.000     0.000       764
           2      0.265     0.724     0.388       778
           3      0.000     0.000     0.000       847
           4      0.246     0.017     0.032       814
           5      0.197     0.849     0.320       840
           6      0.594     0.132     0.217       808
           7      0.393     0.072     0.122       790
           8      0.772     0.238     0.364       768
           9      0.000     0.000     0.000       791
          10      0.243     0.031     0.055       806
          11      0.144     0.747     0.242       799
          12      0.376     0.073     0.122       771
          13      0.548     0.145     0.229       821
          14      0.000     0.000     0.000       579

    accuracy                          0.222     10976
   macro avg      0.270     0.216     0.149     10976
weighted avg      0.274     0.222     0.152     10976



In [48]:
classes_list = pd.read_csv('/home/jupyter/sb-entity-classification/data/classes.txt', header = None)
classes_list['class'] = classes_list.index
classes_list.columns = ['class_name', 'class']
classes_list['class'] = classes_list['class'] + 1  # based on information provided in the brief
classes_list

Unnamed: 0,class_name,class
0,Company,1
1,EducationalInstitution,2
2,Artist,3
3,Athlete,4
4,OfficeHolder,5
5,MeanOfTransportation,6
6,Building,7
7,NaturalPlace,8
8,Village,9
9,Animal,10
