In [None]:
import pandas as pd
import seaborn as sns
import string
import warnings
import matplotlib.pyplot as plt
import pickle
import numpy as np
import spacy 
from spacy.lang.en import English
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

#### 1. Read in data

In [None]:
# Location of the raw CSV; kept in one named constant so it is easy to repoint.
DATA_PATH = '/home/jupyter/sb-entity-classification/data/data.csv'

# Load the file and force the column names to the two the rest of the
# notebook relies on: the label ('class') and the entity name ('name').
df = pd.read_csv(DATA_PATH)
df.columns = ['class', 'name']

#### 2. Train / test split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffled
# split repeatable.
train, test = train_test_split(
    df,
    test_size = 0.2,
    random_state = 42,
    shuffle = True,
)

#### 3. Create features

##### 3.1. Simple statistical features already explored in EDA

In [None]:
def create_statistical_features(df):
    """Add simple character-count features computed from the ``name`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'name'``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns appended, in this order:

        * ``num_chars`` - length of the name.
        * ``num_words`` - number of whitespace-separated tokens.
        * ``num_punctuations`` - characters found in ``string.punctuation``.
        * ``num_digits`` - characters found in ``string.digits``.
        * ``num_nonascii`` - characters left over after removing ASCII
          letters, digits, punctuation and ``string.whitespace`` characters.
    """
    df_features = df.copy()
    names = df_features['name']

    # Build each character class once, not once per row.
    punctuation_chars = set(string.punctuation)
    letter_chars = set(string.ascii_letters)
    whitespace_chars = set(string.whitespace)
    digit_chars = set(string.digits)

    def count_members(chars):
        # Per-row count of the characters of the name that belong to `chars`.
        return names.apply(lambda text: sum(1 for ch in text if ch in chars))

    df_features['num_chars'] = names.str.len()
    df_features['num_words'] = names.apply(lambda text: len(text.split()))
    df_features['num_punctuations'] = count_members(punctuation_chars)
    df_features['num_digits'] = count_members(digit_chars)

    # The letter and whitespace counts are only needed to derive the
    # remainder, so they stay local instead of becoming temporary columns.
    df_features['num_nonascii'] = (
        df_features['num_chars']
        - df_features['num_punctuations']
        - df_features['num_digits']
        - count_members(letter_chars)
        - count_members(whitespace_chars)
    )

    return df_features

In [None]:
# Derive the statistical features for the training rows, then keep only the
# derived columns (everything except the label and the raw name). The column
# list is remembered so the same selection can be applied to other frames.
train_statistical_features = create_statistical_features(train)
all_cols = list(train_statistical_features.columns)
statistical_feature_cols = [col for col in all_cols if col not in ('class', 'name')]
train_statistical_features = train_statistical_features.loc[:, statistical_feature_cols]

##### 3.2. Keywords in brackets (also explored in EDA)

In [16]:
# Hand-picked keywords, chosen by common sense, to look for inside the
# bracketed part of a name. Grouped by theme; the order is significant because
# it determines the order of the generated feature columns.
hand_code_keywords = [
    # organisations
    'company',
    # creative professions
    'musician', 'singer', 'writer', 'artist', 'author',
    # sports
    'footballer', 'cricketer', 'football', 'baseball', 'rugby', 'hockey',
    # public office
    'politician',
    # vehicles
    'ship',
    # natural places
    'crater', 'river',
    # animals
    'horse',
    # plants
    'plant',
    # music releases
    'album', 'EP', 'soundtrack',
    # films
    'film',
    # written works
    'novel', 'magazine', 'book', 'journal', 'play', 'comics', 'newspaper', 'manga',
]

In [17]:
def create_keyword_features(df, keywords = None):
    """Add one 0/1 ``has_<keyword>`` column per keyword found inside brackets.

    The text between the last ``(`` and the final ``)`` after it is taken from
    each ``name``, lowercased, and searched for every keyword as a plain
    substring (so ``'play'`` also matches ``'player'``). Rows whose name has
    no bracketed part get 0 in every keyword column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'name'``. It is not modified.
    keywords : iterable of str, optional
        Keywords to search for. Defaults to the notebook-level
        ``hand_code_keywords`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one integer ``has_<keyword>`` column appended
        per keyword, in keyword order. Existing columns are left untouched.
    """
    if keywords is None:
        keywords = hand_code_keywords

    df_features = df.copy()

    # Raw string: '\(' inside a normal string literal is an invalid escape
    # sequence. expand=False yields a Series; names without brackets come back
    # as NaN and are turned into '' so that no keyword can match them.
    bracket_text = (
        df_features['name']
        .str.extract(r'.*\((.*)\).*', expand = False)
        .fillna('')
        .str.lower()
    )

    for keyword in keywords:
        # Lowercase the keyword as well: the bracket text is lowercased, so a
        # keyword containing capitals (e.g. 'EP') could otherwise never match.
        # The column name keeps the keyword's original spelling.
        df_features['has_{}'.format(keyword)] = bracket_text\
            .str.contains(keyword.lower(), regex = False)\
            .astype(int)

    return df_features

In [18]:
# Derive the bracket-keyword features for the training rows, then keep only
# the derived columns (everything except the label and the raw name). The
# column list is remembered so the same selection can be applied elsewhere.
train_keyword_features = create_keyword_features(train)
all_cols = list(train_keyword_features.columns)
keyword_feature_cols = [col for col in all_cols if col not in ('class', 'name')]
train_keyword_features = train_keyword_features.loc[:, keyword_feature_cols]

##### 3.3. Named-entity recognition (NER) features (also explored in EDA)

In [19]:
# Name of the spaCy pipeline package to load; it has to be installed in the
# kernel's environment beforehand.
SPACY_MODEL_NAME = "en_core_web_sm"

# Shared pipeline object used by the NER feature function.
nlp = spacy.load(SPACY_MODEL_NAME)

In [20]:
def create_ner_features(df, NER_labels = None):
    """Add one 0/1 ``ner_<LABEL>`` column per named-entity label found in ``name``.

    Each name is run through the notebook-level spaCy pipeline ``nlp`` and the
    labels of the entities it reports are recorded. A row gets 1 in
    ``ner_<LABEL>`` when at least one of its entities carries that label,
    otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'name'``. It is not modified.
    NER_labels : iterable of str, optional
        Labels to create columns for. When omitted, the labels are collected
        from the entities found in ``df``. Pass the labels returned by an
        earlier call to get the same columns on another frame, including
        labels that never occur in that frame.

    Returns
    -------
    tuple
        ``(features, labels)``: a copy of ``df`` with the integer
        ``ner_<LABEL>`` columns appended, and the labels that were used
        (a ``set`` when they were collected here, otherwise ``NER_labels``
        itself).
    """
    df_features = df.copy()

    # One list of entity labels per row; empty when no entity was found.
    # NOTE(review): this calls the pipeline once per row; batching may be
    # faster but has not been checked for identical output.
    row_labels = df_features['name'].apply(lambda text: [ent.label_ for ent in nlp(text).ents])

    if NER_labels is None:
        # A set comprehension also works when no row has any entity.
        all_NER_labels = {label for labels in row_labels for label in labels}
    else:
        all_NER_labels = NER_labels

    # A set has no stable iteration order for strings across interpreter
    # runs, so sort it to get a reproducible column order. An explicitly
    # ordered input keeps the caller's order.
    if isinstance(all_NER_labels, (set, frozenset)):
        label_order = sorted(all_NER_labels)
    else:
        label_order = list(all_NER_labels)

    for ner in label_order:
        df_features['ner_{}'.format(ner)] = row_labels.apply(lambda labels, ner = ner: int(ner in labels))

    return df_features, all_NER_labels

In [None]:
%%time
# Derive the NER features for the training rows and remember which entity
# labels were used. Then keep only the derived columns (everything except the
# label and the raw name) and remember that column list as well.
train_ner_features, NER_labels_trained = create_ner_features(train)
all_cols = list(train_ner_features.columns)
ner_feature_cols = [col for col in all_cols if col not in ('class', 'name')]
train_ner_features = train_ner_features.loc[:, ner_feature_cols]

#### 4. Prepare feature set

In [None]:
# Attach the three feature groups to the training rows, aligned on row index.
# Starting from just the 'class' and 'name' columns keeps this cell
# idempotent: re-running it no longer merges onto an already-merged frame,
# which would produce duplicated '_x' / '_y' feature columns.
train = (
    train[['class', 'name']]
    .merge(train_statistical_features, left_index = True, right_index = True, how = 'left')
    .merge(train_keyword_features, left_index = True, right_index = True, how = 'left')
    .merge(train_ner_features, left_index = True, right_index = True, how = 'left')
)

In [None]:
# Model inputs are every column except the raw name and the label;
# the label column is the target.
y = train['class']
X = train.drop(columns = ['name', 'class'])

#### 5. Train xgboost

In [None]:
# Gradient-boosted tree classifier created with the library's default settings.
model = XGBClassifier()
# Train on the prepared feature matrix and labels. This is the cell's last
# expression, so the notebook displays whatever fit() returns.
model.fit(X, y)

#### 6. Evaluate model

In [None]:
# Optional: evaluate on a 10% subsample of the held-out rows for a quicker run.
# test = test.sample(frac = 0.1)

# Build each feature group for the held-out rows and keep exactly the columns
# that were selected when the training features were built.
test_statistical_features = create_statistical_features(test).loc[:, statistical_feature_cols]
test_keyword_features = create_keyword_features(test).loc[:, keyword_feature_cols]

# Reuse the entity labels collected on the training rows so the NER columns
# line up with the ones the model was trained on.
test_ner_features, _ = create_ner_features(test, NER_labels_trained)
test_ner_features = test_ner_features.loc[:, ner_feature_cols]

In [None]:
# Attach the three feature groups to the held-out rows, aligned on row index.
# Starting from just the 'class' and 'name' columns keeps this cell
# idempotent: re-running it no longer merges onto an already-merged frame,
# which would produce duplicated '_x' / '_y' feature columns.
test = (
    test[['class', 'name']]
    .merge(test_statistical_features, left_index = True, right_index = True, how = 'left')
    .merge(test_keyword_features, left_index = True, right_index = True, how = 'left')
    .merge(test_ner_features, left_index = True, right_index = True, how = 'left')
)

In [None]:
# Replace any remaining missing values in the held-out frame with 0.
test = test.fillna(0)

In [None]:
# Same input/target separation as for training: every column except the raw
# name and the label is an input, and the label column is the target.
y_test = test['class']
X_test = test.drop(columns = ['name', 'class'])

In [None]:
# Predicted class for every held-out row, from the model fitted above.
y_pred = model.predict(X_test)

In [None]:
# Share of held-out rows whose predicted class equals the true class,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))