In [2]:
import nltk
import pandas as pd
import numpy as np

In [3]:
# Load the train and test CSVs; both paths are relative to the notebook's working directory.
train_df = pd.read_csv("data/nlp-getting-started/train.csv")
test_df = pd.read_csv("data/nlp-getting-started/test.csv")

In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Non-null counts per column for each target class: shows the class balance and
# how many keyword/location values are missing.
train_df.groupby(['target']).count()

Unnamed: 0_level_0,id,keyword,location,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4342,4323,2884,4342
1,3271,3229,2196,3271


In [57]:
# Cast every column of the training frame (including id and target) to str.
# NOTE(review): missing keyword/location values presumably become the literal
# string 'nan' here (pandas < 3 behaviour) - confirm this is intended.
# NOTE(review): the execution count of this cell (57) is out of sequence with its
# neighbours; confirm the notebook still works with Restart Kernel -> Run All.
train_df = train_df.astype(str)

## Feature extractor

### CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [97]:
# Bag-of-words vectoriser using scikit-learn's default settings.
vec = CountVectorizer()

In [98]:
# Learn the vocabulary from the `text` column and build the document-term count
# matrix; .toarray() converts the sparse result into a dense ndarray.
# NOTE(review): the dense form allocates rows x vocabulary cells - consider
# keeping the sparse matrix if memory becomes a concern.
data_vec = vec.fit_transform(train_df['text']).toarray()

In [101]:
# List the learned vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps the displayed value a plain list of Python strings.
(
    vec.get_feature_names_out().tolist()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)

['00',
 '000',
 '0000',
 '007npen6lg',
 '00cy9vxeff',
 '00end',
 '00pm',
 '01',
 '02',
 '0215',
 '02elqlopfk',
 '02pm',
 '03',
 '030',
 '033',
 '034',
 '039',
 '03l7nwqdje',
 '04',
 '05',
 '05th',
 '06',
 '060',
 '061',
 '06jst',
 '07',
 '073izwx0lb',
 '08',
 '0840728',
 '0853',
 '087809233445',
 '0880',
 '08lngclzsj',
 '09',
 '0abgfglh7x',
 '0ajisa5531',
 '0blkwcupzq',
 '0btniwagt1',
 '0bvk5tub4j',
 '0c1y8g7e9p',
 '0cr74m1uxm',
 '0cxm5tkz8y',
 '0dqjeretxu',
 '0drqlrsgy5',
 '0dxvz7fdh3',
 '0erisq25kt',
 '0f8xa4ih1u',
 '0fekgyby5f',
 '0fs9ksv5xk',
 '0ghk693egj',
 '0gidg9u45j',
 '0gknpy4lua',
 '0h7oua1pns',
 '0iw6drf5x9',
 '0iyuntxduv',
 '0jfnvaxfph',
 '0jmkdtcymj',
 '0kccg1bt06',
 '0keh2treny',
 '0krw1zyahm',
 '0l',
 '0la1aw9uud',
 '0llwuqn8vg',
 '0lmheaex9k',
 '0lpu0gr2j0',
 '0m1tw3datd',
 '0mcxc68gzd',
 '0migwcmtje',
 '0mnpcer9no',
 '0npzp',
 '0nr4dpjgyl',
 '0oms8ri3l1',
 '0pamznyyuw',
 '0q040stkcv',
 '0r03c6njli',
 '0rny349unt',
 '0rokdutyun',
 '0rsverlztm',
 '0s6ydfrwdq',
 '0sa6xx1o

In [102]:
# Inspect the dense count matrix (NumPy elides the middle of large arrays).
data_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Word-level vectoriser that counts both unigrams and bigrams (ngram_range=(1, 2)).
vec2 = CountVectorizer(analyzer='word', ngram_range=(1,2))

In [10]:
# Fit the unigram+bigram vectoriser on the `text` column and densify the counts.
# NOTE(review): the n-gram vocabulary is larger than the unigram one, so this
# dense array can be very large; data_vec2 is also not referenced again in the
# visible part of the notebook - confirm it is still needed.
data_vec2 = vec2.fit_transform(train_df['text']).toarray()

### TfidfVectorizer

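- **tf**: term frequency, i.e. how often a term occurs in a document.
- **tf-idf**: term frequency multiplied by inverse document frequency, which down-weights terms that occur in many documents.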

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
# Fit a TF-IDF vectoriser (default settings) on the `text` column.
# The result is kept as a sparse matrix here; it is only densified later.
tfidf_vectorizer = TfidfVectorizer()
data_vec_weighted = tfidf_vectorizer.fit_transform(train_df['text'])

In [59]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
data_vec_weighted

<7613x21637 sparse matrix of type '<class 'numpy.float64'>'
	with 111497 stored elements in Compressed Sparse Row format>

In [60]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps feature_names a plain list of str, as it was originally.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

In [61]:
# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term.
tfidf_df = pd.DataFrame(data=data_vec_weighted.toarray(), columns=feature_names)

In [85]:
# Put the metadata columns next to the TF-IDF features. Each metadata column
# gets a leading underscore ('_id', '_keyword', '_location', '_target') so it
# cannot collide with a vocabulary term of the same name.
df = pd.concat(
    [
        train_df[['id', 'keyword', 'location', 'target']].rename(
            columns=lambda name: '_' + name
        ),
        tfidf_df,
    ],
    axis=1,
)

## Training Classifier

In [68]:
from catboost import Pool, CatBoostClassifier

In [86]:
# Shuffle all rows with a fixed seed (frac=1 samples every row).
# NOTE(review): this rebinds df to its own shuffled copy, so re-running the cell
# reshuffles an already-shuffled frame and the row order depends on run count.
df = df.sample(frac=1, random_state=24)

In [87]:
from sklearn.model_selection import train_test_split

# Rebind rather than mutate in place so the cell is safe to re-run.
df = df.reset_index(drop=True)
# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
train, val = train_test_split(df, test_size=0.2, random_state=49)

target_col = '_target'
features = [col for col in df.columns if col != target_col]

x_train = train[features]
y_train = train[target_col]
x_val = val[features]
y_val = val[target_col]

# Every non-float feature column is passed to CatBoost as categorical.
# Fixes: the dtypes were previously read from an undefined name `X` (NameError on
# a fresh kernel), and `np.float` was removed in NumPy 1.24. np.float64 is the
# dtype that alias resolved to, so the selection is unchanged.
categorical_features_indices = np.where(x_train.dtypes != np.float64)[0]

# x_val has the same column order as x_train, so the same indices apply to both.
train_pool = Pool(x_train, y_train, cat_features=categorical_features_indices)
val_pool = Pool(x_val, y_val, cat_features=categorical_features_indices)

In [94]:
# Log-loss classifier that logs progress every 20 iterations.
# od_type='Iter' with od_wait=50 enables the overfitting detector with a
# 50-iteration patience, and use_best_model=True keeps the best iteration
# measured on the eval set.
cat_clf = CatBoostClassifier(
    loss_function='Logloss',
    od_type='Iter',
    od_wait=50,
    use_best_model=True,
    verbose=20,
)

# Train on the training pool and monitor the validation pool.
cat_clf.fit(train_pool, eval_set=val_pool)

Learning rate set to 0.049512
0:	learn: 0.6792188	test: 0.6774998	best: 0.6774998 (0)	total: 50.5ms	remaining: 50.5s
20:	learn: 0.5627976	test: 0.5415061	best: 0.5415061 (20)	total: 966ms	remaining: 45s
40:	learn: 0.5298941	test: 0.5058658	best: 0.5058658 (40)	total: 1.88s	remaining: 43.9s
60:	learn: 0.5130226	test: 0.4912373	best: 0.4912373 (60)	total: 2.8s	remaining: 43.1s
80:	learn: 0.5020851	test: 0.4836496	best: 0.4835681 (79)	total: 3.8s	remaining: 43.1s
100:	learn: 0.4929976	test: 0.4786504	best: 0.4786504 (100)	total: 4.81s	remaining: 42.8s
120:	learn: 0.4851662	test: 0.4743859	best: 0.4743859 (120)	total: 5.73s	remaining: 41.6s
140:	learn: 0.4784747	test: 0.4717402	best: 0.4717402 (140)	total: 6.65s	remaining: 40.5s
160:	learn: 0.4715533	test: 0.4694419	best: 0.4694419 (160)	total: 7.6s	remaining: 39.6s
180:	learn: 0.4643536	test: 0.4669220	best: 0.4669220 (180)	total: 8.5s	remaining: 38.5s
200:	learn: 0.4572542	test: 0.4647251	best: 0.4647251 (200)	total: 9.46s	remaining: 37.

<catboost.core.CatBoostClassifier at 0x3f4a4d690>

In [95]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def model_performance(model, X_train, X_val, y_train, y_val):
    """Print ROC AUC for both splits plus validation diagnostics.

    Args:
        model: Fitted estimator exposing ``predict_proba`` and ``predict``.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.

    Returns:
        None. The ROC AUC line, the validation confusion matrix and the
        validation classification report are printed to stdout.
    """
    # The second column of predict_proba is the score handed to roc_auc_score.
    train_scores, val_scores = (
        model.predict_proba(split)[:, 1] for split in (X_train, X_val)
    )
    auc_pair = (
        roc_auc_score(y_train, train_scores),
        roc_auc_score(y_val, val_scores),
    )
    print('Roc_auc (Training set): %0.4f, Roc_auc (Validation set): %0.4f'%auc_pair)

    # Hard class predictions feed both validation summaries.
    val_predictions = model.predict(X_val)
    for summarise in (confusion_matrix, classification_report):
        print(summarise(y_val, val_predictions))

In [96]:
# Report metrics for the fitted CatBoost model on the training and validation splits.
model_performance(cat_clf, x_train, x_val, y_train, y_val)

Roc_auc (Training set): 0.9521, Roc_auc (Validation set): 0.8614
[[766 125]
 [187 445]]
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       891
           1       0.78      0.70      0.74       632

    accuracy                           0.80      1523
   macro avg       0.79      0.78      0.79      1523
weighted avg       0.79      0.80      0.79      1523



In [104]:
# Predicted class probabilities for the validation features.
val_proba = cat_clf.predict_proba(x_val)

In [106]:
# Inspect the predicted probabilities (one row per validation sample, one
# column per class). The cell previously named x_val, which is a DataFrame and
# does not match the probability array recorded as this cell's output.
val_proba

array([[0.34598101, 0.65401899],
       [0.86649608, 0.13350392],
       [0.85493877, 0.14506123],
       ...,
       [0.80269525, 0.19730475],
       [0.15577207, 0.84422793],
       [0.79080868, 0.20919132]])