In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

In [26]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [27]:
X=train.drop(columns=['target'])
y=train['target']

In [28]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [29]:
X_train.head()

Unnamed: 0,id,keyword,location,text
5836,8339,rubble,"Dallas, Tejas",Photo: postapocalypticflimflam: Prodding aroun...
30,44,,,The end!
1879,2700,crush,,Man crush everyday ???? @CristianInspire http...
6852,9820,trauma,I rap to burn shame.,@PTSD_Chat Yes. I feel the root of that is Sha...
2673,3835,detonate,"Sharkatraz/Bindle's Cleft, PA",@AutoAmes everyone hoped we would join ISIS an...


In [30]:
X_train=X_train.fillna('empty')
X_test=X_test.fillna('empty')

In [31]:
vectorizer_keyword = CountVectorizer(max_features=200)
vectorizer_location = CountVectorizer(max_features=200)


In [32]:
X_train_location = vectorizer_location.fit_transform(X_train['location'])
X_test_location  = vectorizer_location.transform(X_test['location'])

In [33]:
X_train_keyword = vectorizer_keyword.fit_transform(X_train['keyword'])
X_test_keyword = vectorizer_keyword.fit_transform(X_test['keyword'])

In [34]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=2000)

In [35]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['text'])
X_test_tfidf= tfidf_vectorizer.transform(X_test['text'])

In [36]:
X_train_keyword.shape

(5100, 200)

In [37]:
X_train_total=hstack([X_train_location,X_train_keyword,X_train_tfidf]).toarray()
X_test_total=hstack([X_test_location,X_test_keyword,X_test_tfidf]).toarray()

In [38]:
train_data=xgb.DMatrix(data=X_train_total,label=y_train)
validation_data=xgb.DMatrix(data=X_test_total,label=y_test)


In [39]:
param={
    'learning_rate':0.2,  
    'max_depth':12,
    'min_child_weight':5,
    'gamma':0.4
    
}

In [40]:
bst = xgb.train(param, train_data)

In [41]:
y_pred=bst.predict(validation_data)

In [42]:
y_pred[y_pred>=0.5]=1
y_pred[y_pred<0.5]=0

In [43]:
accuracy_score(y_test,y_pred)

0.724233983286908

In [71]:
x1 = xgb.DMatrix(X_train_total[8:10])
pred=bst.predict(x1)

In [72]:
pred

array([0.27587563, 0.27587563], dtype=float32)

In [46]:
y_train[3:5]

6852    1
2673    0
Name: target, dtype: int64

In [66]:
print(X_train.iloc[9]['text'])
print(y_train.iloc[9])

@ErinMariefishy everyone is setting flames upon me
0


In [54]:
y_train.value_counts()

0    2896
1    2204
Name: target, dtype: int64

In [55]:
y_train[0:10]

5836    0
30      0
1879    0
6852    1
2673    0
2818    1
4522    0
5500    0
5198    0
3861    0
Name: target, dtype: int64

In [48]:
import joblib

In [49]:
joblib.dump(vectorizer_keyword,'vectorizer_keyword.pkl')

['vectorizer_keyword.pkl']

In [50]:
joblib.dump(vectorizer_location,'vectorizer_location.pkl')

['vectorizer_location.pkl']

In [52]:
joblib.dump(tfidf_vectorizer,'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [51]:
joblib.dump(bst,'model.pkl')

['model.pkl']