In [1]:
import pandas as pd
import os
import spacy

In [2]:
# Show up to 300 rows when a frame or series is displayed.
pd.set_option('display.max_rows', 300)
# Never truncate long cell text (tweets are long). `None` means "no limit";
# the legacy `-1` sentinel was deprecated in pandas 1.0 and is rejected by
# current pandas releases.
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the labelled training tweets from the UTF-8 encoded CSV.
train_csv_path = '../data/train.csv'
tweets = pd.read_csv(train_csv_path, encoding='utf8')

In [4]:
# Quick sanity check: print the number of rows, then peek at the first three tweets.
print(tweets.shape[0])
tweets.head(n=3)

7613


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1


In [5]:
# Eyeball rows 550-569 of the raw tweet text to spot odd characters, links, mentions, etc.
tweets.iloc[550:570]['text']

550    I added a video to a @YouTube playlist http://t.co/wedWyn9kfS World Of Tanks - Battle Assistant Mod Bat Chat Arti kaboom                  
551    YA BOY CLIP VS 4KUS FULL BATTLE\n\n@15MofeRadio @Heavybag201 @battle_dom @QOTRING @BattleRapChris @Hughes1128 \n\nhttps://t.co/7SPyDy1csc 
552    Indeed!! I am fully aware of that battle! I support you in that fight!!  https://t.co/MctJnZX4H8                                          
553    It's baaaack!  Petersen's Bowhunting Battle of the Bows.  Make sure you head on over and cast your vote for your... http://t.co/FJ73gDvg2n
554    #Tb #throwback ??\n\n??~ You want a battle? Here's a War! ~ ?? https://t.co/B0ZJWgmaIW                                                    
555    Kelby Tomlinson mild-mannered 2nd baseman for a great metropolitan team fights a never-ending battle for hits RBI and the #SFGiants way.  
556    Black Eye 9: A space battle occurred at Star M27329 involving 1 fleets totaling 1236 ships with 7 destroyed          

In [6]:
# The tweets contain hyperlinks in the body, so define helpers that strip that noise.
import re

# "http://" or "https://" followed by everything up to the next whitespace.
# Matching on non-whitespace (rather than a letters/digits/./ whitelist) also
# consumes '-', '_', '?', '=', '%' and similar characters, so no URL fragment
# is left behind in the text.
_URL_PATTERN = re.compile(r"https?://\S+")


def remove_urls(text):
    """Replace every http/https URL in ``text`` with a single space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` with each URL (scheme through the last non-whitespace
        character) replaced by one space. Text without URLs is returned
        unchanged.
    """
    return _URL_PATTERN.sub(' ', text)

def remove_numbers(text):
    """Replace each standalone run of digits in ``text`` with a single space.

    Only digit runs delimited by word boundaries are touched, so digits
    embedded in a larger token (e.g. ``2k13``) are left as they are.
    """
    standalone_digits = r'\b\d+\b'
    return re.sub(standalone_digits, ' ', text)

In [7]:
# Spot-check the URL stripper on a single tweet (positional row 69).
remove_urls(tweets['text'].iat[69])

'Accident center lane blocked in #SantaClara on US-101 NB before Great America Pkwy #BayArea #Traffic  '

In [8]:
# Spot-check the number stripper on a single tweet (positional row 62).
remove_numbers(tweets['text'].iat[62])

'Rene Ablaze &amp; Jacinta - Secret 2k13 (Fallen Skies Edit) - Mar      https://t.co/7MLMsUzV1Z'

In [9]:
# Clean every training tweet: strip URLs first, then standalone numbers.
text_cleaned = [
    remove_numbers(remove_urls(tweet))
    for tweet in tweets['text'].tolist()
]

For this exercise I will explore more classic models, such as tf-idf with logistic regression, SVM and neural networks.

#### We start with tf-idf

We will use a Term Frequency–Inverse Document Frequency (TF-IDF) model as the baseline for this task; in particular, we will use the TfidfVectorizer from scikit-learn. The TfidfVectorizer expects a list of texts, so we need to transform the text column (the tweet column) into a list before feeding it to the model.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over uni-, bi- and tri-grams with a vocabulary capped at
# 10,000 features. Case is preserved (lowercase=False), English stop words are
# dropped, and any run of one or more word characters counts as a token.
tfid_vectorizer = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,
    stop_words='english',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    max_features=10000,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Learn the vocabulary / IDF weights and vectorise the cleaned tweets in one pass.
# NOTE(review): the vectorizer is fitted on all of text_cleaned, i.e. before the
# train/validation split, so validation tweets contribute to the vocabulary and
# IDF weights.
X_text = tfid_vectorizer.fit_transform(text_cleaned)

In [19]:
# Hold out 30% of the tweets for validation, stratified on the label, with a
# fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

target_labels = tweets['target'].tolist()
X_train, X_valid, y_train, y_valid = train_test_split(
    X_text,
    target_labels,
    stratify=target_labels,
    test_size=0.3,
    random_state=42,
)


In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score

# Baseline: logistic regression with default settings on the TF-IDF features.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_valid)

# Score the predictions on the held-out validation split.
logreg_accuracy = accuracy_score(y_valid, y_pred)
logreg_macro_f1 = f1_score(y_valid, y_pred, average='macro')

print("---Test Set Results---")
print("Accuracy with logreg: {}".format(logreg_accuracy))
print("F1 with logreg: {}".format(logreg_macro_f1))
print(classification_report(y_valid, y_pred))

---Test Set Results---
Accuracy with logreg: 0.8047285464098074
F1 with logreg: 0.7932575257569595
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1303
           1       0.85      0.66      0.74       981

    accuracy                           0.80      2284
   macro avg       0.82      0.79      0.79      2284
weighted avg       0.81      0.80      0.80      2284



In [25]:
# Try gradient-boosted trees (XGBoost) on the same TF-IDF features.
import xgboost as xgb

# Parameter notes:
# * `eta` is an alias of `learning_rate`; only `learning_rate` is set so the
#   step size is unambiguous (passing both with different values is not).
# * `verbosity=0` and `n_jobs=2` are the current names for the legacy
#   `silent=1` and `nthread=2` arguments.
# NOTE(review): the scores recorded below show a large gap between training
# and validation macro-F1 (about 0.95 vs 0.73), i.e. this configuration
# overfits; max_depth=25 with 1000 trees is a likely place to start tuning.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=25,
    n_estimators=1000,
    learning_rate=0.10,
    subsample=0.8,
    colsample_bytree=.7,
    gamma=0,
    reg_alpha=4,
    n_jobs=2,
    verbosity=0,
)
xgb_model.fit(X_train, y_train)

# Predict each split once and reuse the result for every metric below.
xgb_train_prediction = xgb_model.predict(X_train)
xgb_prediction = xgb_model.predict(X_valid)

print('training f1 score:', f1_score(y_train, xgb_train_prediction, average='macro'))
print('validation f1 score:', f1_score(y_valid, xgb_prediction, average='macro'))
print(classification_report(y_valid, xgb_prediction))

training f1 score: 0.9458612120830379
validation f1 score: 0.7292414449630739
              precision    recall  f1-score   support

           0       0.75      0.83      0.78      1303
           1       0.73      0.63      0.67       981

    accuracy                           0.74      2284
   macro avg       0.74      0.73      0.73      2284
weighted avg       0.74      0.74      0.74      2284



In [23]:
# Load the unlabelled test tweets that the submission predictions are made for.
test_csv_path = '../data/test.csv'
test_data = pd.read_csv(test_csv_path, encoding='utf8')

In [26]:
# Peek at the first five test rows (head() defaults to five).
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [34]:
# Number of test rows that need a prediction.
test_data.shape[0]

3263

In [35]:
# Prepare the test tweets for submission with the same cleaning as the
# training text: strip URLs first, then standalone numbers.
text_cleaned_submission = [
    remove_numbers(remove_urls(tweet))
    for tweet in test_data['text'].tolist()
]

In [36]:
# Vectorise the cleaned test tweets with the vocabulary and IDF weights that
# were already learned from the training tweets. This must be transform(), not
# fit_transform(): refitting on the test text would build a different
# vocabulary, so the feature columns would no longer line up with the ones the
# models were trained on and the predictions would be meaningless.
X_text_submission = tfid_vectorizer.transform(text_cleaned_submission)


In [37]:
# Predict a label for every vectorised test tweet with the fitted XGBoost model.
xgb_prediction_submmision = xgb_model.predict(X_text_submission)

In [38]:
# Sanity check: there should be one prediction per test row.
xgb_prediction_submmision.shape

(3263,)

In [39]:
# Attach the predictions to the test frame as a new `target` column
# (this adds the column to test_data itself rather than creating a new frame).
test_data['target'] = xgb_prediction_submmision

In [40]:
# Confirm the new `target` column is present on the first two rows.
test_data.head(n=2)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",0


In [41]:
# Write the submission file: only the `id` and `target` columns, without the index.
submission_columns = ['id', 'target']
test_data[submission_columns].to_csv("../data/submission.csv", index=False)


In [None]:
# Repeat the 70/30 stratified split of the TF-IDF matrix with the same seed
# (42) as before; the label lists y_train / y_valid are reassigned here.
split_labels = tweets['target'].tolist()
X_train_sub, X_valid_sub, y_train, y_valid = train_test_split(
    X_text,
    split_labels,
    stratify=split_labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Next step: take the text data and tokenize it with spaCy. The intent is to
# run spaCy on the GPU to speed up calculations, so GPU use is requested here,
# before the model is loaded in the following cell.
# NOTE(review): presumably require_gpu() raises when no GPU is usable, whereas
# the commented-out spacy.prefer_gpu() alternative would fall back to CPU —
# confirm against the spaCy docs before running on a CPU-only machine.
spacy.require_gpu()

In [None]:
# Alternative GPU call, intentionally left disabled:
#spacy.prefer_gpu()
# Load the `en_core_web_lg` spaCy model; `nlp` is used to process the tweets.
nlp = spacy.load('en_core_web_lg')

# Check the default components: print the pipeline that the loaded model ships with.
print(nlp.pipeline)

In [None]:
# Run every raw (uncleaned) tweet through the spaCy pipeline in batches of 512
# and keep the results in a list; the %time magic reports how long the
# statement takes.
%time tweets_posted=list(nlp.pipe(tweets['text'].tolist(), batch_size=512))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Plain TF-IDF with default settings, fitted on the raw (uncleaned) tweet text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tweets['text'].tolist())

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on releases that
# predate the replacement.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Show a sample of the vocabulary instead of dumping every term into the
# notebook output; the second entry of X.shape is the full vocabulary size.
print(feature_names[:50])
print(X.shape)

In [None]:
# Preview the first eleven processed tweets.
tweets_posted[:11]