In [1]:
import spacy
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, VectorizerMixin
from utils import *

In [2]:
# Reload the autoreload extension so edited modules (e.g. utils) are re-imported automatically.
%reload_ext autoreload
%autoreload 2
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Show up to 120 characters per column when pandas displays text.
pd.set_option('max_colwidth', 120)

In [3]:
# Note: you can add other languages that spaCy supports, or download
# larger English models that spaCy offers.
try:
    nlp = spacy.load('en')
except OSError:
    # spaCy 3 removed shortcut names such as 'en'; fall back to the full
    # package name of the small English model.
    nlp = spacy.load('en_core_web_sm')

In [4]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

def stop_word_removal(li):
    """Return the items of ``li`` that are not in ENGLISH_STOP_WORDS, keeping their order."""
    kept = []
    for token in li:
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

In [5]:
from textblob import TextBlob

def translation(s):
    """Translate ``s`` to English with TextBlob and return the result formatted as a string."""
    translated = TextBlob(s).translate(to = 'en')
    return format(translated)

In [6]:
from utils import clean_html
from sklearn.feature_extraction.text import strip_accents_unicode
import snowballstemmer

def clean_twitter(s):
    """Strip Twitter-specific noise from a tweet.

    At the moment only @-mentions are dropped; add further
    Twitter-specific rules here as needed.
    """
    mention_pattern = r'@\w+'  # an "@" followed by one or more word characters
    without_mentions = re.sub(mention_pattern, '', s)
    return without_mentions

def clean_tweet(s):
    '''
    Clean tweet text by removing links and special characters.

    Every URL-like token (``scheme://...``) and every character that is not
    an ASCII letter, a digit, a space or a tab is replaced by a space; the
    result is then split on whitespace and re-joined with single spaces.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, with no leading/trailing or repeated whitespace.
    '''
    # Raw string: "\w", "\/" and "\S" are invalid escape sequences in a
    # normal string literal and trigger warnings on recent Python versions.
    pattern = r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", s).split())
# Disabled extra alternative that would also drop whole @-mentions: (@[A-Za-z0-9]+)|

In [7]:
def preprocessor(s):
    """Basic string cleanup applied to every document.

    Steps, in order: ``clean_html``, lower-casing, ``strip_accents_unicode``
    and finally ``clean_tweet``. Extend this with anything specific to your
    goal: add tokens, remove things, unify things.
    """
    without_html = clean_html(s)
    lowered = without_html.lower()
    unaccented = strip_accents_unicode(lowered)
    # Currently disabled: clean_twitter() before clean_tweet(), and English
    # Snowball stemming of the whitespace-split words afterwards.
    return clean_tweet(unaccented)

In [8]:
def cool_tokenizer(sent):
    """Build one extra token from the lemmas of the ROOT and nsubj words (idea from Travis in class).

    The selected words are ordered by their dependency label and their
    lemmas are joined with single spaces.
    """
    wanted = ['nsubj', 'ROOT']
    picked = [tok for tok in nlp(sent) if tok.dep_ in wanted]
    # Stable sort, so words sharing a label keep their sentence order.
    picked.sort(key = lambda tok: tok.dep_)
    lemmas = (tok.lemma_ for tok in picked)
    return ' '.join(lemmas)

# Quick check of cool_tokenizer on a sample sentence.
cool_tokenizer('a migrant died in crossing the river')

'die migrant'

In [9]:
from langdetect.lang_detect_exception import LangDetectException
from langdetect import detect

def dep_tokenizer(sent):
    """Tokenize a sentence into ``lemma:dependency`` strings.

    Stop words, punctuation and words without a dependency label are
    skipped.

    Note: this is monolingual! Also, it still doesn't take into
    account correlations!
    """
    skipped_deps = ['punct', '']
    tokens = []
    for tok in nlp(sent):
        if tok.is_stop or tok.dep_ in skipped_deps:
            continue
        tokens.append(tok.lemma_ + ':' + tok.dep_)
    return tokens

# Quick check of dep_tokenizer on a sample sentence.
dep_tokenizer('a migrant died in crossing the river')

['migrant:nsubj', 'die:ROOT', 'cross:pcomp', 'river:dobj']

In [10]:
def num_tokenizer(sent):
    """Return 'ONE' if any word of ``sent`` is tagged NUM, otherwise 'ZERO'.

    Idea: tweets with numerical words or numbers reflect death tolls or
    something similar.
    """
    has_number = any(tok.pos_ == 'NUM' for tok in nlp(sent))
    return 'ONE' if has_number else 'ZERO'

# Quick check of num_tokenizer on a sentence that contains a number.
num_tokenizer('100 migrants died in crossing the river')

'ONE'

In [11]:
def sentiment_tokenizer(sent):
    """Label a tweet by the TextBlob polarity of its cleaned text.

    Returns 'POSITIVE' for polarity above zero, 'NEUTRAL' for exactly
    zero and 'NEGATIVE' otherwise.
    """
    # The tweet is cleaned with clean_tweet before TextBlob scores it.
    polarity = TextBlob(clean_tweet(sent)).sentiment.polarity
    if polarity > 0:
        return 'POSITIVE'
    if polarity == 0:
        return 'NEUTRAL'
    return 'NEGATIVE'

In [12]:
import re

def analyzer(s, ngram_range = (1, 4)):
    """Turn one raw document into its full list of tokens.

    The result is the n-grams of the preprocessed words followed by three
    extra tokens: the cool_tokenizer token, the num_tokenizer token and the
    sentiment label.
    """
    # The sentiment label is computed on the raw text, before preprocessing.
    sentiment = sentiment_tokenizer(s)
    cleaned = preprocessor(s)
    # Words of at least two word characters.
    words = re.findall(r"(?u)\b\w\w+\b", cleaned)
    # stop_word_removal(words) is currently disabled.
    extras = [cool_tokenizer(cleaned), num_tokenizer(cleaned), sentiment]
    return ngrammer(words, ngram_range) + extras

In [13]:
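# One-off preprocessing (disabled): translate non-English training tweets to
# English and save the result as kaggle/X_eng.csv.
#X = pd.read_csv('kaggle/train.csv').tweet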
#for i in range(0, len(X)):
#    if clean_html(X[i]) != '':
#        if detect(X[i]) != 'en':
#            try:
#                X[i] = translation(X[i])
#            except:
#                pass

#X.to_csv('kaggle/X_eng.csv', header = 'tweet', index = False)

In [14]:
from sklearn.model_selection import train_test_split

# Tweets come from the translated file, labels from the original training file.
X = pd.read_csv('kaggle/X_eng.csv', lineterminator = '\n').tweet
y = pd.read_csv('kaggle/train.csv').label

# Alternative data source (disabled):
#X = pd.read_csv('kaggle/kaggle_train.csv').tweet
#y = pd.read_csv('kaggle/kaggle_train.csv').label

# Ordered hold-out split: rows before `cutoff` train, the remaining rows test.
cutoff = 1750
X_train, y_train = X[:cutoff], y[:cutoff]
X_test, y_test = X[cutoff:], y[cutoff:]

# Random split alternative (disabled):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [15]:
# Sanity-check the sizes of the train and test parts.
X_train.shape, X_test.shape

((1750,), (450,))

In [16]:
from scipy.sparse import coo_matrix, hstack

def create_vectors(X_train, X_test, analyzer = analyzer):
    """Vectorize train and test documents with a TfidfVectorizer using our analyzer.

    The vectorizer is fitted on the concatenation of ``X_train`` and
    ``X_test``, so the vocabulary and IDF weights also see the test
    documents.

    Returns the train matrix, the test matrix and the fitted vectorizer.
    """
    n_train = X_train.shape[0]
    combined = pd.concat([X_train, X_test])
    vectorizer = TfidfVectorizer(analyzer=analyzer)
    # A CountVectorizer with the same analyzer is the disabled alternative.
    vectorizer.fit(combined)
    matrix = vectorizer.transform(combined)
    # Split the stacked matrix back into its train and test rows.
    return matrix[:n_train], matrix[n_train:], vectorizer

In [17]:
def hot_coding_sentiment(X, matrix, sentiment):
    """Append a 0/1 column flagging the tweets whose sentiment label equals ``sentiment``.

    Parameters
    ----------
    X : iterable of str
        Tweets, in the same order as the rows of ``matrix``.
    matrix : 2-D array
        Dense feature matrix (it is combined with ``np.hstack``).
    sentiment : str
        Label compared against the return value of ``sentiment_tokenizer``
        ('POSITIVE', 'NEUTRAL' or 'NEGATIVE').

    Returns
    -------
    2-D array
        ``matrix`` with one extra column on the right.
    """
    rows = matrix.shape[0]
    flags = np.zeros((rows, 1), dtype=int)
    # Walk X by position: X[i] on a pandas Series is a label lookup, which
    # fails (or picks the wrong row) when the index is not 0..n-1.
    for position, tweet in enumerate(X):
        if sentiment_tokenizer(tweet) == sentiment:
            flags[position] = 1
    return np.hstack([matrix, flags])

In [18]:
def hot_coding_numerical(X, matrix, numerical):
    """Append a 0/1 column flagging the tweets whose number label equals ``numerical``.

    Parameters
    ----------
    X : iterable of str
        Tweets, in the same order as the rows of ``matrix``.
    matrix : 2-D array
        Dense feature matrix (it is combined with ``np.hstack``).
    numerical : str
        Label compared against the return value of ``num_tokenizer``
        ('ONE' or 'ZERO').

    Returns
    -------
    2-D array
        ``matrix`` with one extra column on the right.
    """
    rows = matrix.shape[0]
    flags = np.zeros((rows, 1), dtype=int)
    # Walk X by position: X[i] on a pandas Series is a label lookup, which
    # fails (or picks the wrong row) when the index is not 0..n-1.
    for position, tweet in enumerate(X):
        if num_tokenizer(tweet) == numerical:
            flags[position] = 1
    return np.hstack([matrix, flags])

In [19]:
def hot_coding(X, V):
    """Append one indicator column per sentiment label and per number label to ``V``.

    Columns are added in this order: POSITIVE sentiment, NEGATIVE
    sentiment, then the ONE number flag.

    Parameters
    ----------
    X : tweets, in the same order as the rows of ``V``.
    V : dense feature matrix.

    Returns
    -------
    ``V`` with three extra columns on the right.
    """
    # The labels must match the tokenizers' return values exactly; those
    # are upper-case, so lower-case labels would never match and every
    # added column would stay all zeros.
    sentiment_labels = ['POSITIVE', 'NEGATIVE']
    numerical_labels = ['ONE']
    for label in sentiment_labels:
        V = hot_coding_sentiment(X, V, label)
    for label in numerical_labels:
        V = hot_coding_numerical(X, V, label)
    return V

In [20]:
# Vectorize the hold-out split; the fitted vectorizer is kept for later inspection.
V_train, V_test, vectorizer = create_vectors(X_train, X_test)

In [None]:
#V_train = V_train.toarray()
#V_test = V_test.toarray()

#V_train = hot_coding(X_train, V_train)
#X_test_reset = X_test.reset_index().tweet
#V_test = hot_coding(X_test_reset, V_test)

# Estimation

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import recall_score, precision_score, f1_score, average_precision_score, roc_auc_score

# Multinomial Naive Bayes baseline, scored by ROC AUC on the hold-out part.
model = MultinomialNB()
model.fit(V_train, y_train)
# Column 1 of predict_proba is used as the ranking score.
preds = model.predict_proba(V_test)[:,1]
roc_auc_score(y_test, preds)

In [None]:
# Bernoulli Naive Bayes with equal class priors, scored the same way.
model = BernoulliNB(class_prior=[0.5, 0.5])
model.fit(V_train, y_train)
# Column 1 of predict_proba is used as the ranking score.
preds = model.predict_proba(V_test)[:,1]
roc_auc_score(y_test, preds)

## SVM

In [None]:
from sklearn.svm import LinearSVC
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV

# Sampling distributions for the random search.
one_to_left = st.beta(10, 1)  # defined here but not used in this cell's params
from_zero_positive = st.expon(0, 1)
params = {
    "C": from_zero_positive,
    "intercept_scaling": from_zero_positive,
}

# Base estimator; C and intercept_scaling are left to the search.
model = LinearSVC(tol= 10e-6, max_iter= 1000, 
                  penalty= 'l2', loss= 'squared_hinge', 
                  dual= True, fit_intercept= True, 
                  random_state= None, class_weight= None, 
                  verbose= 0)
# Try 20 sampled parameter settings.
# NOTE(review): no `scoring` is passed, so the search presumably optimizes the
# default score while this cell reports ROC AUC — confirm that is intended.
model = RandomizedSearchCV(model, params, n_jobs=1, n_iter=20)

model.fit(V_train, y_train)
# decision_function scores are used as the ranking for ROC AUC.
preds = model.decision_function(V_test)
roc_auc_score(y_test, preds)

In [22]:
from sklearn.svm import LinearSVC
# Imported here as well so this cell does not depend on the Naive Bayes
# cell having been run first.
from sklearn.metrics import roc_auc_score

# Linear SVM with hand-picked hyperparameters (small C, i.e. strong regularization).
model = LinearSVC(tol = 10e-6, max_iter = 100,
                  penalty = 'l2', loss = 'squared_hinge', 
                  dual = True, C = 0.001, 
                  fit_intercept = True, intercept_scaling = 10,
                  verbose = 0, random_state = None, 
                  class_weight = None)

model.fit(V_train, y_train)
# decision_function scores are used as the ranking for ROC AUC.
preds = model.decision_function(V_test)
roc_auc_score(y_test, preds)

0.8942948750572549

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Sampling distributions for the random search.
one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 1)  # defined here but not used in this cell's params
params = {
    "n_estimators": st.randint(50, 500),
    "max_depth": st.randint(1, 3),
    "learning_rate": st.uniform(0.01, 0.8),
    "subsample": one_to_left,
}

# Gradient boosting tuned over 5 sampled parameter settings.
gbc = GradientBoostingClassifier()
gbc = RandomizedSearchCV(gbc, params, n_jobs=1, n_iter=5)

# Fixed-parameter alternative (disabled):
#gbc = GradientBoostingClassifier(n_estimators = 300, max_depth = 1, learning_rate = 0.1)
gbc.fit(V_train, y_train)
# Note: the result is stored in `pred`, not in the `preds` used by other cells.
pred = gbc.predict(V_test)
# The probabilities are computed but neither stored nor displayed.
gbc.predict_proba(V_test)
# Cell output: the search object's score on the hold-out part.
gbc.score(V_test, y_test)

# ROC AUC evaluation (disabled):
#preds = gbc.decision_function(V_test)
#roc_auc_score(y_test, preds)

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# Sampling distributions for the random search.
one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 1)
params = {
    "n_estimators": st.randint(50, 500),
    "max_depth": st.randint(1, 3),
    "learning_rate": st.uniform(0.01, 0.8),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

# `n_jobs` is the scikit-learn wrapper's parallelism parameter; the previous
# `nthreads` spelling is not a recognized parameter, so it had no effect.
xgbc = XGBClassifier(n_jobs=-1)
# XGBoost tuned over 10 sampled parameter settings.
xgbc = RandomizedSearchCV(xgbc, params, n_jobs=1, n_iter=10)
xgbc.fit(V_train, y_train)
# Note: the result is stored in `pred`, not in the `preds` used by other cells.
pred = xgbc.predict(V_test)
# The probabilities are computed but neither stored nor displayed.
xgbc.predict_proba(V_test)
# Cell output: the search object's score on the hold-out part.
xgbc.score(V_test, y_test)

In [None]:
# Look at your false predictions!
# NOTE(review): `preds` is whatever an earlier cell assigned last; the two
# boosting cells write `pred`, not `preds` — confirm it belongs to the model
# you mean to inspect.
false_pos, false_neg = get_errors(X_test, y_test, preds)

## Submission!

Here you can make the submission required for Kaggle. 

In [None]:
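# One-off preprocessing (disabled): translate non-English test tweets to
# English and save the result as kaggle/test_X_eng.csv.
#X_sub = pd.read_csv('kaggle/test.csv').tweet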
#for i in range(0, len(X_sub)):
#    if clean_html(X_sub[i]) != '':
#        if detect(X_sub[i]) != 'en':
#            try:
#                X_sub[i] = translation(X_sub[i])
#            except:
#                pass

#X_sub.to_csv('kaggle/test_X_eng.csv', header = 'tweet', index = False)

In [23]:
# Submission inputs: translated test tweets, plus the ids from the original test file.
X_sub = pd.read_csv('kaggle/test_X_eng.csv', lineterminator = '\n').tweet
id_sub = pd.read_csv('kaggle/test.csv').id

# Re-vectorize with all labelled tweets (X) as the training part and the
# submission tweets as the test part; this overwrites V_train and V_test.
V_train, V_test, _ = create_vectors(X, X_sub)
# Optional dense conversion plus extra indicator columns (disabled):
#V_train = V_train.toarray()
#V_test = V_test.toarray()

#V_train = hot_coding(X_train, V_train)
#X_test_reset = X_test.reset_index().tweet
#V_test = hot_coding(X_test_reset, V_test)

In [24]:
# Refit `model` on every labelled tweet and score the submission tweets.
# NOTE(review): `model` is whichever estimator an earlier cell assigned last — confirm.
model.fit(V_train, y)
preds = model.decision_function(V_test)
# Gradient boosting alternative (disabled):
#gbc.fit(V_train, y)
#preds = gbc.decision_function(V_test)

# XGBoost alternative (disabled):
#xgbc.fit(V_train, y)
#preds = xgbc.predict_proba(V_test)
#preds = [x[1] for x in preds]

In [25]:
# Write the scores together with the test ids to the Kaggle submission file.
write_submission_csv(preds, id_sub, 'kaggle/submission.csv')