In [1]:
import nltk
import string
import spacy
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import accuracy_score
from spacy.lang.en import English
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 

In [2]:
def clean_text_and_get_tokens( text ):
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word

    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words

    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    return words

In [3]:


# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    sentence = sentence.lower()
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [4]:
train_df = pd.read_csv( 'train.csv')

In [5]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
train_df[ 'word_tokens' ] = train_df[ 'text' ].apply(clean_text_and_get_tokens).apply(lambda x : spacy_tokenizer( ' '.join( x ) ) )

In [7]:
pd.set_option('display.max_columns', 500)

In [8]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,word_tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [9]:
def update_word_count( x ):
    
    x['word_counts' ]['id' ] = x['id' ]
    x['word_counts' ]['target' ] = x['target']
    return x['word_counts'] 

In [10]:
train_df[ 'word_counts' ] = train_df[ 'word_tokens' ].apply( lambda x : Counter( x ) )

In [11]:
train_df[ 'word_counts' ] = train_df.apply( update_word_count , axis = 1 )

In [12]:
new_train_df = pd.DataFrame( data = train_df[ 'word_counts' ].tolist() )

In [13]:
new_train_df.columns

Index(['aa', 'aaaa', 'aaaaaaallll', 'aaaaaand', 'aaarrrgghhh', 'aaceorg',
       'aan', 'aannnnd', 'aar', 'aaronthefm',
       ...
       'zxathetis', 'zzzz', 'å', 'åç', 'åè', 'åê', 'åêfedex', 'åêi', 'ìñ',
       'ìü'],
      dtype='object', length=16475)

In [14]:
input_cols = new_train_df.columns.difference( [ 'id' , 'target' ])

In [15]:
input_cols

Index(['aa', 'aaaa', 'aaaaaaallll', 'aaaaaand', 'aaarrrgghhh', 'aaceorg',
       'aan', 'aannnnd', 'aar', 'aaronthefm',
       ...
       'zxathetis', 'zzzz', 'å', 'åç', 'åè', 'åê', 'åêfedex', 'åêi', 'ìñ',
       'ìü'],
      dtype='object', length=16473)

In [16]:
op_cols = ['target' ]
input_df = new_train_df[ input_cols ] 
op_df    = new_train_df[ op_cols ]

In [17]:

trainX , testX , trainY , testY = train_test_split( input_df , op_df )

In [18]:
lr = LogisticRegression()

In [19]:
trainX.fillna(0 , inplace = True )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [20]:
lr.fit(trainX , trainY)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
testX.fillna( 0 , inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [22]:
predicted_vals = lr.predict( testX )

In [23]:
accuracy = accuracy_score( testY , predicted_vals )

In [24]:
accuracy

0.8146008403361344

In [63]:
imp_metrics = [ i for i in zip( input_cols , lr.coef_[0] ) ]

In [64]:
imp_metrics.sort( key = lambda x : x[ 1 ] )

In [74]:
non_disaster_tweets_words = imp_metrics[:20]

In [75]:
non_disaster_tweets_words

[('nowplaying', -1.3439531071486135),
 ('bags', -1.175910738037093),
 ('selfies', -1.148446908109953),
 ('ebay', -1.1453302628806366),
 ('ruin', -1.1063293532174148),
 ('jobs', -1.105124525098807),
 ('buy', -1.0978928862201487),
 ('harm', -1.0692517380111763),
 ('blight', -1.0434767202505433),
 ('new', -1.0148660863317818),
 ('love', -1.0065728787022703),
 ('words', -0.994019859796408),
 ('better', -0.9893192653528573),
 ('wedding', -0.9713373735762837),
 ('upheaval', -0.9530280699326352),
 ('demolish', -0.9477106280909137),
 ('explode', -0.9371885069794118),
 ('jeb', -0.9269692376438363),
 ('poll', -0.9193962452804353),
 ('super', -0.9181523475927672)]

In [76]:
disaster_words = imp_metrics[-20:]

In [77]:
disaster_words

[('incident', 1.442926337411464),
 ('japan', 1.4465114712884186),
 ('tornado', 1.4773213477261034),
 ('suicide', 1.4822220665057921),
 ('bioterrorism', 1.5004548918602252),
 ('casualties', 1.5073270667301455),
 ('drought', 1.512999576595486),
 ('bombing', 1.5751505905525873),
 ('plane', 1.5971875060271472),
 ('train', 1.6272042698970048),
 ('floods', 1.6615171979314178),
 ('massacre', 1.7060807011006804),
 ('typhoon', 1.7111439876812733),
 ('wildfire', 1.7394730583007147),
 ('killed', 1.7490546668561255),
 ('storm', 1.7764513259934207),
 ('evacuated', 1.8985881304842354),
 ('earthquake', 1.917026791577826),
 ('fires', 2.1687474886959195),
 ('hiroshima', 2.6090238483002857)]