In [1]:
%matplotlib inline
%config IPCompleter.greedy=True

import string
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidkolb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
test = pd.read_csv('../../data/nlpnovice/test.csv')
train = pd.read_csv('../../data/nlpnovice/train.csv')


In [4]:
train.describe, test.describe

(<bound method NDFrame.describe of          id keyword location  \
 0         1     NaN      NaN   
 1         4     NaN      NaN   
 2         5     NaN      NaN   
 3         6     NaN      NaN   
 4         7     NaN      NaN   
 ...     ...     ...      ...   
 7608  10869     NaN      NaN   
 7609  10870     NaN      NaN   
 7610  10871     NaN      NaN   
 7611  10872     NaN      NaN   
 7612  10873     NaN      NaN   
 
                                                    text  target  
 0     Our Deeds are the Reason of this #earthquake M...       1  
 1                Forest fire near La Ronge Sask. Canada       1  
 2     All residents asked to 'shelter in place' are ...       1  
 3     13,000 people receive #wildfires evacuation or...       1  
 4     Just got sent this photo from Ruby #Alaska as ...       1  
 ...                                                 ...     ...  
 7608  Two giant cranes holding a bridge collapse int...       1  
 7609  @aria_ahrary @TheTawniest

# Cleaing the Data

In [5]:
def remove_whitespace(data):
    return data.strip()

In [6]:
def remove_URL(data):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',data)

In [7]:
def remove_html(data):
    html=re.compile(r'<.*?>')
    return html.sub(r'',data)

In [8]:
def remove_emoji(data):
    emoji_pattern = re.compile('['
                           u'\U0001F600-\U0001F64F'  # emoticons
                           u'\U0001F300-\U0001F5FF'  # symbols & pictographs
                           u'\U0001F680-\U0001F6FF'  # transport & map symbols
                           u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                           u'\U00002702-\U000027B0'
                           u'\U000024C2-\U0001F251'
                           ']+', flags=re.UNICODE)
    return emoji_pattern.sub(r'', data)

In [9]:
def remove_accented_chars(data):
    return unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [10]:
def remove_punctuation(data):
    table=str.maketrans('','',string.punctuation)
    return data.translate(table)

In [11]:
def single_char(data):
    new_data=''
    for w in data:
        print(w)
        if len(w) > 1:
            print(w)
            new_data = new_data +  ' ' + w
    
    return new_data

In [12]:
def remove_special_characters(data, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    data = re.sub(pattern, '', data)
    return data

In [13]:
def convert_lower_case(data):
    return data.lower()

In [14]:
def tokenise(data):
    data = word_tokenize(data)
    return data

In [15]:
def remove_stop_words(data):
    filtered_sentence = [w for w in data if not w in stop_words] 
    return filtered_sentence

In [16]:
def stemming(data):
    data.apply(lambda x: [stemmer.stem(e) for e in x])
    return data

In [17]:
def lemmatise(data):
    lemmatizer = WordNetLemmatizer()
    data.apply(lambda x: [lemmatizer.lemmatize(e) for e in x])
    return data 

In [18]:
def nlp_clean(data):
    data = remove_URL(data)
    data = remove_html(data)
    data = remove_emoji(data)
    data = remove_whitespace(data)    
    data = remove_accented_chars(data) 
    data = remove_special_characters(data)
    data = remove_punctuation(data)
    data = convert_lower_case(data)
    return data
    
def nlp_tokenise(data):
    stop_words = set(stopwords.words('english'))
    data = tokenise(data)
    data = remove_stop_words(data)
    return data
     
def nlp_normalise(data):
    stemmer = SnowballStemmer('english')
    data = stemming(data)
    data = lemmatise(data)
    return data

In [19]:
# Run preprossing steps to clean data
train['text']=train['text'].apply(lambda x : nlp_clean(x))
test['text']=test['text'].apply(lambda x : nlp_clean(x))

In [20]:
# Run steps to remove stop words
train['text']=train['text'].apply(lambda x : nlp_tokenise(x))
test['text']=test['text'].apply(lambda x : nlp_tokenise(x))

In [21]:
# rejoin Data after tokenisation 
def combine_text(list_of_text):
    combined_text = ''
    for word in list_of_text:
        combined_text = combined_text + ' ' + word
    return combined_text

In [22]:
train['text'] = train['text'].apply(lambda x : combine_text(x))
test['text'] = test['text'].apply(lambda x : combine_text(x))

In [23]:
train['text'].head(30)

0          deeds reason earthquake may allah forgive us
1                 forest fire near la ronge sask canada
2      residents asked shelter place notified office...
3      13000 people receive wildfires evacuation ord...
4      got sent photo ruby alaska smoke wildfires po...
5      rockyfire update california hwy 20 closed dir...
6      flood disaster heavy rain causes flash floodi...
7                            im top hill see fire woods
8      theres emergency evacuation happening buildin...
9                         im afraid tornado coming area
10                      three people died heat wave far
11     haha south tampa getting flooded hah wait sec...
12     raining flooding florida tampabay tampa 18 19...
13                      flood bago myanmar arrived bago
14        damage school bus 80 multi car crash breaking
15                                            whats man
16                                          love fruits
17                                        summer

In [24]:
test['text'].head(30)

0                           happened terrible car crash
1      heard earthquake different cities stay safe e...
2      forest fire spot pond geese fleeing across st...
3                 apocalypse lighting spokane wildfires
4                typhoon soudelor kills 28 china taiwan
5                                 shakingits earthquake
6      theyd probably still show life arsenal yester...
7                                                   hey
8                                              nice hat
9                                                  fuck
10                                       dont like cold
11                                      nooooooooo dont
12                                            dont tell
13                                                     
14                                              awesome
15     birmingham wholesale market ablaze bbc news f...
16               sunkxssedharry wear shorts race ablaze
17     previouslyondoyintv toke makinwauas marri

In [25]:
# train.to_csv('../../Data/NLPNovice/DSKtrain.csv', index=False)
# test.to_csv('../../Data/NLPNovice/DSKtest.csv', index=False)

# Load pre cleaned text files

In [26]:
# Load pre cleaned text files
# train = pd.read_csv('../../Data/NLPNovice/DSKtrain.csv')
# test = pd.read_csv('../../Data/NLPNovice/DSKtest.csv')

# Convert text column from object to string
train['text'] = train['text'].apply(lambda x : str(x))
test['text'] = test['text'].apply(lambda x : str(x))

#Shuffle Data 
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)

X = train['text']
y = train['target']

#Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [27]:
# SDG 
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.7521891418563923
              precision    recall  f1-score   support

           0       0.70      0.98      0.82      1305
           1       0.94      0.45      0.61       979

    accuracy                           0.75      2284
   macro avg       0.82      0.71      0.71      2284
weighted avg       0.80      0.75      0.73      2284



In [28]:
# naive_bayes
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.7994746059544658
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1305
           1       0.86      0.64      0.73       979

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.81      0.80      0.79      2284



# Predict and Submit Final Model

In [None]:
predict=model.predict(test_input)
print(predict)

In [None]:
predict=np.round(predict).astype(int).reshape(3263)
sub=pd.DataFrame({'id':test['id'].values.tolist(),'target':predict})

In [None]:
sub.to_csv('DSKsubmission.csv', index=False)

In [None]:
!kaggle competitions submit -c nlp-getting-started -f DSKsubmission.csv -m 'DSK NLP with Keras'

In [0]:
import pandas as pd
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
