In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import re
from collections import Counter

In [3]:
tweets = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [4]:
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
tweets.shape

(7613, 5)

In [6]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
len(tweets.keyword.unique())

222

In [8]:
tweets.location.unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [9]:
tweets.text[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [10]:
from nltk.corpus import stopwords
#nltk.download("stopwords")
from nltk.stem import WordNetLemmatizer
from nltk import PorterStemmer


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, remove_punc=True, to_lower=True, remove_num=True, remove_stopwords=True, lemmatize=True):
        self.remove_punc=remove_punc
        self.to_lower=to_lower
        self.remove_num=remove_num
        self.remove_stopwords = remove_stopwords
        self.lemmatize=lemmatize
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed =[]
        for text in X:
            if self.remove_punc:
                text = re.sub(r'\W+',' ', text)
            if self.to_lower:
                text = text.lower()
            if self.remove_num:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', ' num ', text)
            words = text.split(' ')
            filtered_text = []
            if self.remove_stopwords:
                for word in words:
                    if word not in stop_words:
                        filtered_text.append(word)
            if self.lemmatize:
                lem_words = []
                for word in filtered_text:
                    word_lemma = lemmatizer.lemmatize(word)
                    if not word_lemma in lem_words:
                        lem_words.append(word_lemma)
                text = lem_words
            X_transformed.append(text)
        return np.array(X_transformed)
                    

In [12]:
class TextCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, lower_case=True, remove_punctuation=True, remove_numbers=True, remove_urls=True, lemmatizing=True, remove_stopwords=True, stemming=True):
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.lemmatizing = lemmatizing
        self.remove_stopwords = remove_stopwords
        self.stemming = stemming
        self.remove_urls = remove_urls
        
    def fit(self,X ,y=None):
        return self
    
    def transform(self, X, y=None):
        X_processed = []
        for text in X:
            if self.lower_case:
                text = text.lower()
            if self.remove_urls:
                text = re.sub(r'http?:.*(?=\s)',' url ', text)
            if self.remove_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', ' ', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            if self.remove_stopwords:
                text = [word for word in text.split() if not word in stop_words]
            word_counts = Counter(text)
            if self.stemming:
                stem_words = Counter()
                for word,count in word_counts.items():
                    word_stem = stemmer.stem(word)
                    stem_words[word_stem]+=count
                word_counts = stem_words
            X_processed.append(word_counts)
        return np.array(X_processed)

In [13]:
text_processor1 = TextProcessor()
data = text_processor1.fit_transform(tweets.text)



In [14]:
data

array([list(['deed', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'u']),
       list(['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']),
       list(['resident', 'asked', 'shelter', 'place', 'notified', 'officer', 'evacuation', 'order', 'expected']),
       ...,
       list(['num', '', 'utc', 'km', 'volcano', 'hawaii', 'http', 'co', 'zdtoyd', 'ebj']),
       list(['police', 'investigating', 'e', 'bike', 'collided', 'car', 'little', 'portugal', 'rider', 'suffered', 'serious', 'non', 'life', 'threatening', 'injury', '']),
       list(['latest', 'home', 'razed', 'northern', 'california', 'wildfire', 'abc', 'news', 'http', 'co', 'ymy', 'num', 'rskq'])],
      dtype=object)

In [15]:
keywords = tweets.keyword.unique()

In [16]:
keywords

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [17]:
keywords=[x.replace('%20',' ') for x in keywords[1:]]

In [18]:
keywords = set([lemmatizer.lemmatize(x) for x in keywords])

In [19]:
keywords

{'ablaze',
 'accident',
 'aftershock',
 'airplane accident',
 'ambulance',
 'annihilated',
 'annihilation',
 'apocalypse',
 'armageddon',
 'army',
 'arson',
 'arsonist',
 'attack',
 'attacked',
 'avalanche',
 'battle',
 'bioterror',
 'bioterrorism',
 'blaze',
 'blazing',
 'bleeding',
 'blew up',
 'blight',
 'blizzard',
 'blood',
 'bloody',
 'blown up',
 'body bag',
 'body bagging',
 'body bags',
 'bomb',
 'bombed',
 'bombing',
 'bridge collapse',
 'buildings burning',
 'buildings on fire',
 'burned',
 'burning',
 'burning buildings',
 'bush fires',
 'casualty',
 'catastrophe',
 'catastrophic',
 'chemical emergency',
 'cliff fall',
 'collapse',
 'collapsed',
 'collide',
 'collided',
 'collision',
 'crash',
 'crashed',
 'crush',
 'crushed',
 'curfew',
 'cyclone',
 'damage',
 'danger',
 'dead',
 'death',
 'debris',
 'deluge',
 'deluged',
 'demolish',
 'demolished',
 'demolition',
 'derail',
 'derailed',
 'derailment',
 'desolate',
 'desolation',
 'destroy',
 'destroyed',
 'destruction',
 

In [20]:
textprocessor2 = TextCounterTransformer()
text_processed = textprocessor2.fit_transform(tweets.text)

In [21]:
text_processed

array([Counter({'deed': 1, 'reason': 1, 'earthquak': 1, 'may': 1, 'allah': 1, 'forgiv': 1, 'us': 1}),
       Counter({'forest': 1, 'fire': 1, 'near': 1, 'la': 1, 'rong': 1, 'sask': 1, 'canada': 1}),
       Counter({'shelter': 2, 'place': 2, 'resid': 1, 'ask': 1, 'notifi': 1, 'offic': 1, 'evacu': 1, 'order': 1, 'expect': 1}),
       ...,
       Counter({'utc': 1, 'km': 1, 'volcano': 1, 'hawaii': 1, 'http': 1, 'co': 1, 'zdtoyd': 1, 'ebj': 1}),
       Counter({'e': 2, 'bike': 2, 'polic': 1, 'investig': 1, 'collid': 1, 'car': 1, 'littl': 1, 'portug': 1, 'rider': 1, 'suffer': 1, 'seriou': 1, 'non': 1, 'life': 1, 'threaten': 1, 'injuri': 1}),
       Counter({'latest': 1, 'home': 1, 'raze': 1, 'northern': 1, 'california': 1, 'wildfir': 1, 'abc': 1, 'news': 1, 'http': 1, 'co': 1, 'ymi': 1, 'rskq': 1})],
      dtype=object)

In [22]:
from scipy.sparse import csr_matrix

class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,vocabulary_size=2000):
        self.vocabulary_size = vocabulary_size
        
    def fit(self,X ,y=None):
        words_count = Counter()
        for word_count in X:
            for word,count in word_count.items():
                words_count[word]+=min(count,10)
        most_common = words_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [23]:
texttransformer = TextTransformer()
text_vec = texttransformer.fit_transform(text_processed)

In [24]:
text_vec

<7613x2001 sparse matrix of type '<class 'numpy.int64'>'
	with 65235 stored elements in Compressed Sparse Row format>

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, text_vec, tweets.target, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.708, total=   0.2s
[CV]  ................................................................
[CV] .................................... , score=0.670, total=   0.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] .................................... , score=0.708, total=   0.2s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s finished


0.6952597226920226

In [26]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [27]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("text_counter", textprocessor2),
    ("wordcount_to_vector", texttransformer),
])

test_transformed = preprocess_pipeline.transform(test.text)

In [28]:
log_clf.fit(text_vec,tweets.target)

LogisticRegression(max_iter=1000, random_state=42)

In [29]:
predictions = log_clf.predict(test_transformed)

In [30]:
output = pd.DataFrame({'id': (test.id).astype(int),
                       'target': predictions})
output.to_csv('submission.csv', index=False)