In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('omw-1.4')
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [2]:
# Locations of the training and test CSV files, relative to the notebook.
TRAIN_FP = '../input/nlp-getting-started/train.csv'
TEST_FP = '../input/nlp-getting-started/test.csv'

# Read both files into DataFrames in one pass.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_FP, TEST_FP))

In [3]:
# Class balance: how many training rows carry target == 1.
total_tweets = len(train)
disaster_tweets = int((train['target'] == 1).sum())
print(f'{disaster_tweets} disasters, {(disaster_tweets/total_tweets)*100:.2f}% of training set')

3271 disasters, 42.97% of training set


In [4]:
# Every row whose text occurs more than once (all copies kept, not just extras).
is_duplicated_text = train.duplicated(subset='text', keep=False)
train_duplicates = train.loc[is_duplicated_text, :]

# Distinct duplicated texts vs. total rows involved in duplication.
num_unique_duplicated_tweets = train_duplicates['text'].nunique(dropna=False)
num_total_duplicated_tweets = len(train_duplicates)

print(f'{num_unique_duplicated_tweets} tweets duplicated, {num_total_duplicated_tweets} total duplicates')

69 tweets duplicated, 179 total duplicates


In [5]:
# For the duplicated entries, if not marked the same => drop altogether.
# Texts whose copies all share one label keep their first occurrence only.
duplicated_tweets = train_duplicates.loc(axis=1)['text'].unique()

# Count distinct labels per duplicated text in one vectorised pass instead of
# growing a DataFrame with pd.concat inside a Python loop (quadratic, and the
# row -> DataFrame(...).T round trip turned every column into object dtype).
labels_per_text = train_duplicates.groupby('text')['target'].nunique(dropna=False)
consistent_texts = labels_per_text.index[labels_per_text == 1]

# First instance of each consistently-labelled duplicate, in order of first
# appearance (same order the original loop produced), with dtypes preserved.
train_first_of_duplicate = (
    train_duplicates
    .loc[train_duplicates['text'].isin(consistent_texts)]
    .drop_duplicates(subset='text', keep='first')
    .reset_index(drop=True)
)

# All of training set with none of the duplicates
train_unduplicated = train.drop_duplicates(subset='text', keep=False)

# All of training set with first instance of each duplicate included
train_all_no_duplicates = pd.concat(
    [train_first_of_duplicate, train_unduplicated],
    ignore_index=True,
)

In [6]:
# Sanity check: after de-duplication no text should appear more than once.
still_duplicated = train_all_no_duplicates.duplicated(subset='text', keep=False)
remaining_duplicates = train_all_no_duplicates.loc[still_duplicated, :]
print(f'Remaining duplicates in training set: {remaining_duplicates.shape[0]}')

Remaining duplicates in training set: 0


In [7]:
# Names used by the rest of the notebook: de-duplicated training data and the
# test data as loaded.
train_set, test_set = train_all_no_duplicates, test

### Pre-Processing

In [8]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Loop-invariant helpers, built once when the cell runs rather than on every
# call: this avoids calling stopwords.words('english') once per token and
# turns the stop-word membership test into a set lookup instead of a list scan.
_STOPWORDS = frozenset(stopwords.words('english'))
_URL_RE = re.compile(r'http\S+')
_NON_ALPHA_RE = re.compile('[^A-Za-z ]+')
_word_tokenizer = RegexpTokenizer(r'\w+')


def preprocess_text(sentence):
    """Normalise one tweet into a space-separated string of cleaned tokens.

    Steps, in order: cast to ``str`` and lowercase; remove the literal
    ``{html}``; strip anything starting with ``http`` up to the next
    whitespace; delete every character that is not an ASCII letter or a
    space; tokenize; drop tokens of length <= 2 and English stop words;
    Porter-stem and then WordNet-lemmatize each remaining token.

    Parameters
    ----------
    sentence : object
        Raw text; any value is accepted because it is passed through ``str``.

    Returns
    -------
    str
        The processed tokens joined by single spaces (may be empty).
    """
    text = str(sentence).lower().replace('{html}', "")
    text = _URL_RE.sub('', text)
    # NOTE: non-letter characters are deleted, not replaced by a space, so
    # words separated only by punctuation or a newline are merged together.
    text = _NON_ALPHA_RE.sub('', text)
    tokens = _word_tokenizer.tokenize(text)
    kept = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS]
    return " ".join(lemmatizer.lemmatize(stemmer.stem(w)) for w in kept)

def create_set(df, vocab=None):
    """Build a dense TF-IDF feature matrix (and labels, if present) from ``df``.

    Side effect: a ``clean_text`` column holding ``preprocess_text`` applied
    to ``df['text']`` is added to (or overwritten in) ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column; a ``target`` column is optional.
    vocab : iterable of str, optional
        Documents used to fit the TF-IDF vectorizer. When ``None`` the
        cleaned text of ``df`` is used. Pass the ``vocab`` returned for the
        training set so another frame gets the same feature columns.

    Returns
    -------
    X : pandas.DataFrame
        Dense TF-IDF features, one column per vocabulary term, indexed like
        ``df`` so that ``X`` and ``y`` stay aligned by label as well as by
        position.
    y : pandas.Series or None
        ``df['target']`` cast to int, or ``None`` when ``df`` has no
        ``target`` column.
    vocab : iterable of str
        The documents the vectorizer was fitted on.
    """
    df['clean_text'] = df['text'].map(preprocess_text)
    clean_text = df.loc(axis=1)['clean_text']

    if vocab is None:
        vocab = clean_text

    # The vectorizer is refitted on every call; fitting on the same vocab
    # yields the same feature columns.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(vocab)
    X_vec = vectorizer.transform(clean_text)
    X = pd.DataFrame(
        data=X_vec.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,  # keep X aligned with y when df has a non-default index
    )
    y = df.loc(axis=1)['target'].astype('int') if 'target' in df else None
    return X, y, vocab

In [9]:
# Vectorise the de-duplicated training data, then hold out 20% for validation.
# NOTE: create_set fits the TF-IDF vectorizer on all of train_set, so the
# validation rows contribute to the vocabulary/IDF weights before the split.
X, y, train_vocab = create_set(train_set)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Validation Model

In [10]:
# Fit on the training split (fit returns the estimator itself) and predict
# labels for the held-out validation split.
model_val = LogisticRegression().fit(X_train, y_train)
y_val_pred = model_val.predict(X_val)

In [11]:
# Use a real f-string: the previous form passed a one-element set literal to
# print, so the score was rendered wrapped in braces.
print(f'Validation accuracy: {accuracy_score(y_val, y_val_pred):.4f}')

Validation accuracy:  {0.7982631930527722}


### Test Model

In [12]:
# Final model trains on the full vectorised training set (this rebinds the
# X_train / y_train names that previously held the 80% split).
X_train = X
y_train = y

# Vectorise the test set with a vectorizer fitted on the training vocabulary
# so its feature columns match X_train.
X_test, _, _ = create_set(test, vocab=train_vocab)
test_ids = test['id']

In [13]:
# Train the final classifier, predict the test labels and write the
# two-column (id, target) submission file.
model_test = LogisticRegression().fit(X_train, y_train)
y_test_pred = pd.DataFrame(data=test_ids, columns=['id']).assign(
    target=model_test.predict(X_test)
)
y_test_pred.to_csv('./submission.csv', index=False)

In [14]:
# Show the last five rows of the submission frame as a quick visual check.
y_test_pred.tail(5)

Unnamed: 0,id,target
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
3262,10875,0
