## Imports

In [1]:
from IPython.display import display
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

import pandas as pd
import numpy as np
import re
import html
import string
from collections import Counter
from nltk import word_tokenize, FreqDist, bigrams
from nltk.corpus import stopwords
import category_encoders as ce
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from skopt import BayesSearchCV

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt_tab')

from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# from fuzzywuzzy import process
# from sklearn.preprocessing import MultiLabelBinarizer

# Data

In [2]:
df_train_raw = pd.read_csv('train.csv')
df_test_raw = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
df_train = df_train_raw.copy()
df_test = df_test_raw.copy()

In [4]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
df_train.shape


(7613, 5)

In [6]:
pd.DataFrame(df_train.isnull().sum())


Unnamed: 0,0
id,0
keyword,61
location,2533
text,0
target,0


In [7]:
df_train.duplicated().sum()


np.int64(0)

In [8]:
print(f'Disasters:\t{df_train[df_train.target==1].shape[0]} ({round(df_train[df_train.target==1].shape[0]/df_train.shape[0]*100,1)}%)')
print(f'Not disasters:\t{df_train[df_train.target==0].shape[0]} ({round(df_train[df_train.target==0].shape[0]/df_train.shape[0]*100,1)}%)')

Disasters:	3271 (43.0%)
Not disasters:	4342 (57.0%)


# Keywords

In [9]:
df_train['keyword'].nunique()


221

In [10]:
pd.DataFrame(df_train[df_train['target']==1][['keyword','target']].groupby('keyword').value_counts().sort_values(ascending=False).head(10))


Unnamed: 0_level_0,Unnamed: 1_level_0,count
keyword,target,Unnamed: 2_level_1
derailment,1,39
outbreak,1,39
wreckage,1,39
debris,1,37
oil%20spill,1,37
typhoon,1,37
rescuers,1,32
suicide%20bomb,1,32
suicide%20bombing,1,32
evacuated,1,32


In [11]:
pd.DataFrame(df_train[df_train['target']==0][['keyword','target']].groupby('keyword').value_counts().sort_values(ascending=False).head(10))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
keyword,target,Unnamed: 2_level_1
body%20bags,0,40
armageddon,0,37
harm,0,37
deluge,0,36
ruin,0,36
wrecked,0,36
explode,0,35
fear,0,35
twister,0,35
siren,0,35


In [12]:
df_train.groupby('keyword', as_index=False).agg(target_mean=('target','mean'), keyword_count=('target','size')).query('target_mean > 0.95').sort_values('target_mean', ascending=False).round(3)

Unnamed: 0,keyword,target_mean,keyword_count
62,debris,1.0,37
70,derailment,1.0,39
219,wreckage,1.0,39
153,outbreak,0.975,40
152,oil%20spill,0.974,38
205,typhoon,0.974,38
187,suicide%20bombing,0.97,33
186,suicide%20bomber,0.968,31


In [13]:
df_train.groupby('keyword', as_index=False).agg(target_mean=('target','mean'), keyword_count=('target','size')).query('target_mean <0.05').sort_values('target_mean', ascending=True).round(3)

Unnamed: 0,keyword,target_mean,keyword_count
2,aftershock,0.0,34
29,body%20bags,0.024,41
170,ruin,0.027,37
19,blazing,0.029,34
27,body%20bag,0.03,33
88,electrocute,0.031,32


# Locations

In [14]:
df_train['location'].nunique()


3341

In [15]:
pd.DataFrame(df_train[['location','target']].groupby('location').value_counts())
# grouped_counts = df_train[['location','target']].groupby('location').value_counts()
# grouped_counts[grouped_counts > 10].index.get_level_values('location').unique().tolist()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
location,target,Unnamed: 2_level_1
,1,1
Glasgow,0,1
"Melbourne, Australia",0,1
News,1,1
å_,0,1
...,...,...
å_: ?? ÌÑ ? : ?,0,1
å_å_Los Mina Cityã¢,0,1
å¡å¡Midwest Û¢Û¢,0,1
åÊ(?Û¢`?Û¢å«)??,0,1


In [16]:
pd.DataFrame(df_train[df_train['target']==1][['location','target']].groupby('location').value_counts().sort_values(ascending=False).head(10))


Unnamed: 0_level_0,Unnamed: 1_level_0,count
location,target,Unnamed: 2_level_1
USA,1,67
United States,1,27
Nigeria,1,22
India,1,20
Mumbai,1,19
UK,1,16
London,1,16
New York,1,16
"Washington, DC",1,15
Canada,1,13


In [17]:
pd.DataFrame(df_train[df_train['target']==0][['location','target']].groupby('location').value_counts().sort_values(ascending=False).head(10))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
location,target,Unnamed: 2_level_1
New York,0,55
USA,0,37
London,0,29
United States,0,23
"Los Angeles, CA",0,18
Canada,0,16
Kenya,0,15
Everywhere,0,12
UK,0,11
Florida,0,11


In [18]:
df_train.groupby('location', as_index=False).agg(target_mean=('target','mean'), keyword_count=('target','size')).query('target_mean > 0.95').sort_values('target_mean', ascending=False).round(3)

Unnamed: 0,location,target_mean,keyword_count
3340,åø\_(?)_/åø,1.0,1
0,,1.0,1
3,News,1.0,1
6,"616 Û¢ Kentwood , MI",1.0,1
7,? ??????? ? ( ?? å¡ ? ? ? å¡),1.0,1
...,...,...,...
65,#partsunknown,1.0,1
63,#keepthefaith J&J,1.0,1
62,#iminchina,1.0,1
61,#goingdownthetoilet Illinois,1.0,1


In [19]:
df_train.groupby('location', as_index=False).agg(target_mean=('target','mean'), keyword_count=('target','size')).query('target_mean < 0.05').sort_values('target_mean', ascending=False).round(3)

Unnamed: 0,location,target_mean,keyword_count
3339,åÊ(?Û¢`?Û¢å«)??,0.0,1
1,Glasgow,0.0,1
2,"Melbourne, Australia",0.0,1
4,å_,0.0,1
3315,Û¢OlderCandyBloomÛ¢,0.0,1
...,...,...,...
15,"Eugene, Oregon",0.0,1
13,Blood Indian Reserve,0.0,1
11,"BC, US, Asia or Europe.",0.0,1
10,Alex/Mika/Leo|18|he/she/they,0.0,1


In [20]:
df_train.groupby('location', as_index=False)['target'].mean().query('target > 0.75 & target < 1.0').sort_values('target', ascending=False)
# ['location'].unique().tolist()

Unnamed: 0,location,target
1719,Mumbai,0.863636
1262,India,0.833333
1934,"Oklahoma City, OK",0.833333
2028,"Paterson, New Jersey",0.833333
2032,Pedophile hunting ground,0.833333
1426,"Lagos, Nigeria",0.8
1636,"Melbourne, Australia",0.8
2124,Puerto Rico,0.8
2538,The Netherlands,0.8
2866,WorldWide,0.8


In [21]:
df_train.groupby('location', as_index=False)['target'].mean().query('target > 0.0 & target < 0.25').sort_values('target', ascending=True)
# ['location'].unique().tolist()

Unnamed: 0,location,target
1511,"London, England",0.1
3222,ss,0.1
578,"California, United States",0.166667
27,Road to the Billionaires Club,0.166667
1759,NYC,0.166667
525,"Brooklyn, NY",0.166667
495,Brasil,0.2
484,"Boston, MA",0.2
2255,"San Jose, CA",0.2
934,Everywhere,0.2


In [22]:
df_train.groupby('location', as_index=False).agg(target_mean=('target','mean'), keyword_count=('target','size')).query('keyword_count >= 5').sort_values('keyword_count', ascending=False).sort_values('target_mean', ascending=False).head(5).round(3)

Unnamed: 0,location,target_mean,keyword_count
1719,Mumbai,0.864,22
1262,India,0.833,24
2028,"Paterson, New Jersey",0.833,6
1934,"Oklahoma City, OK",0.833,6
2032,Pedophile hunting ground,0.833,6


In [23]:
df_train.groupby('location', as_index=False).agg(target_mean=('target','mean'), keyword_count=('target','size')).query('keyword_count >= 5').sort_values('keyword_count', ascending=False).sort_values('target_mean', ascending=True).head(5).round(3)

Unnamed: 0,location,target_mean,keyword_count
1708,"Morioh, Japan",0.0,6
2039,"Pennsylvania, USA",0.0,7
114,304,0.0,9
2851,Wisconsin,0.0,5
2503,"Texas, USA",0.0,5


In [24]:
# df_train['cleaned_location'] = df_train['location']
# df_train['cleaned_location'] = df_train['cleaned_location'].fillna('unknown')

# standard_locations = ['Canada','Florida','India','Kenya','London','Los Angeles, CA','Mumbai','New York','Nigeria','UK','USA','United States','Washington, DC','Oklahoma City, OK','Paterson, New Jersey','Lagos, Nigeria','Melbourne, Australia','Puerto Rico','The Netherlands','Nashville, TN','London, England','California, United States','NYC','Brooklyn, NY','Brasil','Boston, MA','San Jose, CA','New York, USA','New Jersey','Vancouver, BC','Manchester']

# def generate_location_mapping(train_locations, standard_locations):
#     location_mapping = {}
#     for loc in train_locations:
#         match, score = process.extractOne(loc, standard_locations)
#         location_mapping[loc] = match if score > 90 else loc
#     return location_mapping

# unique_train_locations = df_train['cleaned_location'].unique()
# location_mapping = generate_location_mapping(unique_train_locations, standard_locations)

# def clean_locations(location, mapping):
#     return mapping.get(location, location)

# df_train['cleaned_location'] = df_train['cleaned_location'].apply(lambda x: clean_locations(x, location_mapping))

# df_train[(df_train['cleaned_location'] != df_train['location']) & (df_train['cleaned_location'] != 'unknown')][['location','cleaned_location']]

# Text

In [25]:
nltkstopwords = stopwords.words('english')

### Clean and extract

In [26]:
def remove_newlines(text): return re.sub(r'\n', ' ', text).strip()

def fix_html_entities(text): return html.unescape(text)

def extract_elements(text, element_type):
    patterns = {  'hashtags': r'#[A-Za-z0-9-_]+',
                  'handles': r'@[A-Za-z0-9-_]+',
                  'urls': r'https?://t.co/[A-Za-z0-9]{10}'  }
    pattern = re.compile(patterns[element_type])
    elements = pattern.findall(text)
    n = len(elements)
    elements_str = ' '.join(elements)
    new_text = pattern.sub('', text)
    return new_text.strip(), elements_str, n

# def extract_hashtags(text):
#     pattern = re.compile(r'#[A-Za-z0-9-_]+')
#     hashtags = pattern.findall(text)
#     n = len(hashtags)
#     hashtags_str = ' '.join(hashtags)
#     new_text = pattern.sub('', text)
#     return new_text.strip(), hashtags_str, n

# def extract_handles(text):
#     pattern = re.compile(r'@[A-Za-z0-9-_]+')
#     handles = pattern.findall(text)
#     n = len(handles)
#     handles_str = ' '.join(handles)
#     new_text = pattern.sub('', text)
#     return new_text.strip(), handles_str, n

# def extract_urls(text):
#     pattern = re.compile(r'https?://t.co/[A-Za-z0-9]{10}')
#     urls = pattern.findall(text)
#     n = len(urls)
#     urls_str = ' '.join(urls)
#     new_text = pattern.sub('', text)
#     return new_text.strip(), urls_str, n

In [27]:
df_train['text_clean'] = df_train['text'].apply(lambda x: remove_newlines(x))
df_train['text_clean'] = df_train['text_clean'].apply(lambda x: fix_html_entities(x))
df_train[['text_clean', 'hashtags', 'n_hashtags']] = df_train['text_clean'].apply(lambda x: extract_elements(x,'hashtags')).apply(pd.Series)
df_train[['text_clean', 'handles', 'n_handles']] = df_train['text_clean'].apply(lambda x: extract_elements(x,'handles')).apply(pd.Series)
df_train[['text_clean', 'urls', 'n_urls']] = df_train['text_clean'].apply(lambda x: extract_elements(x,'urls')).apply(pd.Series)

In [28]:
df_train[(df_train['text_clean'] != df_train['text']) & (df_train['n_hashtags'] >= 2) & (df_train['n_handles'] >= 1) & (df_train['n_urls'] >= 1)][['text','text_clean','hashtags','n_hashtags','handles','n_handles','urls','n_urls']]

Unnamed: 0,text,text_clean,hashtags,n_hashtags,handles,n_handles,urls,n_urls
104,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/yN...,320 [IR] ICEMOON [AFTERSHOCK] | | | Û_,#Dubstep #TrapMusic #DnB #EDM #Dance #Ices,6,@djicemoon,1,http://t.co/yNXnvVKCDA http://t.co/weQPesENku,2
106,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/vA...,320 [IR] ICEMOON [AFTERSHOCK] | | | Û_,#Dubstep #TrapMusic #DnB #EDM #Dance #Ices,6,@djicemoon,1,http://t.co/vAM5POdGyw http://t.co/zEVakJaPcz,2
114,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/M4...,320 [IR] ICEMOON [AFTERSHOCK] | | | Û_,#Dubstep #TrapMusic #DnB #EDM #Dance #Ices,6,@djicemoon,1,http://t.co/M4JDZMGJoW http://t.co/n0uhAsfkBv,2
115,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/vA...,320 [IR] ICEMOON [AFTERSHOCK] | | | Û_,#Dubstep #TrapMusic #DnB #EDM #Dance #Ices,6,@djicemoon,1,http://t.co/vAM5POdGyw http://t.co/zEVakJaPcz,2
116,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/e1...,320 [IR] ICEMOON [AFTERSHOCK] | | | Û_,#Dubstep #TrapMusic #DnB #EDM #Dance #Ices,6,@djicemoon,1,http://t.co/e14EPzhotH http://t.co/22a9D5DO6q,2
...,...,...,...,...,...,...,...,...
6993,This is my jam: Riser by Dierks Bentley @1061T...,This is my jam: Riser by Dierks Bentley ?,#iHeartRadio #NowPlaying,2,@1061TheTwister,1,http://t.co/zQoScQD64h http://t.co/yLvVF139BB,2
7022,#breaking #news Global precipitation measureme...,Global precipitation measurement satellite cap...,#breaking #news,2,@NASAHurricane,1,http://t.co/20DNcthr4D,1
7279,Whirlwind Head Scissor on @alexhammerstone @kt...,Whirlwind Head Scissor on ktfounder Û_,#RemyMarcel #FroFroFro,2,@alexhammerstone @kttape,2,https://t.co/B19z8Vi3td,1
7406,Twilight's Encore (Wounded Hearts Book 3) by J...,Twilight's Encore (Wounded Hearts Book 3) by J...,#KindleCountdown #Sale #MFRWauthor #MGTAB,4,@amazon,1,http://t.co/ZnpTdIcQxE,1


In [29]:
# for element in ['hashtags', 'handles', 'urls']:
#     mlb = MultiLabelBinarizer()
#     one_hot = pd.DataFrame(mlb.fit_transform(df_train[element]), columns=mlb.classes_, index=df_train.index)
#     # df_train = pd.concat([df_train, one_hot], axis=1)
#     display(one_hot)

### Stats

In [30]:
def char_count(text): return len(text)

def word_count(text): return len(text.split())

def unique_word_count(text): return len(set(text.split()))

def avg_word_length(text): return round(sum(len(word) for word in text.split()) / len(text.split()),3)

def punctuation_count(text): return len([n for n in text if n in string.punctuation])

def stopwords_count(text): return len([n for n in text if n in nltkstopwords])

def caps_count(text): return sum([1 for n in text if n.isupper()])

def repeated_words(text):
    word_counts = Counter(text.split())
    return ' '.join([word for word, count in word_counts.items() if count > 1 and word.lower() not in nltkstopwords])

In [31]:
df_train['char_count'] = df_train['text_clean'].apply(lambda x: char_count(x))
df_train['word_count'] = df_train['text_clean'].apply(lambda x: word_count(x))
df_train['unique_word_count'] = df_train['text_clean'].apply(lambda x: unique_word_count(x))
df_train['avg_word_length'] = df_train['text_clean'].apply(lambda x: avg_word_length(x))
df_train['punctuation_count'] = df_train['text_clean'].apply(lambda x: punctuation_count(x))
df_train['stopwords_count'] = df_train['text_clean'].apply(lambda x: stopwords_count(x))
df_train['caps_count'] = df_train['text_clean'].apply(lambda x: caps_count(x))
df_train['repeated_words'] = df_train['text_clean'].apply(lambda x: repeated_words(x))

## Poly features

In [32]:
def poly_features(df, poly=None):
    cols = ['n_handles','n_hashtags','n_urls','char_count','word_count','unique_word_count','avg_word_length','punctuation_count','stopwords_count','caps_count']
    numerical_features = df[cols]
    if poly is None:
        poly = PolynomialFeatures(degree=2, include_bias=False)
        poly.fit(numerical_features)
    poly_features = poly.transform(numerical_features)
    poly_feature_names = poly.get_feature_names_out(numerical_features.columns)
    df_poly = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)
    return pd.concat([df, df_poly], axis=1), poly

df_train, poly = poly_features(df_train)

In [33]:
df_train.select_dtypes(include=['number']).drop('id', axis=1).corr()['target'].drop('target').sort_values(ascending=False).round(3)

n_urls stopwords_count         0.254
n_urls char_count              0.228
n_urls avg_word_length         0.212
n_urls unique_word_count       0.205
n_urls word_count              0.204
                               ...  
n_handles unique_word_count   -0.082
n_handles avg_word_length     -0.085
n_handles punctuation_count   -0.086
n_handles                     -0.103
n_handles                     -0.103
Name: target, Length: 75, dtype: float64

## Frequencies

In [34]:
full_clean_text = ' '.join(df_train['text_clean']).lower()
disaster_clean_text = ' '.join(df_train[df_train['target']==1]['text_clean']).lower()
notdisaster_clean_text = ' '.join(df_train[df_train['target']==0]['text_clean']).lower()

### Unigrams

In [35]:
all_tokens = [w for w in word_tokenize(full_clean_text) if (w not in nltkstopwords) & (w.isalpha())]
disaster_tokens = [w for w in word_tokenize(disaster_clean_text) if (w not in nltkstopwords) & (w.isalpha())]
notdisaster_tokens = [w for w in word_tokenize(notdisaster_clean_text) if (w not in nltkstopwords) & (w.isalpha())]

top_disaster_tokens = FreqDist(disaster_tokens).most_common(20)
top_notdisaster_tokens = FreqDist(notdisaster_tokens).most_common(20)
display(top_disaster_tokens)
display(top_notdisaster_tokens)

top_disaster_words = [w for w,f in top_notdisaster_tokens]
top_nondisaster_words = [w for w,f in top_disaster_tokens]
display(' '.join([w for w in top_disaster_words if w not in top_nondisaster_words]))
display(' '.join([w for w in top_nondisaster_words if w not in top_disaster_words]))

[('fire', 175),
 ('via', 121),
 ('disaster', 111),
 ('california', 107),
 ('police', 106),
 ('suicide', 104),
 ('people', 103),
 ('like', 93),
 ('killed', 92),
 ('storm', 85),
 ('crash', 83),
 ('news', 83),
 ('fires', 83),
 ('families', 81),
 ('train', 79),
 ('buildings', 75),
 ('bomb', 74),
 ('two', 72),
 ('emergency', 71),
 ('attack', 69)]

[('like', 253),
 ('new', 168),
 ('get', 162),
 ('one', 129),
 ('body', 110),
 ('would', 105),
 ('via', 97),
 ('video', 94),
 ('got', 92),
 ('people', 92),
 ('love', 86),
 ('know', 85),
 ('back', 83),
 ('time', 82),
 ('see', 82),
 ('full', 81),
 ('emergency', 79),
 ('day', 78),
 ('going', 75),
 ('ca', 74)]

'new get one body would video got love know back time see full day going ca'

'fire disaster california police suicide killed storm crash news fires families train buildings bomb two attack'

### Bigrams

In [36]:
all_bigrams = [' '.join(b) for b in list(bigrams(all_tokens))]
disaster_bigrams = [' '.join(b) for b in list(bigrams(disaster_tokens))]
nondisaster_bigrams = [' '.join(b) for b in list(bigrams(notdisaster_tokens))]

top_disaster_bigrams = FreqDist(disaster_bigrams).most_common(20)
top_nondisaster_bigrams = FreqDist(nondisaster_bigrams).most_common(20)
display(top_disaster_bigrams)
display(top_nondisaster_bigrams)

top_disaster_bigrams = [w for w,f in top_disaster_bigrams]
top_nondisaster_bigrams = [w for w,f in top_nondisaster_bigrams]
display(' | '.join([w for w in top_disaster_bigrams if w not in top_nondisaster_bigrams]))
display(' | '.join([w for w in top_nondisaster_bigrams if w not in top_disaster_bigrams]))

[('suicide bomber', 59),
 ('northern california', 41),
 ('oil spill', 38),
 ('burning buildings', 35),
 ('suicide bombing', 32),
 ('california wildfire', 32),
 ('bomber detonated', 30),
 ('homes razed', 29),
 ('latest homes', 28),
 ('razed northern', 28),
 ('pkk suicide', 28),
 ('detonated bomb', 28),
 ('old pkk', 27),
 ('debris found', 26),
 ('mass murder', 26),
 ('families sue', 26),
 ('sue legionnaires', 26),
 ('legionnaires families', 26),
 ('families affected', 26),
 ('affected fatal', 26)]

[('cross body', 38),
 ('liked video', 34),
 ('gon na', 32),
 ('wan na', 30),
 ('body bag', 26),
 ('body bagging', 23),
 ('burning buildings', 23),
 ('full read', 22),
 ('looks like', 21),
 ('feel like', 20),
 ('content policy', 20),
 ('body bags', 19),
 ('loud bang', 19),
 ('reddit quarantine', 19),
 ('quarantine offensive', 19),
 ('offensive content', 18),
 ('pick fan', 17),
 ('fan army', 17),
 ('fall cliff', 16),
 ('first responders', 16)]

'suicide bomber | northern california | oil spill | suicide bombing | california wildfire | bomber detonated | homes razed | latest homes | razed northern | pkk suicide | detonated bomb | old pkk | debris found | mass murder | families sue | sue legionnaires | legionnaires families | families affected | affected fatal'

'cross body | liked video | gon na | wan na | body bag | body bagging | full read | looks like | feel like | content policy | body bags | loud bang | reddit quarantine | quarantine offensive | offensive content | pick fan | fan army | fall cliff | first responders'

### URLs

In [37]:
df_train[['target','urls']].groupby('urls').sum().sort_values('target',ascending=False).head(10)

Unnamed: 0_level_0,target
urls,Unnamed: 1_level_1
,1100
http://t.co/KSAwlYuX02,6
http://t.co/LvlH3W3aWO http://t.co/vIwXY1XDYK,4
https://t.co/rqWuoy1fm4,3
http://t.co/V3aZWOAmzK,3
http://t.co/cybKsXHF7d,3
http://t.co/cEdCUgEuWs,3
http://t.co/po19h8YCND,2
http://t.co/EYSVvzA7Qm,2
http://t.co/zDtoyd8EbJ,2


## Category Encoding: category_encoders

In [38]:
features = ['keyword', 'location']
# features = ['keyword', 'cleaned_location']
encoder = ce.TargetEncoder(cols=features)
encoder.fit(df_train[features],df_train['target'])

df_train = df_train.join(encoder.transform(df_train[features]).add_suffix('_target'))

## Feature Extraction: CountVectorizer

In [39]:
vec_urls = CountVectorizer(min_df=5, analyzer='word', token_pattern=r'https?://t.co/[A-Za-z0-9]{10}')
urls_vec = vec_urls.fit_transform(df_train['urls'])
X_train_urls = pd.DataFrame(urls_vec.toarray(), columns=vec_urls.get_feature_names_out())

vec_handles = CountVectorizer(min_df=5)
handles_vec = vec_handles.fit_transform(df_train['handles'])
X_train_handles = pd.DataFrame(handles_vec.toarray(), columns=vec_handles.get_feature_names_out())

vec_hashtags = CountVectorizer(min_df=5)
hashtags_vec = vec_hashtags.fit_transform(df_train['hashtags'])
X_train_hashtags = pd.DataFrame(hashtags_vec.toarray(), columns=vec_hashtags.get_feature_names_out())

In [40]:
(X_train_urls.transpose().dot(df_train['target']) / X_train_urls.sum(axis=0)).sort_values(ascending=False)

http://t.co/vvplfqv58p    1.000000
http://t.co/ksawlyux02    1.000000
http://t.co/cybksxhf7d    0.600000
http://t.co/encmhz6y34    0.166667
http://t.co/q2eblokeve    0.166667
http://t.co/qew4c5m1xd    0.000000
dtype: float64

In [41]:
(X_train_handles.transpose().dot(df_train['target']) / X_train_handles.sum(axis=0)).sort_values(ascending=False)

ap               1.000000
usagov           1.000000
foxnews          0.888889
potus            0.666667
viralspell       0.600000
usatoday         0.444444
change           0.444444
youngheroesid    0.400000
youtube          0.216867
towel            0.166667
stretcher        0.166667
emmerdale        0.125000
mikeparractor    0.000000
arianagrande     0.000000
justinbieber     0.000000
invalid          0.000000
djicemoon        0.000000
dtype: float64

In [42]:
(X_train_hashtags.transpose().dot(df_train['target']) / X_train_hashtags.sum(axis=0)).sort_values(ascending=False)

abstorm       1.0
africa        1.0
antioch       1.0
hailstorm     1.0
india         1.0
             ... 
military      0.0
technology    0.0
summerfate    0.0
soundcloud    0.0
trapmusic     0.0
Length: 106, dtype: float64

## Feature Extraction: TfidfVectorizer

In [43]:
vec_text = TfidfVectorizer(min_df=10, ngram_range=(1,10), stop_words='english') 
text_vec = vec_text.fit_transform(df_train['text_clean'])
X_train_text = pd.DataFrame(text_vec.toarray(), columns=vec_text.get_feature_names_out())

In [44]:
df_train = df_train.join(X_train_urls, rsuffix='_urls')
df_train = df_train.join(X_train_handles, rsuffix='_handles')
df_train = df_train.join(X_train_hashtags, rsuffix='_hashtags')
df_train = df_train.join(X_train_text, rsuffix='_text')

# Modelling

### Init

In [45]:
lr = LogisticRegression(random_state=42, solver='liblinear')

### Prep test data, X, y

In [46]:
# df_test['cleaned_location'] = df_test['location']
# df_test['cleaned_location'] = df_test['cleaned_location'].fillna('unknown')
# df_test['cleaned_location'] = df_test['cleaned_location'].apply(lambda x: clean_locations(x, location_mapping))

df_test['text_clean'] = df_test['text'].apply(lambda x: remove_newlines(x))
df_test['text_clean'] = df_test['text_clean'].apply(lambda x: fix_html_entities(x))
df_test[['text_clean', 'hashtags', 'n_hashtags']] = df_test['text_clean'].apply(lambda x: extract_elements(x,'hashtags')).apply(pd.Series)
df_test[['text_clean', 'handles', 'n_handles']] = df_test['text_clean'].apply(lambda x: extract_elements(x,'handles')).apply(pd.Series)
df_test[['text_clean', 'urls', 'n_urls']] = df_test['text_clean'].apply(lambda x: extract_elements(x,'urls')).apply(pd.Series)
df_test['char_count'] = df_test['text_clean'].apply(lambda x: char_count(x))
df_test['word_count'] = df_test['text_clean'].apply(lambda x: word_count(x))
df_test['unique_word_count'] = df_test['text_clean'].apply(lambda x: unique_word_count(x))
df_test['avg_word_length'] = df_test['text_clean'].apply(lambda x: avg_word_length(x))
df_test['punctuation_count'] = df_test['text_clean'].apply(lambda x: punctuation_count(x))
df_test['stopwords_count'] = df_test['text_clean'].apply(lambda x: stopwords_count(x))
df_test['caps_count'] = df_test['text_clean'].apply(lambda x: caps_count(x))
df_test['repeated_words'] = df_test['text_clean'].apply(lambda x: repeated_words(x))

df_test, _ = poly_features(df_test, poly=poly)

df_test = df_test.join(encoder.transform(df_test[features]).add_suffix('_target'))

urls_vec_test = vec_urls.transform(df_test['urls'])
X_test_urls = pd.DataFrame(urls_vec_test.toarray(), columns=vec_urls.get_feature_names_out())
handles_vec_test = vec_handles.transform(df_test['handles'])
X_test_handles = pd.DataFrame(handles_vec_test.toarray(), columns=vec_handles.get_feature_names_out())
hashtags_vec_test = vec_hashtags.transform(df_test['hashtags'])
X_test_hashtags = pd.DataFrame(hashtags_vec_test.toarray(), columns=vec_hashtags.get_feature_names_out())
text_vec_test = vec_text.transform(df_test['text_clean'])
X_test_text = pd.DataFrame(text_vec_test.toarray(), columns=vec_text.get_feature_names_out())

df_test = df_test.join(X_test_urls, rsuffix='_urls')
df_test = df_test.join(X_test_handles, rsuffix='_handles')
df_test = df_test.join(X_test_hashtags, rsuffix='_hashtags')
df_test = df_test.join(X_test_text, rsuffix='_text')

In [47]:
features_to_drop = df_train.select_dtypes(exclude=['number']).columns.to_list()
features_to_drop.extend(['id'])

In [48]:
X_train = df_train.drop(columns=features_to_drop+['target'])
X_test = df_test.drop(columns=features_to_drop)
y_train = df_train['target']

In [49]:
lr.fit(X_train, y_train)
y_test = lr.predict(X_test)
print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train)),4)}')
print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train, y_train, cv=5, scoring='f1').mean(),4)}')

Training f-1 score:		0.6844
Cross-validated f-1 score:	0.6743


### SMOTE (pre-scale)

In [50]:
print(f'Positives: {df_train[df_train.target==1].shape[0]} ({round(df_train[df_train.target==1].shape[0]/df_train.shape[0]*100,1)}%)')
print(f'Negatives: {df_train[df_train.target==0].shape[0]} ({round(df_train[df_train.target==0].shape[0]/df_train.shape[0]*100,1)}%)')
print(f'X number of rows: {X_train.shape[0]}')
print()

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print(f'Positives: {int(y_train[y_train==1].count())} ({round(int(y_train[y_train==1].count())/int(y_train.count())*100,1)}%)')
print(f'Negatives: {int(y_train[y_train==0].count())} ({round(int(y_train[y_train==0].count())/int(y_train.count())*100,1)}%)')
print(f'X number of rows: {y_train.shape[0]}')

Positives: 3271 (43.0%)
Negatives: 4342 (57.0%)
X number of rows: 7613

Positives: 4342 (50.0%)
Negatives: 4342 (50.0%)
X number of rows: 8684


In [51]:
# lr.fit(X_train, y_train)
# y_test = lr.predict(X_test)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train, y_train, cv=5, scoring='f1').mean(),4)}')

Training f-1 score:		0.7427
Cross-validated f-1 score:	0.732


### Scale

In [52]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [53]:
# lr.fit(X_train_scaled, y_train)
# y_test = lr.predict(X_test_scaled)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

### Logistic Regressor

In [54]:
# search_spaces = [{'solver':['liblinear'], 'penalty':['l1','l2'], 'C':(1e-4, 1e4, 'log-uniform')}]
# bayessearch_lr = BayesSearchCV(LogisticRegression(random_state=42), search_spaces=search_spaces, n_iter=100, scoring='f1', cv=5, n_jobs=-1)
# bayessearch_lr.fit(X_train_scaled, y_train)
# print("Best score:", bayessearch_lr.best_score_)
# print("Best parameters:", bayessearch_lr.best_params_)

In [55]:
# param_grid = [{'solver':['liblinear'], 'penalty':['l1','l2'], 'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}]
# gridsearch_lr = GridSearchCV(LogisticRegression(random_state=42, max_iter=100), param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)
# gridsearch_lr.fit(X_train_scaled, y_train)
# print("Best score:", gridsearch_lr.best_score_)
# print("Best parameters:", gridsearch_lr.best_params_)

In [56]:
# lr = bayessearch_lr.best_estimator_
lr = LogisticRegression(random_state=42, C=0.14421478790765738, penalty='l1', solver='liblinear')
# lr = gridsearch_lr.best_estimator_
# lr = LogisticRegression(random_state=42, C=0.1, penalty='l1', solver='liblinear')

In [57]:
# lr.fit(X_train_scaled, y_train)
# y_test = lr.predict(X_test_scaled)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

### Feature Selection: Select K Best, Variance Threshold

In [59]:
# selector_pipeline = Pipeline([('select',SelectKBest(score_func=chi2)), ('clf',lr)])
# bayes_search_selector = BayesSearchCV(estimator=selector_pipeline, search_spaces={'select__k':(1,X_train_scaled.shape[1])}, n_iter=50, scoring='f1', cv=5, verbose=0, n_jobs=-1)
# bayes_search_selector.fit(X_train_scaled, y_train)
# print("Best k:", bayes_search_selector.best_params_['select__k'])
# print("Best F1 score:", bayes_search_selector.best_score_)
# selector = bayes_search_selector.best_estimator_[0]
selector = SelectKBest(score_func=chi2, k=500)
X_train_scaled = selector.fit_transform(X_train_scaled, y_train)
X_test_scaled = selector.transform(X_test_scaled)

In [60]:
# from sklearn.feature_selection import VarianceThreshold
# selector = VarianceThreshold(threshold=0.01)
# X_train = selector.fit_transform(X_train, y_train)
# X_test = selector.transform(X_test)

In [61]:
# lr.fit(X_train_scaled, y_train)
# y_test = lr.predict(X_test_scaled)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

### Feature Selection: RFECV

In [64]:
rfecv = RFECV(estimator=lr, step=1000, cv=2, scoring='f1')
rfecv.fit(X_train_scaled, y_train)

# plt.figure(figsize=(12,6))
# plt.xlabel("Number of features selected")
# plt.ylabel("Number of correct classifications)")
# plt.plot(rfecv.cv_results_['n_features'], rfecv.cv_results_['mean_test_score'])
# plt.show()

print("Optimal number of features:", rfecv.n_features_)
# rfecv_features = rfecv.support_
# print("Selected features:", rfecv_features)
# print("Selected features:", X_train.columns[rfecv_features])
# # print("Feature rankings:", rfecv.ranking_)

X_train_scaled = rfecv.transform(X_train_scaled)
X_test_scaled = rfecv.transform(X_test_scaled)

Optimal number of features: 500


In [65]:
# lr.fit(X_train_scaled, y_train)
# y_test = lr.predict(X_test_scaled)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

### Alternative: SMOTE (post-scale)

In [66]:
# print(f'Positives: {df_train[df_train.target==1].shape[0]} ({round(df_train[df_train.target==1].shape[0]/df_train.shape[0]*100,1)}%)')
# print(f'Negatives: {df_train[df_train.target==0].shape[0]} ({round(df_train[df_train.target==0].shape[0]/df_train.shape[0]*100,1)}%)')
# print(f'X number of rows: {X_train.shape[0]}')
# print()

# smote = SMOTE(random_state=42)
# X_train_scaled, y_train = smote.fit_resample(X_train_scaled, y_train)

# print(f'Positives: {int(y_train[y_train==1].count())} ({round(int(y_train[y_train==1].count())/int(y_train.count())*100,1)}%)')
# print(f'Negatives: {int(y_train[y_train==0].count())} ({round(int(y_train[y_train==0].count())/int(y_train.count())*100,1)}%)')
# print(f'X number of rows: {y_train.shape[0]}')

In [67]:
# lr.fit(X_train_scaled, y_train)
# y_test = lr.predict(X_test_scaled)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

### Final Fit and Predict

In [68]:
lr.fit(X_train_scaled, y_train)
y_test = lr.predict(X_test_scaled)

In [69]:
print(f'Training f-1 score:\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
print(f'Training accuracy:\t{round(lr.score(X_train_scaled, y_train),4)}')
# cm = confusion_matrix(y_train, lr.predict(X_train_scaled))
# display(pd.DataFrame(cm,index=['Actual Negative', 'Actual Positive'],columns=['Predicted Negative', 'Predicted Positive']))
# display(pd.DataFrame((cm/cm.sum()*100).round(1),index=['Actual Negative (%)', 'Actual Positive (%)'],columns=['Predicted Negative (%)', 'Predicted Positive (%)']))

Training f-1 score:	0.864
Training accuracy:	0.8641


In [70]:
# from sklearn.model_selection import StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

Cross-validated f-1 score:	0.8613


- 0.8616 : smote scale lr (0.14421478790765738) rfecv (6!)
- 0.8608: smote scale lr (0.14421478790765738) skb (2544)
- 0.8607: smote scale lr (0.14421478790765738) skb (2464) rfecv (1014)
- 0.8598: scale lr (0.3059596085116492) skb (2310) rfecv (460) smote
- 0.8306 : defaults

## Pipeline

In [71]:
# pipeline = Pipeline([
#     ('smote', SMOTE(random_state=42)),
#     ('scaler', MinMaxScaler()),
#     ('feature_selection', SelectKBest(score_func=chi2)),
#     ('clf', LogisticRegression(random_state=42))
# ])

# param_grid = {
#     'feature_selection__k': (1, X_train.shape[1]),
#     'clf__solver': ['liblinear'],
#     'clf__penalty': ['l1', 'l2'],
#     'clf__C': (1e-4, 1e4, 'log-uniform')
# }

# bayes_search = BayesSearchCV(
#     estimator=pipeline,
#     search_spaces=param_grid,
#     n_iter=100,
#     scoring='f1',
#     cv=10,
#     n_jobs=-1,
#     verbose=0
# )

# bayes_search.fit(X_train, y_train)
# best_pipeline = bayes_search.best_estimator_

# rfecv = RFECV(estimator=best_pipeline.named_steps['clf'], step=5, cv=10, scoring='f1')
# X_train_rfecv = rfecv.fit_transform(best_pipeline[:-1].fit_transform(X_train, y_train), y_train)
# X_test_rfecv = rfecv.transform(best_pipeline[:-1].transform(X_test))

# best_pipeline.named_steps['clf'].fit(X_train_rfecv, y_train)
# y_test = best_pipeline.named_steps['clf'].predict(X_test_rfecv)
# print(f'Training f-1 score:\t\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Cross-validated f-1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=10, scoring='f1').mean(),4)}')

# Submission

In [72]:
# submission['target'] = y_test
# print(submission.shape)
# submission.to_csv('submission_jg_8613.csv', index=False)