## Imports

In [None]:
from IPython.display import display
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

import pandas as pd
import numpy as np
import re
import html
import string
from collections import Counter
from itertools import chain, combinations
from nltk import word_tokenize, FreqDist, bigrams
from nltk.corpus import stopwords
from category_encoders import TargetEncoder as ce_TargetEncoder
from sklearn.preprocessing import TargetEncoder as skl_TargetEncoder
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from skopt import BayesSearchCV

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt_tab')

from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# from fuzzywuzzy import process
# from sklearn.preprocessing import MultiLabelBinarizer

# Data exploration and cleaning

In [2]:
# Read the three competition CSVs from the working directory.
df_train, df_test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Display the training frame as the cell output.
df_train

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Missing values per column, shown as a one-column table.
df_train.isna().sum().to_frame()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_train.duplicated().sum()

In [None]:
# Class balance of the training labels (count and share of all rows).
n_rows = df_train.shape[0]
n_disaster = df_train[df_train.target==1].shape[0]
n_not_disaster = df_train[df_train.target==0].shape[0]
print(f'Disasters:\t{n_disaster} ({round(n_disaster/n_rows*100,1)}%)')
print(f'Not disasters:\t{n_not_disaster} ({round(n_not_disaster/n_rows*100,1)}%)')

## Keywords

In [None]:
# Number of distinct (non-null) keywords.
df_train['keyword'].nunique()

In [None]:
# Ten keywords attached to the most disaster tweets (target == 1).
(
    df_train.loc[df_train['target'] == 1, ['keyword', 'target']]
    .groupby('keyword')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Ten keywords attached to the most non-disaster tweets (target == 0).
(
    df_train.loc[df_train['target'] == 0, ['keyword', 'target']]
    .groupby('keyword')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Keywords whose tweets are almost always disasters (mean target above 0.95).
(
    df_train
    .groupby('keyword', as_index=False)
    .agg(target_mean=('target', 'mean'), keyword_count=('target', 'size'))
    .query('target_mean > 0.95')
    .sort_values('target_mean', ascending=False)
    .round(3)
)

In [None]:
# Keywords whose tweets are almost never disasters (mean target below 0.05).
(
    df_train
    .groupby('keyword', as_index=False)
    .agg(target_mean=('target', 'mean'), keyword_count=('target', 'size'))
    .query('target_mean <0.05')
    .sort_values('target_mean', ascending=True)
    .round(3)
)

## Locations

In [None]:
# Number of distinct (non-null) free-text locations.
df_train['location'].nunique()

In [None]:
# Tweet counts per (location, target) pair.
(
    df_train[['location', 'target']]
    .groupby('location')
    .value_counts()
    .to_frame()
)
# grouped_counts = df_train[['location','target']].groupby('location').value_counts()
# grouped_counts[grouped_counts > 10].index.get_level_values('location').unique().tolist()

In [None]:
# Ten locations with the most disaster tweets (target == 1).
(
    df_train.loc[df_train['target'] == 1, ['location', 'target']]
    .groupby('location')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Ten locations with the most non-disaster tweets (target == 0).
(
    df_train.loc[df_train['target'] == 0, ['location', 'target']]
    .groupby('location')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Locations whose tweets are almost always disasters (mean target above 0.95).
# The size column counts tweets per location, so it is labelled location_count.
(
    df_train
    .groupby('location', as_index=False)
    .agg(target_mean=('target', 'mean'), location_count=('target', 'size'))
    .query('target_mean > 0.95')
    .sort_values('target_mean', ascending=False)
    .round(3)
)

In [None]:
# Locations whose tweets are almost never disasters (mean target below 0.05).
# The size column counts tweets per location, so it is labelled location_count.
(
    df_train
    .groupby('location', as_index=False)
    .agg(target_mean=('target', 'mean'), location_count=('target', 'size'))
    .query('target_mean < 0.05')
    .sort_values('target_mean', ascending=False)
    .round(3)
)

In [None]:
# Locations that lean strongly, but not exclusively, towards disasters.
(
    df_train
    .groupby('location', as_index=False)['target']
    .mean()
    .query('target > 0.75 & target < 1.0')
    .sort_values('target', ascending=False)
)
# ['location'].unique().tolist()

In [None]:
# Locations that lean strongly, but not exclusively, towards non-disasters.
(
    df_train
    .groupby('location', as_index=False)['target']
    .mean()
    .query('target > 0.0 & target < 0.25')
    .sort_values('target', ascending=True)
)
# ['location'].unique().tolist()

In [None]:
# Among locations with at least five tweets, the five with the highest disaster rate.
# The size column counts tweets per location, so it is labelled location_count.
(
    df_train
    .groupby('location', as_index=False)
    .agg(target_mean=('target', 'mean'), location_count=('target', 'size'))
    .query('location_count >= 5')
    .sort_values(['target_mean', 'location_count'], ascending=[False, False])
    .head(5)
    .round(3)
)

In [None]:
# Among locations with at least five tweets, the five with the lowest disaster rate.
# The size column counts tweets per location, so it is labelled location_count.
(
    df_train
    .groupby('location', as_index=False)
    .agg(target_mean=('target', 'mean'), location_count=('target', 'size'))
    .query('location_count >= 5')
    .sort_values(['target_mean', 'location_count'], ascending=[True, False])
    .head(5)
    .round(3)
)

In [23]:
# df_train['cleaned_location'] = df_train['location']
# df_train['cleaned_location'] = df_train['cleaned_location'].fillna('unknown')

# standard_locations = ['Canada','Florida','India','Kenya','London','Los Angeles, CA','Mumbai','New York','Nigeria','UK','USA','United States','Washington, DC','Oklahoma City, OK','Paterson, New Jersey','Lagos, Nigeria','Melbourne, Australia','Puerto Rico','The Netherlands','Nashville, TN','London, England','California, United States','NYC','Brooklyn, NY','Brasil','Boston, MA','San Jose, CA','New York, USA','New Jersey','Vancouver, BC','Manchester']

# def generate_location_mapping(train_locations, standard_locations):
#     location_mapping = {}
#     for loc in train_locations:
#         match, score = process.extractOne(loc, standard_locations)
#         location_mapping[loc] = match if score > 90 else loc
#     return location_mapping

# unique_train_locations = df_train['cleaned_location'].unique()
# location_mapping = generate_location_mapping(unique_train_locations, standard_locations)

# def clean_locations(location, mapping):
#     return mapping.get(location, location)

# df_train['cleaned_location'] = df_train['cleaned_location'].apply(lambda x: clean_locations(x, location_mapping))

# df_train[(df_train['cleaned_location'] != df_train['location']) & (df_train['cleaned_location'] != 'unknown')][['location','cleaned_location']]

## Text

### Clean

In [24]:
def remove_newlines(text):
    """Replace every newline with a space and trim surrounding whitespace."""
    single_line = re.sub(r'\n', ' ', text)
    return single_line.strip()

def fix_html_entities(text):
    """Decode HTML character references (e.g. ``&amp;``) into plain characters."""
    decoded = html.unescape(text)
    return decoded

def extract_elements(text, element_type):
    """Pull hashtags, handles or t.co URLs out of a tweet.

    Parameters
    ----------
    text : str
        Tweet text to scan.
    element_type : str
        One of ``'hashtags'``, ``'handles'`` or ``'urls'``; any other value
        raises ``KeyError``.

    Returns
    -------
    tuple of (str, str, int)
        The text with every match removed (then stripped), the matches
        lower-cased and joined with single spaces, and the number of matches.
        Removing a match leaves its surrounding whitespace in place.
    """
    # The hyphen sits last in each character class so it is unambiguously a
    # literal; the dot in "t.co" is escaped so it cannot match other characters.
    patterns = {  'hashtags': r'#[A-Za-z0-9_-]+',
                  'handles': r'@[A-Za-z0-9_-]+',
                  'urls': r'https?://t\.co/[A-Za-z0-9]{10}'  }
    pattern = re.compile(patterns[element_type])
    elements = pattern.findall(text)
    n = len(elements)
    elements_str = ' '.join(elements).lower()
    new_text = pattern.sub('', text)
    return new_text.strip(), elements_str, n

In [25]:
# Normalise the raw tweet text, then peel off hashtags, handles and URLs in turn.
# Each pass rewrites text_clean and adds the joined matches plus their count.
df_train['text_clean'] = df_train['text'].apply(remove_newlines).apply(fix_html_entities)
for kind in ('hashtags', 'handles', 'urls'):
    extracted = df_train['text_clean'].apply(extract_elements, args=(kind,))
    df_train[['text_clean', kind, f'n_{kind}']] = extracted.apply(pd.Series)

In [None]:
# Spot-check rows where cleaning changed the text and all three element types were found.
df_train.loc[
    (df_train['text_clean'] != df_train['text'])
    & (df_train['n_hashtags'] >= 2)
    & (df_train['n_handles'] >= 1)
    & (df_train['n_urls'] >= 1),
    ['text', 'text_clean', 'hashtags', 'n_hashtags', 'handles', 'n_handles', 'urls', 'n_urls'],
]

In [27]:
# for element in ['hashtags', 'handles', 'urls']:
#     mlb = MultiLabelBinarizer()
#     one_hot = pd.DataFrame(mlb.fit_transform(df_train[element]), columns=mlb.classes_, index=df_train.index)
#     # df_train = pd.concat([df_train, one_hot], axis=1)
#     display(one_hot)

### Frequencies

In [None]:
# Size of the lower-cased, whitespace-split vocabulary of the cleaned text.
vocabulary = set(' '.join(df_train['text_clean']).lower().split())
len(vocabulary)

In [29]:
# One lower-cased string per class: every cleaned tweet joined by a single space.
disaster_clean_text = ' '.join(df_train.loc[df_train['target'] == 1, 'text_clean']).lower()
notdisaster_clean_text = ' '.join(df_train.loc[df_train['target'] == 0, 'text_clean']).lower()

### Unigrams

In [None]:
# English stop words; kept as a list under this name because later cells use it.
nltkstopwords = stopwords.words('english')
# Set copy for constant-time membership tests over the whole corpus.
stopword_set = set(nltkstopwords)

# Alphabetic, non-stop-word tokens per class.
disaster_tokens = [w for w in word_tokenize(disaster_clean_text) if w.isalpha() and w not in stopword_set]
notdisaster_tokens = [w for w in word_tokenize(notdisaster_clean_text) if w.isalpha() and w not in stopword_set]

# Twenty most frequent tokens per class; the tables show the first ten.
top_disaster_tokens = FreqDist(disaster_tokens).most_common(20)
top_notdisaster_tokens = FreqDist(notdisaster_tokens).most_common(20)
display(pd.DataFrame(top_disaster_tokens, columns=['Disaster Token', 'Frequency']).head(10))
display(pd.DataFrame(top_notdisaster_tokens, columns=['Non-Disaster Token', 'Frequency']).head(10))

# Tokens that are in one class's top twenty but not the other's.
top_disaster_words = [w for w,f in top_disaster_tokens]
top_nondisaster_words = [w for w,f in top_notdisaster_tokens]
display(', '.join([w for w in top_disaster_words if w not in top_nondisaster_words]))
display(', '.join([w for w in top_nondisaster_words if w not in top_disaster_words]))

### Bigrams

In [None]:
# Adjacent-token pairs per class, rendered as "word1 word2".
# NOTE(review): the token lists come from one joined string per class, so pairs
# can presumably straddle two different tweets - confirm whether that is intended.
disaster_bigrams = list(map(' '.join, bigrams(disaster_tokens)))
nondisaster_bigrams = list(map(' '.join, bigrams(notdisaster_tokens)))

# Twenty most frequent pairs per class; the tables show the first ten.
disaster_bigram_freqs = FreqDist(disaster_bigrams).most_common(20)
nondisaster_bigram_freqs = FreqDist(nondisaster_bigrams).most_common(20)
display(pd.DataFrame(disaster_bigram_freqs, columns=['Disaster Token', 'Frequency']).head(10))
display(pd.DataFrame(nondisaster_bigram_freqs, columns=['Non-Disaster Token', 'Frequency']).head(10))

# Keep only the pair strings, then show those unique to each class's top twenty.
top_disaster_bigrams = [pair for pair, _ in disaster_bigram_freqs]
top_nondisaster_bigrams = [pair for pair, _ in nondisaster_bigram_freqs]
display(' | '.join([pair for pair in top_disaster_bigrams if pair not in top_nondisaster_bigrams]))
display(' | '.join([pair for pair in top_nondisaster_bigrams if pair not in top_disaster_bigrams]))

### Hashtags

In [None]:
# Number of distinct hashtags across all tweets.
unique_hashtags = set(' '.join(df_train['hashtags']).split())
len(unique_hashtags)

In [None]:
# Ten most common hashtag strings among disaster tweets (target == 1).
(
    df_train.loc[df_train['target'] == 1, ['hashtags', 'target']]
    .groupby('hashtags')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Ten most common hashtag strings among non-disaster tweets (target == 0).
(
    df_train.loc[df_train['target'] == 0, ['hashtags', 'target']]
    .groupby('hashtags')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

### Handles

In [None]:
# Number of distinct handles across all tweets.
unique_handles = set(' '.join(df_train['handles']).split())
len(unique_handles)

In [None]:
# Ten most common handle strings among disaster tweets (target == 1).
(
    df_train.loc[df_train['target'] == 1, ['handles', 'target']]
    .groupby('handles')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Ten most common handle strings among non-disaster tweets (target == 0).
(
    df_train.loc[df_train['target'] == 0, ['handles', 'target']]
    .groupby('handles')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

### URLs

In [None]:
# Number of distinct URLs across all tweets.
unique_urls = set(' '.join(df_train['urls']).split())
len(unique_urls)

In [None]:
# Ten most common URL strings among disaster tweets (target == 1).
(
    df_train.loc[df_train['target'] == 1, ['urls', 'target']]
    .groupby('urls')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
# Ten most common URL strings among non-disaster tweets (target == 0).
(
    df_train.loc[df_train['target'] == 0, ['urls', 'target']]
    .groupby('urls')
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

# Feature engineering

## Stats

In [41]:
def char_count(text):
    """Return the number of characters in ``text``."""
    n_chars = len(text)
    return n_chars

def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def unique_word_count(text):
    """Return the number of distinct whitespace-separated tokens (case-sensitive)."""
    distinct_tokens = frozenset(text.split())
    return len(distinct_tokens)

def avg_word_length(text):
    """Return the mean length of the whitespace-separated tokens, rounded to 3 places.

    Returns 0.0 when ``text`` contains no tokens (empty or whitespace-only),
    which can happen once hashtags, handles and URLs have been stripped from a
    tweet that contained nothing else.
    """
    words = text.split()
    if not words:
        # Avoid dividing by zero for text with no tokens.
        return 0.0
    return round(sum(len(word) for word in words) / len(words), 3)

def punctuation_count(text):
    """Return how many characters of ``text`` are in ``string.punctuation``."""
    return sum(1 for ch in text if ch in string.punctuation)

def stopwords_count(text, stop_words=None):
    """Return how many whitespace-separated tokens of ``text`` are stop words.

    The comparison is case-insensitive and word-based: the text is lower-cased
    and split on whitespace, so single letters inside longer words are never
    counted as the stop words "a", "i", "s", "t", ...

    Parameters
    ----------
    text : str
        Text to scan.
    stop_words : collection of str, optional
        Lower-case stop words to match against. Defaults to the notebook-level
        ``nltkstopwords`` list.
    """
    if stop_words is None:
        stop_words = nltkstopwords
    return sum(1 for word in text.lower().split() if word in stop_words)

def caps_count(text):
    """Return the number of upper-case characters in ``text``."""
    return sum(map(str.isupper, text))

def repeated_words(text):
    """Return the tokens occurring more than once, joined by single spaces.

    Tokens are whitespace-separated and counted case-sensitively; a token is
    dropped when its lower-cased form is in ``nltkstopwords``. Order follows
    each token's first appearance in ``text``.
    """
    repeated = []
    for word, count in Counter(text.split()).items():
        if count <= 1:
            continue
        if word.lower() in nltkstopwords:
            continue
        repeated.append(word)
    return ' '.join(repeated)

In [42]:
# One numeric column per text statistic, each computed from the cleaned text.
stat_functions = {
    'char_count': char_count,
    'word_count': word_count,
    'unique_word_count': unique_word_count,
    'avg_word_length': avg_word_length,
    'punctuation_count': punctuation_count,
    'stopwords_count': stopwords_count,
    'caps_count': caps_count,
}
for stat_name, stat_function in stat_functions.items():
    df_train[stat_name] = df_train['text_clean'].apply(stat_function)
# df_train['repeated_words'] = df_train['text_clean'].apply(lambda x: repeated_words(x))

In [None]:
# Show the text-statistic columns side by side.
stat_columns = [
    'char_count', 'word_count', 'unique_word_count', 'avg_word_length',
    'punctuation_count', 'stopwords_count', 'caps_count',
]
df_train[stat_columns]

## Polynomial Features

In [44]:
def poly_features(df, poly=None):
    """Append degree-2 polynomial and interaction terms of the numeric text features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ten count/statistic columns listed in ``cols``.
    poly : fitted transformer, optional
        Object exposing ``transform`` and ``get_feature_names_out``. When
        omitted, a ``PolynomialFeatures(degree=2, include_bias=False)`` is
        fitted on ``df``; pass the fitted instance back in to transform
        another frame with the same expansion.

    Returns
    -------
    tuple of (pandas.DataFrame, transformer)
        ``df`` with the new squared/interaction columns appended (the original
        degree-1 columns are not repeated), and the transformer that was used.
        Columns already present under a generated name are replaced, so calling
        the function again on its own output does not duplicate them.
    """
    cols = ['n_handles','n_hashtags','n_urls','char_count','word_count','unique_word_count','avg_word_length','punctuation_count','stopwords_count','caps_count']
    numerical_features = df[cols]
    if poly is None:
        poly = PolynomialFeatures(degree=2, include_bias=False)
        poly.fit(numerical_features)
    expanded = poly.transform(numerical_features)
    expanded_names = poly.get_feature_names_out(numerical_features.columns)
    df_poly = pd.DataFrame(expanded, columns=expanded_names, index=df.index)
    # The degree-1 terms are the input columns themselves; keep only the new ones.
    df_poly = df_poly.loc[:, ~df_poly.columns.isin(cols)]
    # Drop stale copies from a previous call so a re-run replaces rather than duplicates.
    stale = [name for name in df_poly.columns if name in df.columns]
    return pd.concat([df.drop(columns=stale), df_poly], axis=1), poly

# Fit the polynomial expansion on the training frame; `poly` is kept so the
# same fitted transformer can be passed back in for another frame.
df_train, poly = poly_features(df_train)

In [None]:
# First row's char_count interaction terms, shown as a single-row table.
char_count_terms = [
    'char_count^2', 'char_count word_count', 'char_count unique_word_count',
    'char_count avg_word_length', 'char_count punctuation_count',
    'char_count stopwords_count', 'char_count caps_count',
]
df_train.iloc[0][char_count_terms].to_frame().T

In [None]:
# Correlation of every numeric feature (except id) with the target, strongest positive first.
(
    df_train
    .select_dtypes(include=['number'])
    .drop(columns='id')
    .corr()['target']
    .drop('target')
    .sort_values(ascending=False)
    .round(3)
    .to_frame()
)

## Category Encoding

In [47]:
# Categorical columns to target-encode in the following cells.
features = ['keyword', 'location']
# features = ['keyword', 'cleaned_location']

In [48]:
# category_encoders target encoding, fitted on the full training frame.
# NOTE(review): the encoder is fitted and applied on the same rows and labels -
# confirm this is acceptable for the cross-validated scores later on.
ce_encoder = ce_TargetEncoder(cols=features)
ce_encoded = ce_encoder.fit_transform(df_train[features], df_train['target'])
ce_transformed_df = ce_encoded.add_suffix('_target_ce')
df_train = df_train.join(ce_transformed_df)

In [49]:
# scikit-learn target encoding (constructed with cv=5), stored as <feature>_target_skl.
skl_encoder = skl_TargetEncoder(categories='auto', target_type='binary', smooth='auto', cv=5, random_state=42)
skl_columns = [f"{col}_target_skl" for col in features]
skl_transformed = skl_encoder.fit_transform(df_train[features], df_train['target'])
skl_transformed_df = pd.DataFrame(skl_transformed, index=df_train.index, columns=skl_columns)
df_train = df_train.join(skl_transformed_df)

In [None]:
# Compare the two encodings for one location.
df_train.loc[
    df_train['location'] == 'London',
    ['location', 'location_target_ce', 'location_target_skl'],
].head(10)

In [None]:
# Row counts per (encoded value, target) pair for London.
(
    df_train.loc[df_train['location'] == 'London', ['target', 'location_target_skl']]
    .groupby('location_target_skl')
    .value_counts()
    .to_frame()
    .sort_values(['location_target_skl', 'target'])
)

In [None]:
# Mean scikit-learn encoding per target class for London.
(
    df_train.loc[df_train['location'] == 'London', ['target', 'location_target_skl']]
    .groupby('target')
    .mean()
)

## Feature Extraction: CountVectorizer

In [None]:
# Bag-of-words counts for hashtags, handles and URLs. Each dense frame carries
# df_train's index so the later index-based joins line up row for row even if
# df_train does not have the default RangeIndex.
vec_hashtags = CountVectorizer(min_df=4)
df_train_hashtags_vectorised = vec_hashtags.fit_transform(df_train['hashtags'])
df_train_hashtags_vectorised_df = pd.DataFrame(df_train_hashtags_vectorised.toarray(), columns=vec_hashtags.get_feature_names_out(), index=df_train.index)

vec_handles = CountVectorizer(min_df=2)
df_train_handles_vectorised = vec_handles.fit_transform(df_train['handles'])
df_train_handles_vectorised_df = pd.DataFrame(df_train_handles_vectorised.toarray(), columns=vec_handles.get_feature_names_out(), index=df_train.index)

# URLs need their own token pattern so each link stays a single token.
vec_urls = CountVectorizer(min_df=2, token_pattern=r'https?://t.co/[A-Za-z0-9]{10}')
df_train_urls_vectorised = vec_urls.fit_transform(df_train['urls'])
df_train_urls_vectorised_df = pd.DataFrame(df_train_urls_vectorised.toarray(), columns=vec_urls.get_feature_names_out(), index=df_train.index)

# Vocabulary sizes: hashtags, handles, urls.
print(f'{df_train_hashtags_vectorised_df.shape[1]} {df_train_handles_vectorised_df.shape[1]} {df_train_urls_vectorised_df.shape[1]}')

In [None]:
# Count-weighted share of disaster tweets per hashtag token, highest first.
(
    df_train_hashtags_vectorised_df.T.dot(df_train['target'])
    / df_train_hashtags_vectorised_df.sum(axis=0)
).to_frame().sort_values(0, ascending=False)

In [None]:
# Count-weighted share of disaster tweets per handle token, highest first.
(
    df_train_handles_vectorised_df.T.dot(df_train['target'])
    / df_train_handles_vectorised_df.sum(axis=0)
).to_frame().sort_values(0, ascending=False)

In [None]:
# Count-weighted share of disaster tweets per URL token, highest first.
(
    df_train_urls_vectorised_df.T.dot(df_train['target'])
    / df_train_urls_vectorised_df.sum(axis=0)
).to_frame().sort_values(0, ascending=False)

In [57]:
# Attach the count features to df_train. rsuffix only renames an incoming
# column whose name already exists in df_train.
count_feature_frames = (
    (df_train_hashtags_vectorised_df, '_hashtags'),
    (df_train_handles_vectorised_df, '_handles'),
    (df_train_urls_vectorised_df, '_urls'),
)
for count_frame, suffix in count_feature_frames:
    df_train = df_train.join(count_frame, rsuffix=suffix)

## Feature Extraction: TfidfVectorizer

In [None]:
# Average number of whitespace-separated tokens per cleaned tweet.
df_train['word_count'].mean()

In [None]:
# TF-IDF over 1- to 5-grams of the cleaned text, keeping terms seen in at least
# ten tweets. The dense frame carries df_train's index so the later index-based
# join lines up row for row even if df_train does not have the default RangeIndex.
vec_text = TfidfVectorizer(min_df=10, ngram_range=(1,5), stop_words='english')
df_train_text_clean_vectorised = vec_text.fit_transform(df_train['text_clean'])
df_train_text_clean_vectorised_df = pd.DataFrame(df_train_text_clean_vectorised.toarray(), columns=vec_text.get_feature_names_out(), index=df_train.index)

# Vocabulary size.
print(df_train_text_clean_vectorised_df.shape[1])

In [None]:
# TF-IDF-weighted share of disaster tweets per term, highest first.
(
    df_train_text_clean_vectorised_df.T.dot(df_train['target'])
    / df_train_text_clean_vectorised_df.sum(axis=0)
).to_frame().sort_values(0, ascending=False)

In [61]:
# Attach the TF-IDF columns; a term whose name already exists in df_train is
# stored under '<term>_text' while the existing column keeps the bare name.
df_train = df_train.join(df_train_text_clean_vectorised_df, rsuffix='_text')

# Modelling

### Init

In [62]:
# Baseline classifier, rebound later to the tuned estimator.
lr = LogisticRegression(solver='liblinear', random_state=42)

# (stage, mean F1) pairs collected as the pipeline grows.
cv_scores = list()

### Prep X, y

In [63]:
# features_to_drop = df_train.select_dtypes(exclude=['number']).columns.to_list()
# features_to_drop.extend(['id'])

# X_train = df_train.drop(columns=features_to_drop+['target'])
# y_train = df_train['target']

In [64]:
# Feature groups by origin.
features_stats = ['char_count','word_count','unique_word_count','avg_word_length','punctuation_count','stopwords_count','caps_count','n_handles','n_hashtags','n_urls',]
features_polys = list(poly.get_feature_names_out())  # listed for reference; not part of the model matrix below
features_te_ce = ['keyword_target_ce','location_target_ce']
features_te_skl = ['keyword_target_skl','location_target_skl']
features_cv_hashtags = list(vec_hashtags.get_feature_names_out())
features_cv_handles = list(vec_handles.get_feature_names_out())
features_cv_urls = list(vec_urls.get_feature_names_out())
features_cv = features_cv_hashtags + features_cv_handles + features_cv_urls
features_tv = list(vec_text.get_feature_names_out())

# Build the model matrix from the vectorised frames directly instead of looking
# the vocabulary names up in df_train. The joins into df_train rename a column
# whenever its name is already taken, so a lookup by vocabulary name can return
# an unrelated earlier column (the raw 'text' string, a hashtag count in place of
# a TF-IDF value, or even 'target'). Prefixing each group keeps every name unique.
vectorised_blocks = {
    'hashtag__': df_train_hashtags_vectorised_df,
    'handle__': df_train_handles_vectorised_df,
    'url__': df_train_urls_vectorised_df,
    'text__': df_train_text_clean_vectorised_df,
}
X_train = pd.concat(
    [df_train[features_stats + features_te_ce + features_te_skl]]
    + [
        # Rows are in df_train order; align the index explicitly before concatenating.
        block.set_axis(df_train.index, axis=0).add_prefix(prefix)
        for prefix, block in vectorised_blocks.items()
    ],
    axis=1,
)
features_to_keep = list(X_train.columns)
assert X_train.columns.is_unique, 'duplicate feature names in X_train'
assert 'target' not in X_train.columns, 'label column leaked into X_train'

y_train = df_train['target']

### First run

In [None]:
# Baseline: 5-fold F1 of the untuned model on the unscaled features.
fold_f1 = cross_val_score(lr, X_train, y_train, scoring='f1', cv=5)
cross_val_f1 = fold_f1.mean()
print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
cv_scores.append(('Initial',cross_val_f1))

### SMOTE (pre-scale)

In [None]:
# Class balance before resampling.
n_rows = df_train.shape[0]
n_pos = df_train[df_train.target==1].shape[0]
n_neg = df_train[df_train.target==0].shape[0]
print(f'Positives: {n_pos} ({round(n_pos/n_rows*100,1)}%)')
print(f'Negatives: {n_neg} ({round(n_neg/n_rows*100,1)}%)')
print(f'X number of rows: {X_train.shape[0]}')
print()

# Oversample with SMOTE; X_train and y_train are replaced by the resampled data.
# NOTE(review): later cells cross-validate on this resampled data, so synthetic
# rows can presumably sit in a different fold from the real rows they were built
# from - consider resampling inside a pipeline instead.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Class balance after resampling.
n_total = int(y_train.count())
n_pos = int(y_train[y_train==1].count())
n_neg = int(y_train[y_train==0].count())
print(f'Positives: {n_pos} ({round(n_pos/n_total*100,1)}%)')
print(f'Negatives: {n_neg} ({round(n_neg/n_total*100,1)}%)')
print(f'X number of rows: {y_train.shape[0]}')

In [None]:
# 5-fold F1 after SMOTE resampling.
fold_f1 = cross_val_score(lr, X_train, y_train, scoring='f1', cv=5)
cross_val_f1 = fold_f1.mean()
print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
cv_scores.append(('SMOTE',cross_val_f1))

### Scale

In [69]:
# Fit the min-max scaler on the training features, then apply it to them.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# 5-fold F1 on the min-max scaled features.
fold_f1 = cross_val_score(lr, X_train_scaled, y_train, scoring='f1', cv=5)
cross_val_f1 = fold_f1.mean()
print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
cv_scores.append(('Scale',cross_val_f1))

### Logistic Regression

In [71]:
# Bayesian search over penalty type and regularisation strength (liblinear solver).
search_spaces = [{'solver':['liblinear'], 'penalty':['l1','l2'], 'C':(1e-4, 1e4, 'log-uniform')}]
bayessearch_lr = BayesSearchCV(
    LogisticRegression(random_state=42),
    search_spaces=search_spaces,
    n_iter=100,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=42,  # seed the optimiser so the search is reproducible
)
bayessearch_lr.fit(X_train_scaled, y_train)
print("Best score:", bayessearch_lr.best_score_)
print("Best parameters:", bayessearch_lr.best_params_)

In [72]:
# param_grid = [{'solver':['liblinear'], 'penalty':['l1','l2'], 'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}]
# gridsearch_lr = GridSearchCV(LogisticRegression(random_state=42, max_iter=100), param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)
# gridsearch_lr.fit(X_train_scaled, y_train)
# print("Best score:", gridsearch_lr.best_score_)
# print("Best parameters:", gridsearch_lr.best_params_)

In [73]:
# Use the best estimator found by the Bayesian search from here on.
lr = bayessearch_lr.best_estimator_
# lr = LogisticRegression(random_state=42, C=0.14421478790765738, penalty='l1', solver='liblinear')
# lr = gridsearch_lr.best_estimator_
# lr = LogisticRegression(random_state=42, C=0.1, penalty='l1', solver='liblinear')

In [None]:
# 5-fold F1 of the tuned logistic regression on the scaled features.
fold_f1 = cross_val_score(lr, X_train_scaled, y_train, scoring='f1', cv=5)
cross_val_f1 = fold_f1.mean()
print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
cv_scores.append(('LR',cross_val_f1))

### Feature Selection: Select K Best

In [75]:
# Search for the number of chi2-ranked features that maximises F1 for the tuned model.
selector_pipeline = Pipeline([('select',SelectKBest(score_func=chi2)), ('clf',lr)])
bayes_search_selector = BayesSearchCV(
    estimator=selector_pipeline,
    search_spaces={'select__k':(1,X_train_scaled.shape[1])},
    n_iter=50,
    scoring='f1',
    cv=5,
    verbose=0,
    n_jobs=-1,
    random_state=42,  # seed the optimiser so the chosen k is reproducible
)
bayes_search_selector.fit(X_train_scaled, y_train)
print("Best k:", bayes_search_selector.best_params_['select__k'])
print("Best F1 score:", bayes_search_selector.best_score_)
selector_kb = bayes_search_selector.best_estimator_.named_steps['select']
# selector_kb = SelectKBest(score_func=chi2, k=500)
# X_train_scaled is overwritten with the reduced matrix; re-run the scaling
# cell before re-running this one.
X_train_scaled = selector_kb.fit_transform(X_train_scaled, y_train)

In [76]:
# 5-fold F1 after SelectKBest feature selection.
fold_f1 = cross_val_score(lr, X_train_scaled, y_train, scoring='f1', cv=5)
cross_val_f1 = fold_f1.mean()
print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
cv_scores.append(('Select K Best',cross_val_f1))

### Feature Selection: Variance Threshold

In [77]:
# from sklearn.feature_selection import VarianceThreshold
# selector_vt = VarianceThreshold(threshold=0.01)
# X_train_scaled = selector_vt.fit_transform(X_train_scaled, y_train)

### Feature Selection: RFECV

In [None]:
# Recursive feature elimination, dropping 10 features per step and scoring by 5-fold F1.
rfecv = RFECV(estimator=lr, step=10, cv=5, scoring='f1')
rfecv.fit(X_train_scaled, y_train)

# The curve shows the mean cross-validated F1 for each candidate feature count.
plt.figure(figsize=(6,3))
plt.xlabel("Number of features selected")
plt.ylabel("Mean cross-validated F1 score")
plt.plot(rfecv.cv_results_['n_features'], rfecv.cv_results_['mean_test_score'])
plt.show()

print("Optimal number of features:", rfecv.n_features_)
# rfecv_features = rfecv.support_
# print("Selected features:", rfecv_features)
# print("Selected features:", X_train.columns[rfecv_features])
# # print("Feature rankings:", rfecv.ranking_)

# X_train_scaled is overwritten with the selected columns only.
X_train_scaled = rfecv.transform(X_train_scaled)

In [None]:
# 5-fold F1 after RFECV feature selection.
fold_f1 = cross_val_score(lr, X_train_scaled, y_train, scoring='f1', cv=5)
cross_val_f1 = fold_f1.mean()
print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
cv_scores.append(('RFECV',cross_val_f1))

### Alternative: SMOTE (post-scale)

In [None]:
# print(f'Positives: {df_train[df_train.target==1].shape[0]} ({round(df_train[df_train.target==1].shape[0]/df_train.shape[0]*100,1)}%)')
# print(f'Negatives: {df_train[df_train.target==0].shape[0]} ({round(df_train[df_train.target==0].shape[0]/df_train.shape[0]*100,1)}%)')
# print(f'X number of rows: {X_train.shape[0]}')
# print()

# smote = SMOTE(random_state=42)
# X_train_scaled, y_train = smote.fit_resample(X_train_scaled, y_train)

# print(f'Positives: {int(y_train[y_train==1].count())} ({round(int(y_train[y_train==1].count())/int(y_train.count())*100,1)}%)')
# print(f'Negatives: {int(y_train[y_train==0].count())} ({round(int(y_train[y_train==0].count())/int(y_train.count())*100,1)}%)')
# print(f'X number of rows: {y_train.shape[0]}')

In [None]:
# cross_val_f1 = cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean()
# print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')
# cv_scores.append(('smote',cross_val_f1))

### Cross-validated F1 scores

In [None]:
# Mean F1 per stage, in the order the stages were appended to cv_scores.
pd.DataFrame(cv_scores, columns=['stage', 'f1']).round(4)

## Pipeline

In [85]:
# pipeline = Pipeline([
#     ('smote', SMOTE(random_state=42)),
#     ('scaler', MinMaxScaler()),
#     ('feature_selection', SelectKBest(score_func=chi2)),
#     ('clf', LogisticRegression(random_state=42))
# ])

# param_grid = {
#     'feature_selection__k': (1, X_train.shape[1]),
#     'clf__solver': ['liblinear'],
#     'clf__penalty': ['l1', 'l2'],
#     'clf__C': (1e-4, 1e4, 'log-uniform')
# }

# bayes_search = BayesSearchCV(
#     estimator=pipeline,
#     search_spaces=param_grid,
#     n_iter=100,
#     scoring='f1',
#     cv=10,
#     n_jobs=-1,
#     verbose=0
# )

# bayes_search.fit(X_train, y_train)
# best_pipeline = bayes_search.best_estimator_

# rfecv = RFECV(estimator=best_pipeline.named_steps['clf'], step=5, cv=10, scoring='f1')
# X_train_rfecv = rfecv.fit_transform(best_pipeline[:-1].fit_transform(X_train, y_train), y_train)

# cross_val_f1 = cross_val_score(best_pipeline.named_steps['clf'], X_train_scaled, y_train, cv=10, scoring='f1').mean()
# print(f'Cross-validated F1 score:\t{round(cross_val_f1,4)}')

# Submission

In [None]:
# lr.fit(X_train_scaled, y_train)
# y_test = lr.predict(X_test_scaled)

In [None]:
# print(f'Training F1 score:\t{round(f1_score(y_train, lr.predict(X_train_scaled)),4)}')
# print(f'Training accuracy:\t{round(lr.score(X_train_scaled, y_train),4)}')
# cm = confusion_matrix(y_train, lr.predict(X_train_scaled))
# display(pd.DataFrame(cm,index=['Actual Negative', 'Actual Positive'],columns=['Predicted Negative', 'Predicted Positive']))
# display(pd.DataFrame((cm/cm.sum()*100).round(1),index=['Actual Negative (%)', 'Actual Positive (%)'],columns=['Predicted Negative (%)', 'Predicted Positive (%)']))

In [None]:
# from sklearn.model_selection import StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# print(f'Cross-validated F1 score:\t{round(cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='f1').mean(),4)}')

In [86]:
# # df_test['cleaned_location'] = df_test['location']
# # df_test['cleaned_location'] = df_test['cleaned_location'].fillna('unknown')
# # df_test['cleaned_location'] = df_test['cleaned_location'].apply(lambda x: clean_locations(x, location_mapping))

# df_test['text_clean'] = df_test['text'].apply(lambda x: remove_newlines(x))
# df_test['text_clean'] = df_test['text_clean'].apply(lambda x: fix_html_entities(x))
# df_test[['text_clean', 'hashtags', 'n_hashtags']] = df_test['text_clean'].apply(lambda x: extract_elements(x,'hashtags')).apply(pd.Series)
# df_test[['text_clean', 'handles', 'n_handles']] = df_test['text_clean'].apply(lambda x: extract_elements(x,'handles')).apply(pd.Series)
# df_test[['text_clean', 'urls', 'n_urls']] = df_test['text_clean'].apply(lambda x: extract_elements(x,'urls')).apply(pd.Series)
# df_test['char_count'] = df_test['text_clean'].apply(lambda x: char_count(x))
# df_test['word_count'] = df_test['text_clean'].apply(lambda x: word_count(x))
# df_test['unique_word_count'] = df_test['text_clean'].apply(lambda x: unique_word_count(x))
# df_test['avg_word_length'] = df_test['text_clean'].apply(lambda x: avg_word_length(x))
# df_test['punctuation_count'] = df_test['text_clean'].apply(lambda x: punctuation_count(x))
# df_test['stopwords_count'] = df_test['text_clean'].apply(lambda x: stopwords_count(x))
# df_test['caps_count'] = df_test['text_clean'].apply(lambda x: caps_count(x))

# df_test, _ = poly_features(df_test, poly=poly)

# df_test = df_test.join(ce_encoder.transform(df_test[features]).add_suffix('_target_ce'))
# df_test = df_test.join(pd.DataFrame(skl_encoder.transform(df_test[features]), columns=[f"{col}_target_skl" for col in features], index=df_test.index))

# df_test_hashtags_vectorised = vec_hashtags.transform(df_test['hashtags'])
# df_test_hashtags_vectorised_df = pd.DataFrame(df_test_hashtags_vectorised.toarray(), columns=vec_hashtags.get_feature_names_out())
# df_test_handles_vectorised = vec_handles.transform(df_test['handles'])
# df_test_handles_vectorised_df = pd.DataFrame(df_test_handles_vectorised.toarray(), columns=vec_handles.get_feature_names_out())
# df_test_urls_vectorised = vec_urls.transform(df_test['urls'])
# df_test_urls_vectorised_df = pd.DataFrame(df_test_urls_vectorised.toarray(), columns=vec_urls.get_feature_names_out())
# df_test_text_clean_vectorised = vec_text.transform(df_test['text_clean'])
# df_test_text_clean_vectorised_df = pd.DataFrame(df_test_text_clean_vectorised.toarray(), columns=vec_text.get_feature_names_out())

# df_test = df_test.join(df_test_hashtags_vectorised_df, rsuffix='_urls')
# df_test = df_test.join(df_test_handles_vectorised_df, rsuffix='_handles')
# df_test = df_test.join(df_test_urls_vectorised_df, rsuffix='_hashtags')
# df_test = df_test.join(df_test_text_clean_vectorised_df, rsuffix='_text')

In [87]:
# X_test = df_test.drop(columns=features_to_drop)
# X_test = df_test[features_to_keep]
# X_test_scaled = scaler.transform(X_test)
# X_test_scaled = selector_kb.transform(X_test_scaled)
# X_test_scaled = selector_vt.transform(X_test_scaled)
# X_test_scaled = rfecv.transform(X_test_scaled)

In [88]:
# lr.fit(X_train, y_train)
# y_pred = lr.predict(X_test)
# submission['target'] = y_pred
# print(submission.shape)
# submission.to_csv('submission_jg_XXX.csv', index=False)

# Helpers

In [89]:
# substring = 'Deeds'
# matches = X_train.astype(str).apply(lambda col: col.str.contains(substring, na=False))
# filtered_rows = X_train[matches.any(axis=1)]
# print("Rows with matches:\n", filtered_rows)
# locations = matches.stack()[matches.stack()]
# print("Locations of matches:\n", locations.index.tolist())

In [90]:
# features = ['features_stats', 'features_polys', 'features_te_ce', 'features_te_skl', 'features_cv_hashtags', 'features_cv_handles', 'features_cv_urls', 'features_tv']

# def all_combinations(iterable):
#     return chain.from_iterable(combinations(iterable, r) for r in range(1, len(iterable) + 1))

# feature_combinations = list(all_combinations(features))