# Ensemble Model
This notebook builds an ensemble of the averaged word-vector model and a second model trained on the other features.

In [1]:
import pandas as pd
# Read the cleaned train and test CSVs, then preview the training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


In [2]:
# Read the per-split geocode tables (joined onto the tweets by `id` below).
train_geocodes, test_geocodes = (
    pd.read_csv(csv_path)
    for csv_path in ('train_geocodes.csv', 'test_geocodes.csv')
)

In [3]:
# Join the geocode columns onto the training tweets by tweet id
# (pandas' default join type, i.e. an inner join).
train = pd.merge(train, train_geocodes, on='id')

In [4]:
# Join the geocode columns onto the test tweets by tweet id
# (pandas' default join type, i.e. an inner join).
test = pd.merge(test, test_geocodes, on='id')

In [5]:
# Preview the training frame after the geocode merge.
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,,
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,,
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,False,False,,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,,


In [6]:
# Replace every missing value with 0 (in place, across all columns), then add
# coordinate columns scaled by 1/180.
# NOTE(review): if the inputs are degrees, longitude lands in [-1, 1] but
# latitude only in [-0.5, 0.5] - confirm that is intended.
for frame in (train, test):
    frame.fillna(0, inplace=True)
    frame['longitude_n'] = frame['longitude'] / 180
    frame['latitude_n'] = frame['latitude'] / 180
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,has_location,geocoded,longitude,latitude,longitude_n,latitude_n
0,0,0,0,Just happened a terrible car crash,Just happened a terrible car crash,False,False,0.0,0.0,0.0,0.0
1,2,0,0,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...",False,False,0.0,0.0,0.0,0.0
2,3,0,0,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",False,False,0.0,0.0,0.0,0.0
3,9,0,0,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,False,False,0.0,0.0,0.0,0.0
4,11,0,0,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,False,False,0.0,0.0,0.0,0.0


In [7]:
from preprocessor.defines import Patterns
def fe_pattern(df, pattern, name):
    """Add regex-match features for the ``text`` column to ``df`` in place.

    Two columns are written:

    * ``df[name]``          - list of every match of ``pattern`` in the
      lower-cased text of the row.
    * ``df['num_' + name]`` - number of matches in that list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column; it is modified in place.
    pattern : object with a ``findall(str)`` method
        Typically a compiled regular expression.
    name : str
        Base name for the two new columns.

    Returns
    -------
    None
    """
    # Missing text becomes an empty string and any other non-string value is
    # converted with str(), so a single bad row yields an empty match list
    # instead of raising TypeError inside ``findall``.
    lowered = df['text'].fillna('').astype(str).str.lower()
    df[name] = lowered.apply(pattern.findall)
    df['num_' + name] = df[name].apply(len)

In [8]:
# Feature name -> pattern used to extract it from the tweet text.
pattern_features = {
    'hash': Patterns.HASHTAG_PATTERN,
    'mention': Patterns.MENTION_PATTERN,
    'url': Patterns.URL_PATTERN,
}
for df in [train, test]:
    # Extract the match lists and their counts first ...
    for feature, regex in pattern_features.items():
        fe_pattern(df, regex, feature)
    # ... then scale each count by the maximum seen in the TRAINING frame, so
    # train and test share the same scale.
    for feature in pattern_features:
        count_col = 'num_' + feature
        df[count_col + '_n'] = df[count_col] / train[count_col].max()

In [9]:
# Preview the training frame with the hashtag / mention / URL columns added.
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude,...,latitude_n,hash,num_hash,mention,num_mention,url,num_url,num_hash_n,num_mention_n,num_url_n
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,0.0,0.0,...,0.0,[#earthquake],1,[],0,[],0,0.076923,0.0,0.0
1,4,0,0,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,0.0,0.0,...,0.0,[],0,[],0,[],0,0.0,0.0,0.0
2,5,0,0,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,0.0,0.0,...,0.0,[],0,[],0,[],0,0.0,0.0,0.0
3,6,0,0,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,False,False,0.0,0.0,...,0.0,[#wildfires],1,[],0,[],0,0.076923,0.0,0.0
4,7,0,0,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,0.0,0.0,...,0.0,"[#alaska, #wildfires]",2,[],0,[],0,0.153846,0.0,0.0


In [279]:
# Start the tree-model coordinate columns as copies of the raw coordinates.
for frame in (train, test):
    frame['longitude_t'] = frame['longitude']
    frame['latitude_t'] = frame['latitude']

In [280]:
# Rows where `geocoded` is False get the placeholder coordinate 2000.
# Each frame is masked with its OWN `geocoded` column: a boolean mask built
# from `train` is aligned to `test` by row index, so using it on `test`
# silently flags rows according to unrelated training tweets.
for frame in (train, test):
    frame.loc[~frame['geocoded'], ['longitude_t', 'latitude_t']] = 2000

In [281]:
# Rows where `has_location` is False get the placeholder coordinate 1000,
# overriding whatever value the columns held before.
# Each frame is masked with its OWN `has_location` column: a boolean mask
# built from `train` is aligned to `test` by row index, so using it on `test`
# silently flags rows according to unrelated training tweets.
for frame in (train, test):
    frame.loc[~frame['has_location'], ['longitude_t', 'latitude_t']] = 1000

In [282]:
# Length of the cleaned tweet text, as returned by len() for each row.
for frame in (train, test):
    frame['text_len'] = frame['cleaned_text'].map(len)

## Train a boosted-tree model on the 'extra' features

In [322]:
feature_names_tree = [
    # match counts written by fe_pattern
    'num_hash',
    'num_mention',
    'num_url',
    # coordinates, including the 1000 / 2000 placeholder values
    'longitude_t',
    'latitude_t',
    # length of cleaned_text
    'text_len',
]

In [323]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

In [336]:
# Despite the name `rf`, this is a LightGBM classifier: 20 estimators with
# max_depth=1, and a fixed seed so runs are repeatable.
rf = LGBMClassifier(
    n_estimators=20,
    max_depth=1,
    random_state=42,
    n_jobs=-2,
)

In [337]:
# Feature matrix (tree features only) and binary target for the tree model.
X, y = train[feature_names_tree], train['target']

In [338]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation scored with F1; training-fold scores are returned
# as well so they can be compared with the validation scores.
scores = cross_validate(
    estimator=rf,
    X=X,
    y=y,
    scoring='f1',
    cv=5,
    return_train_score=True,
)
scores

{'fit_time': array([0.02151656, 0.01991439, 0.01595521, 0.01485562, 0.0140872 ]),
 'score_time': array([0.00606251, 0.00653648, 0.00554419, 0.00635815, 0.0055511 ]),
 'test_score': array([0.5474339 , 0.56643888, 0.57014254, 0.58690745, 0.62727273]),
 'train_score': array([0.58976289, 0.58360282, 0.58271322, 0.57845567, 0.57250613])}

In [339]:
# Mean F1 over the 5 validation folds.
scores['test_score'].mean()

0.5796390983854832