# Metafeatures

## Loading data

In [72]:
import pandas as pd
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


Joining geocode information:

In [73]:
train_geocodes = pd.read_csv('train_geocodes.csv')
test_geocodes = pd.read_csv('test_geocodes.csv')

In [74]:
train = train.merge(train_geocodes, on=['id'])
test = test.merge(test_geocodes, on=['id'])

In [75]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,,
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,,
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,False,False,,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,,


## Geocode feature engineering

In [76]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

### Generating normalized geocodes

In [77]:
test['longitude_n'] = test['longitude']/180
test['latitude_n'] = test['latitude']/180
train['longitude_n'] = train['longitude']/180
train['latitude_n'] = train['latitude']/180
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,has_location,geocoded,longitude,latitude,longitude_n,latitude_n
0,0,0,0,Just happened a terrible car crash,Just happened a terrible car crash,False,False,0.0,0.0,0.0,0.0
1,2,0,0,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...",False,False,0.0,0.0,0.0,0.0
2,3,0,0,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",False,False,0.0,0.0,0.0,0.0
3,9,0,0,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,False,False,0.0,0.0,0.0,0.0
4,11,0,0,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,False,False,0.0,0.0,0.0,0.0


### Geocode encoding for tree-based models
* No geocode: longitude, lattitude = 2000
* No location: longitude, lattitude = 1000

In [78]:
for df in [train,test]:
    df['longitude_t'] = df['longitude']
    df['latitude_t'] = df['latitude']
    df.loc[~df['geocoded'], 'longitude_t'] = 2000
    df.loc[~df['geocoded'], 'latitude_t'] = 2000
    df.loc[~df['has_location'], 'longitude_t'] = 1000
    df.loc[~df['has_location'], 'latitude_t'] = 1000

## Tweet meta-information

In [79]:
from preprocessor.defines import Patterns
def fe_pattern(df, pattern, name):
    df[name] = df['text'].str.lower().apply(lambda x: pattern.findall(x))
    df['num_' + name] = df[name].apply(lambda x: len(x))

In [80]:
for df in [train, test]:
    fe_pattern(df, Patterns.HASHTAG_PATTERN,'hash')
    fe_pattern(df, Patterns.MENTION_PATTERN,'mention')
    fe_pattern(df, Patterns.URL_PATTERN,'url')
    df['num_hash_n'] = df['num_hash']/train['num_hash'].max()
    df['num_mention_n'] = df['num_mention']/train['num_mention'].max()
    df['num_url_n'] = df['num_url']/train['num_url'].max()

In [81]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude,...,latitude_t,hash,num_hash,mention,num_mention,url,num_url,num_hash_n,num_mention_n,num_url_n
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,0.0,0.0,...,1000.0,[#earthquake],1,[],0,[],0,0.076923,0.0,0.0
1,4,0,0,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,0.0,0.0,...,1000.0,[],0,[],0,[],0,0.0,0.0,0.0
2,5,0,0,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,0.0,0.0,...,1000.0,[],0,[],0,[],0,0.0,0.0,0.0
3,6,0,0,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,False,False,0.0,0.0,...,1000.0,[#wildfires],1,[],0,[],0,0.076923,0.0,0.0
4,7,0,0,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,0.0,0.0,...,1000.0,"[#alaska, #wildfires]",2,[],0,[],0,0.153846,0.0,0.0


text length features

In [82]:
for df in [train, test]:
    df['num_char'] = df['cleaned_text'].apply(len)
    df['num_words'] = df['cleaned_text'].apply(lambda x: len(x.split()))
    df['num_char_n'] = df['num_char']/train['num_char'].max()
    df['num_words_n'] = df['num_words']/train['num_words'].max()

## Store meta-features

In [85]:
columns_tree = ['id', 'num_char', 'num_words', 'num_hash', 'num_mention', 'num_url', 
                'has_location','geocoded','longitude_t', 'latitude_t']
columns_normalized = ['id', 'num_char_n', 'num_words_n', 'num_hash_n', 'num_mention_n', 'num_url_n', 
                      'has_location','geocoded','longitude_n', 'latitude_n']

In [86]:
train[columns_tree+['target']].to_csv('train_metafeatures_tree.csv', index=False)
test[columns_tree].to_csv('test_metafeatures_tree.csv', index=False)
train[columns_normalized+['target']].to_csv('train_metafeatures_normalized.csv',index=False)
test[columns_normalized].to_csv('test_metafeatures_normalized.csv', index=False)