In [1]:
import pandas as pd

COLS = ["id", "lat", "long", "text"]
df_train = pd.read_csv("../data/training.txt", names=COLS)
df_val = pd.read_csv("../data/validation.txt", names=COLS)

df_train.head()

Unnamed: 0,id,lat,long,text
0,119165,51.810067,10.191331,"Seit d Vase: ""Wenn ich kaputt gang, bringt das..."
1,100377,51.918188,10.599245,Haha bin au w isch der amig au so richtig lang...
2,109550,52.711074,9.987374,isch d hiltl dachterrasse amne samstig viel bs...
3,111440,52.386711,11.700612,Ich fühle mich wie die Weimarer Republik... .....
4,116670,52.314631,9.701835,Eui liebschte Lunchidee zum Mitneh? 😬 En Grill...


In [4]:
import re

re.findall(r"#\w+", "bla #aa bla #bb #cc")

['#aa', '#bb', '#cc']

In [5]:
from collections import Counter

cnt = Counter()
for text in df_train.text:
    for htag in re.findall(r"#\w+", text):
        cnt[htag] += 1

In [10]:
all_hashtags = [k for k in cnt if cnt[k] >= 5]
len(all_hashtags)

78

In [11]:
cnt = Counter()
for text in df_train.text:
    for ment in re.findall(r"@\w+", text):
        cnt[ment] += 1

In [16]:
all_ment = [k for k in cnt if cnt[k] >= 5]
len(all_ment)

24

In [17]:
import spacy

nlp = spacy.load("de_core_news_lg")

In [19]:
from tqdm import tqdm

cnt = Counter()
for text in tqdm(df_train.text):
    doc = nlp(text)
    for ent in doc.ents:
        cnt[ent.text.lower()] += 1

100%|██████████| 22583/22583 [05:34<00:00, 67.50it/s]


In [30]:
cnt_ent = cnt

In [32]:
all_ent = [k for k in cnt if cnt[k] >= 20]
len(all_ent)

344

In [71]:
def basic_clean(text):
    text = re.sub(r"@[a-zA-Z0-9äöüÄÖÜß]+", " ", text)
    text = re.sub(r"#[a-zA-Z0-9äöüÄÖÜß]+", " ", text)
    text = re.sub(r"https?://[^ ]+", " ", text)
    text = re.sub(r"www.[^ ]+", " ", text)
    text = re.sub(r"[^a-zA-ZäöüÄÖÜß]", " ", text)
    text = re.sub(" +", " ", text)
    text = text.lower()
    return text.strip()

In [76]:
from collections import defaultdict

word_coords = defaultdict(list)

for text, lat, long in zip(df_train.text, df_train.lat, df_train.long):
    text = basic_clean(text)
    for word in set(text.split(' ')):
        word_coords[word].append((lat, long))

In [77]:
def mean_2d(v):
    sum_0 = sum(x for x, _ in v)
    sum_1 = sum(y for _, y in v)
    return (sum_0 / len(v), sum_1 / len(v))

In [78]:
def l1_norm_2d(x):
    return abs(x[0]) + abs(x[1])

In [79]:
def unbiased_variance_2d(v):
    x_mean = mean_2d(v)
    sum = 0
    for x in v:
        sum += l1_norm_2d((x[0] - x_mean[0], x[1] - x_mean[1])) ** 2
    sum /= len(v) - 1
    return sum

In [80]:
word_stats = []

for word, coords in word_coords.items():
    if len(coords) < 2:
        continue
    var = unbiased_variance_2d(coords)
    word_stats.append((word, var, len(coords)))

In [81]:
sorted_word_stats = sorted(word_stats, key=lambda x: (x[1], -x[2]))

In [116]:
filtered_word_stats = [x for x in sorted_word_stats if x[2] >= 5 and x[1] <= 2]

In [121]:
len(filtered_word_stats)

2984

In [122]:
all_location_words = [x[0] for x in filtered_word_stats]

In [123]:
idx = 0
hashtags_idx = {}
for htag in all_hashtags:
    hashtags_idx[htag] = idx
    idx += 1
ment_idx = {}
for ment in all_ment:
    ment_idx[ment] = idx
    idx += 1
ent_idx = {}
for ent in all_ent:
    ent_idx[ent] = idx
    idx += 1
word_idx = {}
for word in all_location_words:
    word_idx[word] = idx
    idx += 1

In [124]:
feat_size = idx
feat_size

3430

In [125]:
import numpy as np

def get_feat(texts):
    feats = np.zeros((len(texts), feat_size))
    for i, text in enumerate(tqdm(texts)):
        for htag in re.findall(r"#\w+", text):
            if htag in hashtags_idx:
                feats[i, hashtags_idx[htag]] += 1
        for ment in re.findall(r"@\w+", text):
            if ment in ment_idx:
                feats[i, ment_idx[ment]] += 1
        doc = nlp(text)
        for ent in doc.ents:
            entity = ent.text.lower()
            if entity in ent_idx:
                feats[i, ent_idx[entity]] += 1
        text = basic_clean(text)
        for word in text.split(' '):
            if word in word_idx:
                feats[i, word_idx[word]] += 1
    return feats

In [126]:
train_feat = get_feat(df_train.text)
val_feat = get_feat(df_val.text)

100%|██████████| 22583/22583 [05:38<00:00, 66.63it/s]
100%|██████████| 3044/3044 [00:45<00:00, 66.40it/s]


In [135]:
np.sum(~train_feat.any(1))

2935

In [136]:
from sklearn.preprocessing import normalize

train_feat_norm = normalize(train_feat)
val_feat_norm = normalize(val_feat)

In [137]:
train_lat = np.array(df_train.lat)
train_long = np.array(df_train.long)
val_lat = np.array(df_val.lat)
val_long = np.array(df_val.long)


In [138]:
def mae_coordinates(true, predicted):
    mae = np.abs(true - predicted).mean(axis=0)
    return (mae[0] + mae[1]) / 2

In [139]:
from sklearn.svm import SVR

C = 1.0
svr_lat = SVR(C=C)
svr_long = SVR(C=C)

In [None]:
svr_lat.fit(train_feat_norm, train_lat)

In [None]:
svr_long.fit(train_feat_norm, train_long)