In [None]:
import itertools

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import LongformerForSequenceClassification, LongformerTokenizer
from transformers import Trainer, TrainingArguments

from PreprocessData import *

In [43]:
# Path to the training CSV (absolute, machine-specific location).
train_file_path = 'C:\\Users\\user\\Programming\\Git\\challenge_data\\train_tweets\\ArgentinaBelgium72.csv'

df = pd.read_csv(train_file_path)
# Small working sample: the first 10 rows plus the 10 rows starting at position 20000.
df = pd.concat([df.iloc[:10], df.iloc[20000:20010]], axis=0)
# Keep only the columns used below. The explicit .copy() makes this an independent
# frame, so the column assignment that follows does not write into a derived
# selection (which raises SettingWithCopyWarning on pandas < 3).
df = df[['PeriodID', 'EventType', 'Tweet']].copy()

# Apply preprocessing to each tweet
df['Tweet'] = df['Tweet'].apply(preprocess_text)
print("1: ", df)


1:         PeriodID  EventType                                              Tweet
0             0          0  rt woridcup argentina v belgium win httptcoleu...
1             0          0    elijahman_ time focus belgium winning world cup
2             0          0  rt fifaworldcup global stadium joinin worldcup...
3             0          0  rt catholicnewssvc popefrancis uhoh argentina ...
4             0          0  rt soccerdotcom score v bel well award messisi...
5             0          0  rt soccerdotcom score v bel well award messisi...
6             0          0  hope argentina lose would fun see belgium go f...
7             0          0  watch argentina v belgium th july live go link...
8             0          0                         jrmunz dont like argentina
9             0          0  even though hate belgium beating u waffle damn...
20000         9          1  rt espnfc wc maradona scored twice args win be...
20001         9          1  rt docceng argentina owns stadiu

In [39]:
# Encode every entry of the 'Tweet' column as a list of token ids of length l
l = 10
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')


def tokenize_tweet(tweet):
    """Return the token ids for `tweet`, truncated or padded to `l` entries."""
    return tokenizer.encode(
        tweet,
        add_special_tokens=True,
        max_length=l,
        padding="max_length",
        truncation=True,
    )


df['Tweet'] = df['Tweet'].apply(tokenize_tweet)
print("2: ", df)

2:         PeriodID  EventType                                              Tweet
0             0          0  [0, 9713, 45155, 808, 21033, 29480, 1342, 1243...
1             0          0  [0, 523, 44303, 397, 1215, 86, 1056, 12138, 57...
2             0          0  [0, 9713, 45783, 1584, 39949, 21033, 720, 4773...
3             0          0  [0, 9713, 31420, 12589, 4651, 7485, 36940, 349...
4             0          0  [0, 9713, 4191, 33369, 175, 1471, 748, 12138, ...
5             0          0  [0, 9713, 4191, 33369, 175, 1471, 748, 12138, ...
6             0          0  [0, 298, 9877, 29480, 1342, 1243, 2217, 74, 15...
7             0          0  [0, 11018, 29480, 1342, 1243, 748, 12138, 571,...
8             0          0  [0, 267, 338, 20614, 329, 33976, 101, 29480, 1...
9             0          0  [0, 12963, 600, 4157, 12138, 571, 4031, 4108, ...
20000         9          1  [0, 9713, 21179, 282, 25484, 885, 438, 4401, 6...
20001         9          1  [0, 9713, 22053, 438, 3314, 2948

In [40]:
# For every (EventType, PeriodID) group, concatenate the token-id lists of all
# its tweets into one flat list, in a single groupby pass.
df_g = (
    df.groupby(['EventType', 'PeriodID'])['Tweet']
    .apply(lambda tweets: list(itertools.chain.from_iterable(tweets)))
    .reset_index()
)
print("3: ", df_g)

3:     EventType  PeriodID                                              Tweet
0          0         0  [0, 9713, 45155, 808, 21033, 29480, 1342, 1243...
1          1         9  [0, 9713, 21179, 282, 25484, 885, 438, 4401, 6...


In [None]:
def split_into_pieces(arr, piece_size=6):
    """Cut a sequence into consecutive, non-overlapping pieces of equal length.

    Trailing elements that do not fill a complete piece are dropped, so every
    returned piece has exactly `piece_size` elements.

    Args:
        arr: Sliceable sequence supporting len() (e.g. a list of token ids).
        piece_size: Number of elements per piece; must be positive.

    Returns:
        A list of slices of `arr`, each of length `piece_size`. The list is
        empty when `arr` holds fewer than `piece_size` elements.

    Raises:
        ValueError: If `piece_size` is zero or negative.
    """
    # A negative size would make the range below empty and silently return [],
    # and zero would fail with an unrelated ZeroDivisionError.
    if piece_size <= 0:
        raise ValueError(f"piece_size must be a positive integer, got {piece_size!r}")
    # Largest multiple of piece_size that fits inside arr.
    usable = len(arr) - (len(arr) % piece_size)
    return [arr[i:i + piece_size] for i in range(0, usable, piece_size)]

# Replace each group's token list with its list of fixed-size pieces,
# then give every piece its own row.
df_g['Tweet'] = df_g['Tweet'].apply(split_into_pieces)
df_g = (
    df_g.explode('Tweet')
    # explode() emits a NaN placeholder row for a group whose piece list is
    # empty; discard those so only real pieces remain.
    .dropna(subset=['Tweet'])
    .reset_index(drop=True)
)
print("4: ", df_g)


4:      EventType  PeriodID                                  Tweet
0           0         0    [0, 9713, 45155, 808, 21033, 29480]
1           0         0           [1342, 1243, 748, 2, 0, 523]
2           0         0    [44303, 397, 1215, 86, 1056, 12138]
3           0         0         [571, 2, 0, 9713, 45783, 1584]
4           0         0     [39949, 21033, 720, 4773, 1962, 2]
5           0         0    [0, 9713, 31420, 12589, 4651, 7485]
6           0         0        [36940, 3495, 4550, 2, 0, 9713]
7           0         0   [4191, 33369, 175, 1471, 748, 12138]
8           0         0         [157, 2, 0, 9713, 4191, 33369]
9           0         0        [175, 1471, 748, 12138, 157, 2]
10          0         0      [0, 298, 9877, 29480, 1342, 1243]
11          0         0          [2217, 74, 1531, 2, 0, 11018]
12          0         0   [29480, 1342, 1243, 748, 12138, 571]
13          0         0          [4031, 2, 0, 267, 338, 20614]
14          0         0      [329, 33976, 101, 2948

In [42]:
# Keep only the EventType and Tweet columns
df_g = df_g.loc[:, ['EventType', 'Tweet']]
print("5: ", df_g)

5:      EventType                                  Tweet
0           0    [0, 9713, 45155, 808, 21033, 29480]
1           0           [1342, 1243, 748, 2, 0, 523]
2           0    [44303, 397, 1215, 86, 1056, 12138]
3           0         [571, 2, 0, 9713, 45783, 1584]
4           0     [39949, 21033, 720, 4773, 1962, 2]
5           0    [0, 9713, 31420, 12589, 4651, 7485]
6           0        [36940, 3495, 4550, 2, 0, 9713]
7           0   [4191, 33369, 175, 1471, 748, 12138]
8           0         [157, 2, 0, 9713, 4191, 33369]
9           0        [175, 1471, 748, 12138, 157, 2]
10          0      [0, 298, 9877, 29480, 1342, 1243]
11          0          [2217, 74, 1531, 2, 0, 11018]
12          0   [29480, 1342, 1243, 748, 12138, 571]
13          0          [4031, 2, 0, 267, 338, 20614]
14          0      [329, 33976, 101, 29480, 1342, 2]
15          0      [0, 12963, 600, 4157, 12138, 571]
16          0                  [4031, 4108, 1717, 2]
17          1      [0, 9713, 21179, 282, 2