In [1]:
import os
import preprocessor as p
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from transformers import RobertaTokenizer

In [2]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
test_data_folder = f"../data/test/Data"
test_dataset_files = os.listdir(test_data_folder)

In [4]:
fields = ['text','tweet_url','timestamp_epochs']
test_dfs = [(pd.read_csv(f"{test_data_folder}/{_}",usecols=fields,engine='python'),_.replace('_clear.csv','')) for _ in test_dataset_files]

In [5]:
len(test_dfs)

20

In [6]:
final_test_df = pd.DataFrame(columns=['text', 'timestamp_epochs', 'tweet_url', 'brand'])
for df, brand in test_dfs:
    n = len(df)
    brand_values = [brand]*n
    new_df = df.assign(brand=pd.Series(brand_values))
    final_test_df = pd.concat([final_test_df, new_df])

In [7]:
final_test_df.shape

(4209101, 4)

In [8]:
final_test_df.columns

Index(['text', 'timestamp_epochs', 'tweet_url', 'brand'], dtype='object')

In [9]:
final_test_df.brand.value_counts()

fedex        245953
sony         223875
macys        218491
walmart      217028
nintendo     214730
mcdonalds    210882
marriot      210367
fitbit       210274
nike         209919
uber         209623
gap          208712
nestle       205200
tesco        205027
samsung      204861
puma         204658
adidas       204088
cocacola     202349
amazon       201660
starbucks    200976
netflix      200428
Name: brand, dtype: int64

In [10]:
def preprocess(txt):
    return p.clean(txt)

In [12]:
final_test_df["text"] = final_test_df["text"].astype(str)

In [13]:
final_test_df.head()

Unnamed: 0,text,timestamp_epochs,tweet_url,brand
0,1.- Cruz Azul\n2.- Monterrey\n3.- AmÃÂ©rica\n...,1546041592,/jesus_cassot/status/1078802745187074051,puma
1,I just discovered this on Poshmark: Kids Puma ...,1546041585,/sephotodesign/status/1078802716996980736,puma
2,Nike > Puma >>>>> Abismos >>>>> Adidas https:/...,1546041584,/SaulitoATM2/status/1078802712756711425,puma
3,Bouta hop on this puma wave,1546041552,/luther_holley/status/1078802579239436289,puma
4,Don't mind it too much I have the body for it ...,1546041539,/ShanklyHero/status/1078802522674991104,puma


In [14]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [15]:
model = torch.load('../models/best-ft-roberta-painpoint-maxlen35.pt')

In [16]:
def predict(query, model, tokenizer, device="cuda"):
    tokens = tokenizer.encode(query)
    all_tokens = len(tokens)
    tokens = tokens[:tokenizer.model_max_length - 2]
    used_tokens = len(tokens)
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real

In [17]:
query = """.- Cruz Azul2.- Monterrey3.- Amrica4.- Tigres5.- Pachuca6.- Chivas7.- Toluca8.- Pumas
"""
predict(query,model,tokenizer)

0.15501761436462402

In [None]:
preds, preds_probas = [],[]
for i, row in final_test_df.iterrows():
    text = row["text"]
    clean_txt = preprocess(text)
    pred = predict(clean_txt,model,tokenizer)
    preds_probas.append(pred)
    if pred >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

In [71]:
pd_df.brand.value_counts()

fedex        245953
sony         223875
macys        218491
walmart      217028
nintendo     214730
mcdonalds    210882
fitbit       210274
nike         209919
uber         209623
nestle       205200
tesco        205027
samsung      204861
puma         204658
adidas       204088
cocacola     202349
amazon       201660
starbucks    200976
netflix      200428
gap           84673
Name: brand, dtype: int64

In [81]:
len(preds)

4209101

In [82]:
final_df = final_test_df.assign(pain_point=pd.Series(preds))

In [83]:
final_df.head()

Unnamed: 0,text,timestamp_epochs,tweet_url,brand,pain_point
0,1.- Cruz Azul\n2.- Monterrey\n3.- AmÃÂ©rica\n...,1546041592,/jesus_cassot/status/1078802745187074051,puma,0
1,I just discovered this on Poshmark: Kids Puma ...,1546041585,/sephotodesign/status/1078802716996980736,puma,0
2,Nike > Puma >>>>> Abismos >>>>> Adidas https:/...,1546041584,/SaulitoATM2/status/1078802712756711425,puma,0
3,Bouta hop on this puma wave,1546041552,/luther_holley/status/1078802579239436289,puma,0
4,Don't mind it too much I have the body for it ...,1546041539,/ShanklyHero/status/1078802522674991104,puma,0


In [84]:
final_df.to_csv("../data/final_dataset_predictions.csv", index=None)