# Tweets timeline

Plots of temporal tendency of tweets by sentiment and political coalitions

In [4]:
%pip install pyathena "pymongo[srv]" wordcloud

Collecting wordcloud
  Downloading wordcloud-1.8.1.tar.gz (220 kB)
     |████████████████████████████████| 220 kB 7.9 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wordcloud
  Building wheel for wordcloud (setup.py) ... [?25ldone
[?25h  Created wheel for wordcloud: filename=wordcloud-1.8.1-cp39-cp39-macosx_11_0_arm64.whl size=152132 sha256=7613a565a529f5713743fd0f6a2e57c3976ec24759b13a495de02d9235f37c1a
  Stored in directory: /Users/jose/Library/Caches/pip/wheels/f9/7a/dd/06ef8b5dfe5483f6204133c08eeb16c287cc2c05e290ae2fc0
Successfully built wordcloud
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
import pymongo
import pandas as pd
import tqdm
from pyathena import connect
import os
from wordcloud import WordCloud
import plotly.express as px

mongo_client = pymongo.MongoClient(os.environ["MONGODB_URL"])
twitter_db = mongo_client.TwitterConstituyenteDB

In [6]:
conn = connect(s3_staging_dir=os.environ["AWS_ATHENA_S3_STAGING_DIR"], region_name=os.environ["AWS_REGION"])

query = """
SELECT * FROM "twitter-constituyente"."constituyentes";
"""
df = pd.read_sql(query, conn)
candidates_ids = df["user__id_str"].dropna().to_list()



In [7]:
lista_del_pueblo = [
  "XC. A PULSO, POR EL BUEN VIVIR (D27)",
  "WJ. ASAMBLEA CONSTITUYENTE ATACAMA",
  "J. ELIGE LA LISTA DEL PUEBLO (D23)",
  "XJ. FUERZA SOCIAL DE ÑUBLE, LA LISTA DEL PUEBLO (D19)",
  "S. INDEPENDIENTES DISTRITO 6 + LISTA DEL PUEBLO (D6)",
  "ZN. LA LISTA DEL PUEBLO (D10)",
  "ZN. LA LISTA DEL PUEBLO (D13)",
  "ZN. LA LISTA DEL PUEBLO (D17)",
  "ZN. LA LISTA DEL PUEBLO (D3)",
  "ZN. LA LISTA DEL PUEBLO (D7)",
  "ZN. LA LISTA DEL PUEBLO (D8)",
  "YP. LA LISTA DEL PUEBLO 100% INDEPENDIENTES (D15)",
  "YL. LA LISTA DEL PUEBLO DISTRITO 12 (D12)",
  "ZN. LA LISTA DEL PUEBLO DISTRITO 14 (D14)",
  "N. LA LISTA DEL PUEBLO DISTRITO 9 (D9)",
  "ZD. LA LISTA DEL PUEBLO MAULE SUR (D18)",
  "ZN. LA LISTA DEL PUEBLO(D20)",
  "WD. LISTA DEL PUEBLO - MOVIMIENTO TERRITORIAL CONSTITUYENTE (D5)",
  "Q. LISTA DEL PUEBLO TRANSFORMANDO DESDE EL WILLI (D25)",
  "XD. LISTA DEL PUEBLO-RIOS INDEPENDIENTES (D24)",
  "XT. MOVIMIENTO SOCIAL CONSTITUYENTE / LA LISTA DEL PUEBLO (D11)",
  "ZE. MOVIMIENTO SOCIAL LA LISTA DEL PUEBLO (D22)"
]

no_neutrales = [
  "YF. INDEPENDIENTES DE ÑUBLE POR LA NUEVA CONSTITUCION (D19)",
  "I. INDEPENDIENTES DEL BIOBIO POR UNA NUEVA CONSTITUCION (D20)",
  "ZA. INDEPENDIENTES NUEVA CONSTITUCION (D26)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D10)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D11)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D14)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D17)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D22)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D23)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D24)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D25)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D4)",
  "ZT. INDEPENDIENTES POR LA NUEVA CONSTITUCION (D6)",
  "YV. INDEPENDIENTES POR UNA NUEVA CONSTITUCION (D1)",
  "YV. INDEPENDIENTES POR UNA NUEVA CONSTITUCION (D12)",
  "YV. INDEPENDIENTES POR UNA NUEVA CONSTITUCION (D21)",
  "YV. INDEPENDIENTES POR UNA NUEVA CONSTITUCION (D7)",
  "YV. INDEPENDIENTES POR UNA NUEVA CONSTITUCION (D9)",
  "XR. MAGALLANICOS NO NEUTRALES (D28)"
]

def coalicion_politica(lista):
  if lista == "YQ. APRUEBO DIGNIDAD":
    return "Apruebo Dignidad"
  if lista == "XP. VAMOS POR CHILE":
    return "Vamos por Chile"
  if lista == "YB. LISTA DEL APRUEBO":
    return "Lista del Apruebo"
  if lista in lista_del_pueblo:
    return "Lista del Pueblo"
  if lista in no_neutrales:
    return "Independientes No Neutrales"
  return "Otro"


In [8]:
tweets_constituyentes = list(twitter_db.tweets.find(
    {"user.id_str": { "$in": candidates_ids },
      "retweeted_status": { "$exists": False }}))
tweets_constituyentes_df = pd.DataFrame(tweets_constituyentes)
sentiments = list(twitter_db.sentiment_analysis.find({}))
sentiments_df = pd.DataFrame(sentiments)

In [9]:
merged_df = tweets_constituyentes_df.merge(sentiments_df, left_on="id_str", right_on="tweet__id_str").merge(df, on="user__id_str")
merged_df["date"] = merged_df["datetime"].dt.date

merged_df["coalicion"] = merged_df["list"].apply(coalicion_politica)
merged_df["corrected_output"] = merged_df.apply(lambda row: row["output"] if row[row["output"]] > 0.99 else "NEU", axis=1)

## Sentiment Analysis

In [20]:
px.histogram(merged_df, "datetime", color="output", title="Number of tweets by sentiment from candidates for the Constitutional Convention")

In [21]:
plot_df = merged_df.groupby(["date", "output"]).agg({"id_str":"count"}).reset_index()

fig = px.area(plot_df, x="date", y="id_str", color="output", groupnorm="percent", title="Percentage of tweets by sentiment from candidates for the Constitutional Convention")
fig

In [22]:

px.line(merged_df.groupby(["date", "output"]).agg({"retweet_count":"sum"}).reset_index(), x="date", y="retweet_count", color="output", title="Number of retweets by sentiment of tweets from candidates for the Constitutional Convention")

In [23]:
px.line(merged_df.groupby(["date", "output"]).agg({"favorite_count":"sum"}).reset_index(), x="date", y="favorite_count", color="output", title="Number of likes by sentiment of tweets from candidates for the Constitutional Convention")

In [25]:
merged_df.groupby("output").agg({"retweet_count":["mean", "std"], "favorite_count":["mean", "std"]})

Unnamed: 0_level_0,retweet_count,retweet_count,favorite_count,favorite_count
Unnamed: 0_level_1,mean,std,mean,std
output,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
NEG,31.191493,138.015354,74.126521,340.948027
NEU,12.086338,89.32281,32.285202,218.817777
POS,8.448323,56.906431,30.670967,194.094453


In [27]:
merged_df[merged_df["in_reply_to_status_id"].isna()].groupby("user__id_str").count()["id_str"].quantile(0.9)

296.0

In [28]:
tweet_count = merged_df[merged_df["in_reply_to_status_id"].isna()].groupby("user__id_str").count()["id_str"]

In [29]:
to_delete = tweet_count[tweet_count<2]

In [30]:
df[df["user__id_str"].isin(to_delete.index)]

Unnamed: 0,electoral_district,code_list,list,party,number,code_candidate,name,age,gender,occupation,votes,district_percentage,elected,social_media,twitter,user__id_str,list_number
59,2,XP,XP. VAMOS POR CHILE,IND-RN,22,XP-22,LUZ ELIANA NUÑEZ SALAZAR,54.0,M,"Ingeniero en RR.HH, Universitaria completa",468,0.59,0,"['https://twitter.com/Luznuez69268799', 'https...",luznuez69268799,1186605494997278721,3
119,3,ZZ,ZZ. MOVIMIENTO INDEPENDIENTES DEL NORTE (D3),IND,39,ZZ-39,ARTURO ALEJANDRO SOTO AGUILERA,40.0,H,Fundador de Club Budeo,4490,2.89,0,"['https://www.facebook.com/arturobudeo', 'http...",budeosoto,1354458486151782400,6
168,5,WI,WI. INDEPENDIENTES DEL APRUEBO REGION COQUIMBO,IND,17,WI-17,NATHALY ANDREA OLIVARES BIGNANI,35.0,M,"Psicóloga, Magíster en Liderazgo y Comunicació...",4387,1.95,0,"['https://twitter.com/NathyOliv1', 'http://www...",nathyoliv1,461215010,3
345,8,XA,XA. PARTIDO ECOLOGISTA VERDE,PEV,4,XA-4,HECTOR HERNAN ORELLANA CORTES,35.0,H,"Geógrafo, Doctorado en geografía en la PUC",4334,0.96,0,['https://www.facebook.com/Hector-Orellana-Cor...,hectorconstitu2,1352666315153158153,4
409,9,XA,XA. PARTIDO ECOLOGISTA VERDE,PEV,9,XA-9,RAUL ALFREDO CORTES CASTILLO,21.0,H,Estudiante de Ingeniería Forestal en la Univer...,1022,0.32,0,"['https://twitter.com/RaulcortesD9', 'https://...",raulcortesd9,1352068203883737096,2
412,9,XA,XA. PARTIDO ECOLOGISTA VERDE,PEV,12,XA-12,PAULA STEFANI MORA DA SILVA,24.0,M,Estudiante de Psicología en la Universidad Cat...,2068,0.65,0,"['https://www.facebook.com/paula.dasilvamora',...",nebulosaconsti9,1364768918464831491,5
413,9,XA,XA. PARTIDO ECOLOGISTA VERDE,PEV,13,XA-13,ADRIAN ALBERTO VALENCIA VIDAL,47.0,H,Gestor Cultural y folklorista,1063,0.34,0,['https://www.facebook.com/profile.php?id=1000...,avalenciavidal,1357195343025344512,6
414,9,XP,XP. VAMOS POR CHILE,UDI,14,XP-14,SOL LETELIER GONZALEZ,65.0,M,"Profesora de historia, Educación Superior",7818,2.47,0,"['https://www.instagram.com/letelier.sol/', 'h...",solletelier,2583313389,1
484,10,YB,YB. LISTA DEL APRUEBO,IND-PL,28,YB-28,CAROLINA VANESSA PARRAGUEZ PIÑA,35.0,M,"Abogada, Magíster Derecho Ambiental Universida...",1961,0.46,0,['https://www.instagram.com/carolinaparraguezc...,carolina_d10,1359883671340261378,7
542,11,XF,XF. ENERGIA INDEPENDIENTE (D11),IND,8,XF-8,PAULA ANDREA SALAMANCA ALCAIDE,57.0,M,"Terapeuta Integral Complementaria y coach, Mag...",910,0.24,0,['https://www.facebook.com/PaulaSalamancaDistr...,scl_paula,427905688,3


In [31]:
plot_df = merged_df.groupby(["date", "corrected_output", "elected", "user__id_str"]).agg({"id_str":"count", "retweet_count": "sum"}).reset_index()
# plot_df[""] = plot_df["id_str"].rolling(window=14).mean()
# plot_df['count_MA'] = plot_df.groupby('coalicion')['id_str'].transform(lambda x: x.rolling(14,1).mean())
# plot_df
piv_df = plot_df.pivot(index=['date', 'elected', 'user__id_str'], columns='corrected_output', values=['id_str', 'retweet_count'])
piv_df.columns = [f"{a}__{b}" for a, b in piv_df.columns]
piv_df = piv_df.reset_index() 
piv_df


Unnamed: 0,date,elected,user__id_str,id_str__NEG,id_str__NEU,id_str__POS,retweet_count__NEG,retweet_count__NEU,retweet_count__POS
0,2021-01-01,0,1004762296932069377,,3.0,,,0.0,
1,2021-01-01,0,101614836,,2.0,1.0,,3.0,39.0
2,2021-01-01,0,102121998,,7.0,,,11.0,
3,2021-01-01,0,104065341,,,1.0,,,0.0
4,2021-01-01,0,104210159,,1.0,,,0.0,
...,...,...,...,...,...,...,...,...,...
37023,2021-05-16,1,51845139,,1.0,,,673.0,
37024,2021-05-16,1,55273142,,1.0,,,40.0,
37025,2021-05-16,1,568717018,,3.0,7.0,,0.0,408.0
37026,2021-05-16,1,722580618144587776,,,1.0,,,4.0


In [36]:
px.scatter(merged_df,"retweet_count", "favorite_count", color="possibly_sensitive")

KeyError: (nan, '', '', '', '')

## Political coalitions

In [34]:
plot_df = merged_df.groupby(["date", "corrected_output", "coalicion"]).agg({"id_str":"count"}).reset_index()
# plot_df[""] = plot_df["id_str"].rolling(window=14).mean()
# plot_df['count_MA'] = plot_df.groupby('coalicion')['id_str'].transform(lambda x: x.rolling(14,1).mean())

piv_df = plot_df.pivot(index=['date', 'coalicion'], columns='corrected_output', values='id_str')
piv_df["%POS"] = piv_df["POS"] / (piv_df["POS"] + piv_df["NEG"])
piv_df["%NEG"] = piv_df["NEG"] / (piv_df["POS"] + piv_df["NEG"])
piv_df['%NEG_MA7'] = piv_df.groupby('coalicion')['%NEG'].transform(lambda x: x.rolling(7, 1).mean())
piv_df["NEG/POS"] = piv_df["NEG"] / piv_df["POS"]
piv_df['NEG/POS_MA7'] = piv_df.groupby('coalicion')['NEG/POS'].transform(lambda x: x.rolling(7, 1).mean())

# piv_df['%NEG_MA7'] = piv_df["%NEG"].rolling(window=7).mean()
# piv_df['%POS_MA7'] = piv_df["%POS"].rolling(window=7).mean()
piv_df = piv_df.reset_index()


In [35]:
count_plot = plot_df[plot_df["corrected_output"]=="NEG"]
count_plot['count_MA'] = count_plot.groupby('coalicion')['id_str'].transform(lambda x: x.rolling(7,1).mean())


px.line(count_plot, "date", "count_MA", color="coalicion")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
px.area(count_plot, "date", "count_MA", color="coalicion", groupnorm="percent")

In [None]:
px.line(piv_df, "date", "NEG/POS_MA7", color="coalicion")

In [None]:
px.line(piv_df, "date", "%NEG_MA7", color="coalicion")

In [None]:
px.line(piv_df, "date", "%NEG_MA7", color="coalicion")

In [None]:
plot_df = merged_df.groupby(["date", "corrected_output", "coalicion"]).agg({"retweet_count":"sum"}).reset_index()
piv_df = plot_df.pivot(index=['date', 'coalicion'], columns='corrected_output', values='retweet_count')
piv_df["%POS"] = piv_df["POS"] / (piv_df["POS"] + piv_df["NEG"])
piv_df["%NEG"] = piv_df["NEG"] / (piv_df["POS"] + piv_df["NEG"])
piv_df['%NEG_MA7'] = piv_df.groupby('coalicion')['%NEG'].transform(lambda x: x.rolling(7, 1).mean())
piv_df["NEG/POS"] = piv_df["NEG"] / piv_df["POS"]
piv_df['NEG/POS_MA7'] = piv_df.groupby('coalicion')['NEG/POS'].transform(lambda x: x.rolling(7, 1).mean())

piv_df['NEG_MA7'] = piv_df.groupby('coalicion')['NEG'].transform(lambda x: x.rolling(7, 1).mean())

# piv_df['%NEG_MA7'] = piv_df["%NEG"].rolling(window=7).mean()
# piv_df['%POS_MA7'] = piv_df["%POS"].rolling(window=7).mean()
piv_df = piv_df.reset_index()
piv_df
px.line(piv_df, "date", "%NEG_MA7", color="coalicion")

In [None]:
px.area(piv_df, x="date", y="NEG_MA7", color="coalicion", groupnorm="percent")

In [None]:
plot_df = merged_df.groupby(["date", "corrected_output", "elected", "coalicion"]).agg({"id_str":"count"}).reset_index()
# plot_df[""] = plot_df["id_str"].rolling(window=14).mean()
# plot_df['count_MA'] = plot_df.groupby('coalicion')['id_str'].transform(lambda x: x.rolling(14,1).mean())

piv_df = plot_df.pivot(index=['date', 'elected', 'coalicion'], columns='corrected_output', values='id_str')
piv_df["%POS"] = piv_df["POS"] / (piv_df["POS"] + piv_df["NEG"])
piv_df["%NEG"] = piv_df["NEG"] / (piv_df["POS"] + piv_df["NEG"])
piv_df['%NEG_MA7'] = piv_df.groupby(['elected', 'coalicion'])['%NEG'].transform(lambda x: x.rolling(7, 1).mean())
piv_df["NEG/POS"] = piv_df["NEG"] / piv_df["POS"]
piv_df['NEG/POS_MA7'] = piv_df.groupby(['elected', 'coalicion'])['NEG/POS'].transform(lambda x: x.rolling(7, 1).mean())

# piv_df['%NEG_MA7'] = piv_df["%NEG"].rolling(window=7).mean()
# piv_df['%POS_MA7'] = piv_df["%POS"].rolling(window=7).mean()
piv_df = piv_df.reset_index()
piv_df["coalicion_elected"] = piv_df["coalicion"] + "_" + piv_df["elected"].astype("str")
px.line(piv_df, "date", "%NEG_MA7", color="coalicion_elected")

In [None]:
plot_df = merged_df.groupby(["corrected_output", "coalicion", "user__id_str", "elected"]).agg({"id_str":"count"}).reset_index()
piv_df = plot_df.pivot(index=['coalicion', "user__id_str", "elected"], columns='corrected_output', values='id_str')
piv_df = piv_df.reset_index()
piv_df["%NEG"] = piv_df["NEG"] / (piv_df["POS"] + piv_df["NEG"])
piv_df_grouped = piv_df.groupby(["coalicion", "elected"]).agg({"%NEG": ["mean", "median"]})
# px.bar(piv_df, "coalicion", "")
piv_df_grouped.columns = [f"{a}__{b}" for a, b in piv_df_grouped.columns]
piv_df_grouped = piv_df_grouped.reset_index()
piv_df_grouped["elected"] = piv_df_grouped["elected"].astype("category")
px.bar(piv_df_grouped, "coalicion", "%NEG__mean",color="elected", barmode='group')

In [None]:
px.bar(piv_df_grouped, "coalicion", "%NEG__median",color="elected", barmode='group')

In [None]:
plot_df = merged_df.groupby(["date", "corrected_output", "elected", "coalicion", "user__id_str"]).agg({"id_str":"count"}).reset_index()
# plot_df[""] = plot_df["id_str"].rolling(window=14).mean()
# plot_df['count_MA'] = plot_df.groupby('coalicion')['id_str'].transform(lambda x: x.rolling(14,1).mean())

piv_df = plot_df.pivot(index=['date', 'elected', 'coalicion', "user__id_str"], columns='corrected_output', values='id_str')
piv_df["%POS"] = piv_df["POS"] / (piv_df["POS"] + piv_df["NEG"])
piv_df["%NEG"] = piv_df["NEG"] / (piv_df["POS"] + piv_df["NEG"])
# piv_df["NEG/POS"] = piv_df["NEG"] / piv_df["POS"]
piv_df_grouped = piv_df.groupby(['date', 'elected', 'coalicion']).agg({"%NEG": ["mean", "median"]})

# piv_df['NEG/POS_MA7'] = piv_df.groupby(['elected', 'coalicion'])['NEG/POS'].transform(lambda x: x.rolling(7, 1).mean())

# piv_df['%NEG_MA7'] = piv_df["%NEG"].rolling(window=7).mean()
# piv_df['%POS_MA7'] = piv_df["%POS"].rolling(window=7).mean()
# piv_df = piv_df.reset_index()
# px.line(piv_df, "date", "%NEG_MA7", color="coalicion_elected")
piv_df_grouped.columns = [f"{a}__{b}" for a, b in piv_df_grouped]
piv_df_grouped = piv_df_grouped.reset_index().fillna(0)
piv_df_grouped['%NEG__mean_MA7'] = piv_df_grouped.groupby(['elected', 'coalicion'])['%NEG__mean'].transform(lambda x: x.rolling(7, 1).mean())

piv_df_grouped["coalicion_elected"] = piv_df_grouped["coalicion"] + "_" + piv_df_grouped["elected"].astype("str")

px.line(piv_df_grouped, "date", "%NEG__mean_MA7", color="coalicion_elected")

In [None]:
merged_df['week_start'] = merged_df['datetime'].dt.to_period('W').apply(lambda r: r.start_time)

plot_df = merged_df.groupby(["week_start","elected", "corrected_output", "district_percentage", "coalicion", "user__id_str"]).agg({"id_str":"count", "retweet_count": "sum"}).reset_index()
piv_df = plot_df.pivot(index=['week_start',"elected", 'district_percentage', 'coalicion', "user__id_str"], columns='corrected_output', values=['id_str', "retweet_count"])
piv_df.columns = [f"{a}__{b}" for a, b in piv_df.columns]
# piv_df["%POS"] = piv_df["id_str__POS"] / (piv_df["id_StrPOS"] + piv_df["NEG"])
piv_df["%NEG_count"] = piv_df["id_str__NEG"] / (piv_df["id_str__POS"] + piv_df["id_str__NEG"])
piv_df["%NEG_rt"] = piv_df["retweet_count__NEG"] / (piv_df["retweet_count__POS"] + piv_df["retweet_count__NEG"])
# # 
df3 = piv_df.reset_index().fillna(0)

df3
# fig = px.scatter(df3, "%NEG", "votes", animation_frame="date")
# fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 600
# fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 600

df3["date_str"] = df3["week_start"].astype("str")
df3["elected"] = df3["elected"].astype("str")
px.scatter(df3, "%NEG_count", "%NEG_rt", animation_frame="date_str", color="coalicion", range_x=[0, 1], range_y=[0,1])

In [1]:
px.line(piv_df, "date", "NEG/POS", color="elected")

NameError: name 'px' is not defined

In [None]:
for coalicion in plot_df["coalicion"].unique():
  fig = px.area(plot_df[(plot_df["coalicion"]==coalicion)], x="date", y="id_str", color="output", groupnorm="percent", title=coalicion)
  fig.show()

In [None]:
for coalicion in plot_df["coalicion"].unique():
  fig = px.area(plot_df[(plot_df["coalicion"]==coalicion)], x="date", y="id_str", color="corrected_output", groupnorm="percent", title=coalicion)
  fig.show()

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForCausalLM, XLMRobertaConfig, AutoTokenizer, AutoConfig
import torch

tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-xlm-roberta-base', use_fast=True)
config = AutoConfig.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
config.is_decoder = True
model = XLMRobertaForCausalLM.from_pretrained('cardiffnlp/twitter-xlm-roberta-base', config=config)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

prediction_logits = outputs.logits

In [None]:
prediction_logits.shape

torch.Size([1, 8, 250002])

In [None]:
from torch.nn import functional as F

probabilities = F.softmax(prediction_logits, dim=-1)
probabilities.shape

torch.Size([1, 8, 250002])

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 19.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 26.5 MB/s eta 0:00:01[K     |▉                               | 30 kB 28.4 MB/s eta 0:00:01[K     |█                               | 40 kB 18.9 MB/s eta 0:00:01[K     |█▍                              | 51 kB 9.5 MB/s eta 0:00:01[K     |█▋                              | 61 kB 10.1 MB/s eta 0:00:01[K     |██                              | 71 kB 9.5 MB/s eta 0:00:01[K     |██▏                             | 81 kB 10.5 MB/s eta 0:00:01[K     |██▍                             | 92 kB 10.6 MB/s eta 0:00:01[K     |██▊                             | 102 kB 8.7 MB/s eta 0:00:01[K     |███                             | 112 kB 8.7 MB/s eta 0:00:01[K     |███▎                            | 122 kB 8.7 MB/s eta 0:00:01[K     |███▌   

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.5 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 26.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[?25l[K     |█▏                              | 10 kB 30.2 MB/s eta 0:00:01[K     |██▍                             | 20 kB 26.2 MB/s eta 0:00:01[K     |███▋                            | 30 kB 11.5 MB/s eta 0:00:01[K     |████▉                           | 40 kB 9.3 MB/s eta 0:00:01[K     |██████                          | 51 kB 5.4 MB/s eta 0:00:01[K     |███████▎                        | 61 kB 5.9 MB/s eta 0:00:01[K     |████████▌                       | 71 kB 5.6 MB/s eta 0:00:01[K     |█████████▊                      | 81 kB 6.4 MB/s eta 0:00:01[K     |███████████                     | 92 kB 4.9 MB/s eta 0:00:01[K     |████████████▏                   | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████▍                  | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████████▋                 | 122 kB 5.2 MB/s eta 0:00:01[K     |███████████████▊                | 133 kB 5.2 MB/s eta 0:00:01

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer
import numpy as np
from datasets import load_metric
from transformers import TrainingArguments
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("finiteautomata/beto-sentiment-analysis")

model = AutoModelForSequenceClassification.from_pretrained("finiteautomata/beto-sentiment-analysis", num_labels=5, ignore_mismatched_sizes=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at finiteautomata/beto-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [None]:
data = merged_df[merged_df["corrected_output"].isin(["POS", "NEG"]) & (~merged_df["coalicion"].isin(["Otro"]))][["full_text__preprocess", "coalicion"]].rename(columns={"full_text__preprocess":"text", "coalicion":"labels"})

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(data, test_size=0.3)
dataset_train = Dataset.from_pandas(df_train).map(tokenize_function, batched=True).shuffle(seed=42).select(range(1000))

dataset_test = Dataset.from_pandas(df_test).map(tokenize_function, batched=True).shuffle(seed=42).select(range(1000))

dataset_train.labels = dataset_train["labels"]
dataset_test.labels = dataset_test["labels"]

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
# full_train_dataset = tokenized_datasets["train"]
# full_eval_dataset = tokenized_datasets["test"]

training_args = TrainingArguments("test_trainer", evaluation_strategy="epoch")
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=dataset_train,
  eval_dataset=dataset_test,
  compute_metrics=compute_metrics,
)
metrics = trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


RuntimeError: ignored

In [None]:
print(metrics)

{'eval_runtime': 71.2748, 'eval_samples_per_second': 14.03, 'eval_steps_per_second': 1.754}


In [None]:
t = trainer.train()


The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


RuntimeError: ignored