In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Relative parquet paths inside the dataset repository, keyed by split name.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}
# Only the "train" split is read here; the "test" entry is not used by this cell.
train_path = "hf://datasets/QuotaClimat/frugalaichallenge-text-train/" + splits["train"]
df = pd.read_parquet(train_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Show the raw quote column (pandas truncates the display on its own).
df.loc[:, 'quote']

Unnamed: 0,quote
0,Interesting to note that Oklahoma minimum temp...
1,REPORT ON GEO ENGINEERING A.k.a. man-made FAKE...
2,"To realize, yes, climate change is a real prob..."
3,The fluctuations in the carbon dioxide concent...
4,I know the comments that were made after that ...
...,...
4867,The morally superior choice is for people to b...
4868,The climate is changing. No one is denying cli...
4869,It will be up to 15 years before it is possibl...
4870,"Global warming is real, but a problem, not the..."


In [7]:
# Keep only the text and its label for a side-by-side look at the data.
# NOTE(review): `df2` is not referenced by any later cell visible here.
df2 = df.loc[:, ['quote', 'label']]
df2

Unnamed: 0,quote,label
0,Interesting to note that Oklahoma minimum temp...,0_not_relevant
1,REPORT ON GEO ENGINEERING A.k.a. man-made FAKE...,6_proponents_biased
2,"To realize, yes, climate change is a real prob...",3_not_bad
3,The fluctuations in the carbon dioxide concent...,2_not_human
4,I know the comments that were made after that ...,0_not_relevant
...,...,...
4867,The morally superior choice is for people to b...,7_fossil_fuels_needed
4868,The climate is changing. No one is denying cli...,1_not_happening
4869,It will be up to 15 years before it is possibl...,0_not_relevant
4870,"Global warming is real, but a problem, not the...",4_solutions_harmful_unnecessary


In [8]:
def lowering(sentence):
    """Return ``sentence`` converted to lower case.

    The conversion is delegated to the argument's own ``lower()`` method,
    so any object exposing that method (typically a ``str``) is accepted.
    """
    lowered = sentence.lower()
    return lowered

In [9]:
# Lower-case every quote with pandas' vectorized string accessor instead of a
# per-row Python call. For string values the result is the same as
# `df['quote'].apply(lowering)`; a missing quote is propagated as missing
# instead of raising AttributeError.
# The column is added to `df` in place because the vectorizer cell reads it.
df['processed_quote'] = df['quote'].str.lower()
df['processed_quote']

Unnamed: 0,processed_quote
0,interesting to note that oklahoma minimum temp...
1,report on geo engineering a.k.a. man-made fake...
2,"to realize, yes, climate change is a real prob..."
3,the fluctuations in the carbon dioxide concent...
4,i know the comments that were made after that ...
...,...
4867,the morally superior choice is for people to b...
4868,the climate is changing. no one is denying cli...
4869,it will be up to 15 years before it is possibl...
4870,"global warming is real, but a problem, not the..."


In [10]:
# Bag-of-words features: learn the vocabulary from the lower-cased quotes and
# build the document-term count matrix in a single fit_transform call.
# NOTE(review): per the scikit-learn docs CountVectorizer lowercases by default,
# so the explicit lowering step may be redundant — confirm.
# NOTE(review): `X_train` is not referenced by any later cell visible here.
processed_quotes = df['processed_quote']
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(processed_quotes)


# Training pipeline

In [71]:
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer


In [28]:
# Model inputs: the raw quote text is the feature, the label column the target.
X, y = df["quote"], df["label"]

In [81]:
# Two-step pipeline: token counts from CountVectorizer feed a multinomial
# Naive Bayes classifier. Both steps use their default settings.
vectorizer_step = CountVectorizer()
classifier_step = MultinomialNB()
pipeline_naive_bayes = make_pipeline(vectorizer_step, classifier_step)

In [79]:
# 5-fold cross-validation of the pipeline, scored with "f1_macro"
# (one score per fold, stored under the "test_score" key of the result).
cv_results = cross_validate(
    estimator=pipeline_naive_bayes,
    X=X,
    y=y,
    scoring="f1_macro",
    cv=5,
)
cv_results

{'fit_time': array([0.22471046, 0.22895551, 0.24040222, 0.21953368, 0.20870471]),
 'score_time': array([0.05616379, 0.04852867, 0.05327678, 0.05132031, 0.05161834]),
 'test_score': array([0.51103228, 0.47192073, 0.51317179, 0.50646933, 0.48026989])}

In [80]:
# Mean of the per-fold F1 scores held in cv_results["test_score"],
# displayed rounded to 2 decimal places.
fold_scores = cv_results["test_score"]
average_f1 = np.mean(fold_scores)
np.round(average_f1, decimals=2)



0.5