In [1]:
!pip install sacremoses
!pip install flair

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895239 sha256=0c39be979e1cb7c4f03573157496e6f1b9fdc2a8ef55474c3bacee43bc10f8f1
  Stored in directory: /root/.cache/pip/wheels/00/24/97/a2ea5324f36bc626e1ea0267f33db6aa80d157ee977e9e42fb
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53
Collecting flair
  Downloading flair-0.12.2-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.1/373.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting segtok>=1.5.7 (from flair)
  Downloading segtok-1.5.1

In [2]:
import nltk

# Fetch the VADER lexicon that SentimentIntensityAnalyzer needs at construction time.
# NOTE(review): VADER's lexicon is English-oriented; confirm it is appropriate
# for the French comments it is applied to further down.
nltk.download(info_or_id='vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
import pandas as pd
import torch
from flair.models import TextClassifier
from flair.data import Sentence
from transformers import (FlaubertTokenizer, FlaubertForSequenceClassification)
from nltk.sentiment import SentimentIntensityAnalyzer

# --- Model loading ---------------------------------------------------------
# Flair's generic 'sentiment-fast' classifier.
# NOTE(review): the download log shows this resolves to an English checkpoint
# (sentiment-en-mix-ft-rnn_v8.pt), not a French-specific model.
flair_classifier = TextClassifier.load('sentiment-fast')

# FlauBERT, cased variant: tokenizer + sequence-classification model.
# NOTE(review): transformers warns at load time that the classification head
# ('sequence_summary.summary.*') is newly initialised for both FlauBERT
# checkpoints, i.e. they have not been fine-tuned for sentiment.
flaubert_model_name = 'flaubert/flaubert_base_cased'
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(flaubert_model_name)
flaubert_model = FlaubertForSequenceClassification.from_pretrained(flaubert_model_name)

# FlauBERT, uncased variant: same pairing of tokenizer and model.
flaubert_base_uncased_model_name = 'flaubert/flaubert_base_uncased'
flaubert_base_uncased_tokenizer = FlaubertTokenizer.from_pretrained(flaubert_base_uncased_model_name)
flaubert_base_uncased_model = FlaubertForSequenceClassification.from_pretrained(flaubert_base_uncased_model_name)

# Read the comments file and discard every row whose "commentaire" cell is missing,
# so the scoring loop only ever sees rows that have a comment value.
data = pd.read_csv('/content/data_cleann.csv').dropna(subset=['commentaire'])

# Lexicon-based scorer from NLTK; used as one of the ensemble voters.
sia = SentimentIntensityAnalyzer()

def _flaubert_vote(tokenizer, model, text, max_length=128):
    """Return 'positive' or 'negative' for `text` from one FlauBERT classifier.

    Args:
        tokenizer: tokenizer matching `model`.
        model: a FlaubertForSequenceClassification instance.
        text: the comment to classify.
        max_length: inputs are truncated to this many tokens.

    Returns:
        'positive' if the first logit of the (single) sequence is > 0,
        otherwise 'negative'.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**encoded).logits
    # NOTE(review): decision rule kept from the original notebook. The load-time
    # warning says the classification head is newly initialised, so until the
    # model is fine-tuned this vote carries no real sentiment signal.
    return 'positive' if logits[0][0] > 0 else 'negative'


def _majority_vote(votes):
    """Return the most frequent label in `votes`; ties go to the earliest vote.

    Iterating the list itself (rather than a set of it) keeps tie-breaking
    independent of string hashing, so results are reproducible across runs.
    """
    return max(votes, key=votes.count)


# One ensemble label per row of data['commentaire'], in row order
ensemble_predicted_sentiments = []

for french_text in data['commentaire']:
    votes = []

    # Flair vote. Its label values are upper-case ('POSITIVE' / 'NEGATIVE'),
    # so normalise to the lower-case vocabulary the other voters use; otherwise
    # this vote can never agree with (or be counted alongside) the others.
    sentence_flair = Sentence(french_text)
    flair_classifier.predict(sentence_flair)
    if sentence_flair.labels:  # no label may be attached, e.g. for blank text
        votes.append(sentence_flair.labels[0].value.lower())

    # FlauBERT votes (cased, then uncased)
    votes.append(_flaubert_vote(flaubert_tokenizer, flaubert_model, french_text))
    votes.append(_flaubert_vote(flaubert_base_uncased_tokenizer, flaubert_base_uncased_model, french_text))

    # Lexicon vote: compound score above 0.1 counts as positive
    sentiment_score = sia.polarity_scores(french_text)['compound']
    votes.append('positive' if sentiment_score > 0.1 else 'negative')

    # Majority vote; on a 2-2 split the earliest vote (Flair, when present) wins
    ensemble_predicted_sentiments.append(_majority_vote(votes))

# Attach the per-row ensemble labels to the frame as a new column
data = data.assign(ensemble_predicted_sentiment=ensemble_predicted_sentiments)

# Show each comment next to the label the ensemble gave it
report_columns = ['commentaire', 'ensemble_predicted_sentiment']
print(data[report_columns])

2023-07-31 11:00:26,259 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-fasttext-rnn/sentiment-en-mix-ft-rnn_v8.pt not found in cache, downloading to /tmp/tmpeu1hzid_


100%|██████████| 1.16G/1.16G [01:19<00:00, 15.6MB/s]

2023-07-31 11:01:46,740 copying /tmp/tmpeu1hzid_ to cache at /root/.flair/models/sentiment-en-mix-ft-rnn_v8.pt





2023-07-31 11:01:51,055 removing temp file /tmp/tmpeu1hzid_


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/896k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/553M [00:00<?, ?B/s]

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading pytorch_model.bin:   0%|          | 0.00/550M [00:00<?, ?B/s]

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_uncased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
