In [None]:
!pip install transformers




In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive into the Colab filesystem; the CSV paths used below live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training and test CSVs from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/Sentiment Analysis/train.csv'
test_csv_path = '/content/drive/MyDrive/Sentiment Analysis/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
nltk.download('stopwords')

# Build the stopword set once. Calling stopwords.words('english') inside the per-word filter
# rebuilt the whole list for every word and then searched it linearly; a set built up front
# gives the same result with constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalise a raw text string before tokenization.

    Steps, in order: strip HTML-like tags, remove digit runs, lowercase, drop every
    character that is neither a word character nor whitespace, then remove English
    stopwords and collapse runs of whitespace into single spaces.

    Args:
        text: Raw text. Non-string values (e.g. NaN from an empty CSV cell) are
            treated as empty text instead of raising a TypeError in re.sub.

    Returns:
        The cleaned, space-joined string; may be empty.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # NOTE(review): the NLTK English list contains negations such as 'not' and 'no', so they
    # are stripped here as well -- confirm that is wanted for sentiment classification.
    return " ".join(word for word in text.split() if word not in _ENGLISH_STOPWORDS)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Store the cleaned text in a new column; the raw 'text' column is left as loaded.
cleaned_train_text = train_df['text'].map(clean_text)
train_df['CleanedText'] = cleaned_train_text

In [None]:
# Encode the string sentiment labels as integer class ids (one per unit of the model's
# 3-way output layer).
polarity_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
train_df['PolarityLabel'] = train_df['polarity'].map(polarity_mapping)

# Series.map() yields NaN for any value that is not a key of the mapping (typos, different
# casing, missing cells). Such rows would reach the loss as invalid targets, so fail fast here.
unmapped_labels = train_df.loc[train_df['PolarityLabel'].isna(), 'polarity'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped polarity labels in training data: {list(unmapped_labels)}")

In [None]:
# Hold out 20% of the cleaned training rows for validation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['CleanedText'],
    train_df['PolarityLabel'],
    test_size=0.2,
    random_state=42,
)

In [None]:
max_length = 128
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def _encode_texts(texts):
    """Tokenize `texts` with the shared tokenizer and return the encoding's underlying dict.

    Sequences are padded, truncated at `max_length` tokens, and returned as TensorFlow tensors.
    """
    batch = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    return batch.data


X_train_encoded = _encode_texts(X_train)
X_val_encoded = _encode_texts(X_val)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
class SentimentModel(tf.keras.Model):
    """Pretrained DistilBERT encoder followed by a 3-unit softmax classification layer."""

    def __init__(self):
        super().__init__()
        # Encoder initialised from the 'distilbert-base-uncased' checkpoint.
        self.distilbert = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
        # Three output units with softmax: one score per polarity class.
        self.dense = tf.keras.layers.Dense(3, activation='softmax')

    def call(self, inputs):
        """Run the encoder on `inputs` and classify from its first-position hidden state."""
        encoder_outputs = self.distilbert(inputs)
        hidden_states = encoder_outputs[0]
        # Position 0 along the sequence axis is used as the summary vector for the whole input.
        first_token_state = hidden_states[:, 0, :]
        return self.dense(first_token_state)

In [None]:
# Integer class ids are used as targets, hence the sparse categorical cross-entropy loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model = SentimentModel()
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Stop once val_loss has failed to improve for 3 consecutive epochs and roll the model back to
# the weights of the best-val_loss epoch. Without restore_best_weights the model keeps the
# weights of the last (worse) epoch, and every later prediction would be made with those.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
# Keep the History object so the per-epoch metrics can be inspected later, instead of letting
# the cell echo its bare repr.
history = model.fit(
    X_train_encoded,
    y_train,
    epochs=5,
    batch_size=16,
    validation_data=(X_val_encoded, y_val),
    callbacks=[early_stop],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.src.callbacks.History at 0x79494d7756c0>

In [None]:
# Apply the same cleaning and tokenizer settings to the test texts as were used for training.
cleaned_test_text = test_df['text'].map(clean_text)
test_df['CleanedText'] = cleaned_test_text
test_batch = tokenizer(
    list(test_df['CleanedText']),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)
test_encoded = test_batch.data

In [None]:
# One row of class scores per test text; the highest-scoring column is the predicted class id.
predictions = model.predict(test_encoded)
predicted_classes = predictions.argmax(axis=1)



In [None]:
# Pair each test id with its predicted class id and write the result to Drive without the
# DataFrame index.
submission_path = '/content/drive/MyDrive/Sentiment Analysis/submission_distilbert.csv'
submission = pd.DataFrame(
    {
        'Id': test_df['id'],
        'Polarity': predicted_classes,
    }
)
submission.to_csv(submission_path, index=False)

In [None]:
# Draw a fixed, reproducible sample of 80 validation texts for manual error analysis.
error_analysis_data = X_val.sample(
    n=80,
    random_state=1,
)

In [None]:
# Tokenize the sampled texts with the same settings as the training data, then score them.
error_analysis_batch = tokenizer(
    list(error_analysis_data),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)
error_analysis_encoded = error_analysis_batch.data
error_analysis_predictions = model.predict(error_analysis_encoded)



In [None]:
# Look up the true labels through the sample's index so they line up row-for-row with the
# predicted class ids.
error_analysis_actual_classes = y_val.loc[error_analysis_data.index]
error_analysis_predicted_classes = error_analysis_predictions.argmax(axis=1)

In [None]:
# Print only the sampled validation examples whose predicted class differs from the true label.
sampled_rows = zip(
    error_analysis_predicted_classes,
    error_analysis_actual_classes,
    error_analysis_data,
)
for predicted_class, actual_class, text in sampled_rows:
    if predicted_class == actual_class:
        continue
    print(f"Text: {text}\nPredicted: {predicted_class}\nActual: {actual_class}\n")

Predicted: 0
Actual: 1

Text: excellent resource locale data website download xml version database includes datetime formats number formats lots locale specific data
Predicted: 1
Actual: 2

Text: rendering issue fixed ie need worry
Predicted: 0
Actual: 2

Text: jonhanna invoke hoares dictum knuths paper clarity optimization quote never actually appears paper rather wrote previous discussion concluded premature emphasis efficiency big mistake may well source programming complexity grief
Predicted: 0
Actual: 1

Text: use symlinks instead alias sad ui go terminal type first path base path original file second base path symlink filefolder etc
Predicted: 0
Actual: 1

Text: wow guys reply quickly im hard time following
Predicted: 0
Actual: 2

Text: check though personally recommend backbone agile fast learn leaves lot freedom happiness imo special thing backbone compared others community fancy feature replace
Predicted: 1
Actual: 2

Text: came across idiom opensource python choked drink rath