In [None]:
# Mount Google Drive; the CSV paths used below live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Build the text-classification pipeline once at module level; get_sentiment()
# reads it through the global name `sentiment_pipeline`.
print("Loading sentiment analysis model...")
sentiment_pipeline = pipeline(
    task="text-classification",
    model="finiteautomata/bertweet-base-sentiment-analysis",
)

# Function to get sentiment label
def get_sentiment(text):
    """Classify one sentence and return a single-letter sentiment code.

    Args:
        text: The sentence to classify. Non-string values are converted with
            ``str`` before classification. Missing values (``None``/NaN) and
            blank strings are never sent to the model.

    Returns:
        ``"P"`` when the model predicts ``POS``, ``"N"`` for ``NEG``, ``"M"``
        for any other label (the neutral class) and for empty input, or
        ``"E"`` if classification raised an exception.
    """
    try:
        if pd.isna(text) or str(text).strip() == "":
            # Nothing to classify: report the neutral code. ("N" is the
            # NEGATIVE code, so returning it here would mislabel empty rows.)
            return "M"

        # Let the tokenizer truncate to its configured maximum length. A
        # character slice does not bound the token count, and slicing would
        # also fail on non-string values, hence the str() coercion.
        result = sentiment_pipeline(str(text), truncation=True)
        label = result[0]['label']

        # Map model labels to the dataset's codes; anything else is neutral.
        return {"POS": "P", "NEG": "N"}.get(label, "M")
    except Exception as e:
        # Best-effort labelling: log the failing row and mark it with "E"
        # so one bad input does not abort the whole run.
        print(f"Error processing text: {text}. Error: {str(e)}")
        return "E"

# Read the source CSV from the mounted Drive
print("Loading dataset...")
dataset_path = r"/content/drive/MyDrive/Dataset.csv"
df = pd.read_csv(dataset_path, encoding='utf-8-sig')
print(f"Dataset loaded with {len(df)} rows.")

# Label every English sentence; tqdm.pandas() registers progress_apply so the
# row-by-row classification shows a progress bar.
print("Analyzing sentiment for English sentences...")
tqdm.pandas()
english_sentences = df['English_Sentence_text']
sentiment_labels = english_sentences.progress_apply(get_sentiment)
df['Sentiment(P/N)'] = sentiment_labels

# Write the frame, now including the label column, back to Drive
output_path = r"/content/drive/MyDrive/Final_Labeled_Dataset.csv"
df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"\n✅ Sentiment analysis complete! Labeled dataset saved to: {output_path}")

Loading sentiment analysis model...


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device set to use cuda:0


Loading dataset...
Dataset loaded with 237019 rows.
Analyzing sentiment for English sentences...


  0%|          | 11/237019 [00:00<3:10:42, 20.71it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 237019/237019 [35:36<00:00, 110.95it/s]



✅ Sentiment analysis complete! Labeled dataset saved to: /content/drive/MyDrive/Final_Labeled_Dataset.csv


In [None]:
import pandas as pd
import re

# Read the labeled CSV written by the sentiment-labelling step
input_path = r"/content/drive/MyDrive/Final_Labeled_Dataset.csv"
df = pd.read_csv(input_path, encoding='utf-8-sig')
print("Original dataset shape:", df.shape)

# 1. Drop rows carrying the neutral code 'M'; every other value is kept.
#    The copy gives later steps a frame independent of `df`.
not_neutral = df['Sentiment(P/N)'].ne('M')
df_filtered = df.loc[not_neutral].copy()
print("After removing neutral:", df_filtered.shape)
# 2. Remove short sentences (<5 words)
def count_urdu_words(sentence):
    """Count the words in a sentence, treating listed punctuation as separators.

    The text is split on runs of whitespace and the marks ۔ ، ؛ ؟ ! and the
    non-empty pieces are counted. Missing values (per ``pd.isna``) count as
    zero words; any other value is converted with ``str`` first.
    """
    if pd.isna(sentence):
        return 0
    # Splitting can leave empty strings at either end when the text starts or
    # ends with a separator; those are not words.
    pieces = re.split(r'[۔،؛؟!\s]+', str(sentence))
    return sum(1 for piece in pieces if piece)

# Keep only sentences with at least MIN_WORDS words. The counts live in a
# standalone Series used as a mask, so no temporary column is written onto
# (and later dropped from) the frame.
MIN_WORDS = 5
word_counts = df_filtered['Sentence_text'].apply(count_urdu_words)
df_filtered = df_filtered[word_counts >= MIN_WORDS]
print("After removing short sentences:", df_filtered.shape)

# 3. Balance sentiments (max 20k each)
# NOTE: each class is capped at MAX_SAMPLES rows; the classes are not
# equalised, so the smaller class keeps all of its rows.
MAX_SAMPLES = 20000
SHUFFLE_SEED = 42  # fixed so the saved row order is the same on every run

# Split by sentiment
pos = df_filtered[df_filtered['Sentiment(P/N)'] == 'P']
neg = df_filtered[df_filtered['Sentiment(P/N)'] == 'N']

# First MAX_SAMPLES rows of each class; head() already returns everything
# when fewer rows are available.
pos_limited = pos.head(MAX_SAMPLES)
neg_limited = neg.head(MAX_SAMPLES)

# Negatives beyond the cap (an empty frame when there are none)
neg_extra = neg.iloc[MAX_SAMPLES:]
has_extra_negatives = len(neg_extra) > 0

# Combine and shuffle deterministically
df_balanced = (
    pd.concat([pos_limited, neg_limited])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# 4. Save results
# Balanced dataset
balanced_path = r"/content/drive/MyDrive/Ultra_Final_Labeled_Dataset.csv"
df_balanced.to_csv(balanced_path, index=False, encoding='utf-8-sig')

# Extra negatives (if any)
extra_path = r"/content/drive/MyDrive/Negative_Final_Labeled_Dataset.csv"
if has_extra_negatives:
    neg_extra.to_csv(extra_path, index=False, encoding='utf-8-sig')

# 5. Print statistics
print("\nFinal Counts:")
print(f"Positive: {len(pos_limited)}")
print(f"Negative: {len(neg_limited)}")
if has_extra_negatives:
    print(f"Extra negatives saved: {len(neg_extra)}")

print(f"\n✅ Balanced dataset saved to: {balanced_path}")
if has_extra_negatives:
    print(f"✅ Extra negatives saved to: {extra_path}")

print("\nSample of balanced data:")
print(df_balanced[['Sentence_text', 'Sentiment(P/N)']].head())

Original dataset shape: (38258, 8)
After removing neutral: (38258, 8)
After removing short sentences: (38258, 8)

Final Counts:
Positive: 18258
Negative: 20000

✅ Balanced dataset saved to: /content/drive/MyDrive/Ultra_Final_Labeled_Dataset.csv

Sample of balanced data:
                             Sentence_text Sentiment(P/N)
0       لیکن کم از کم وہ ہم سے مل سکتا ہے۔              P
1          تو میں وکیل کے پاس نہیں جاسکتا۔              N
2  آپ کا خوبصورت نمو آپ سے کیا کہہ رہا ہے؟              P
3   آپ نے جو کچھ بھی کیا ، یہ بہت غلط تھا۔              N
4   میں نے سوچا کہ آپ کسی پریشانی میں ہیں۔              N
