<a href="https://colab.research.google.com/github/koushik980/NLP/blob/main/NLP_F_25_09_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------- Robust data loading (replace previous load section) ----------
import os
import pandas as pd
import numpy as np

# set this to your preferred path or leave as-is
DATA_PATH = '/path/to/your/sentiment_analysis_data.csv'  # edit if you have a custom path
FALLBACK_NAME = 'tweets.csv'  # common fallback
AUTO_CREATE_SAMPLE = True     # if True and no file found, generate a synthetic sample

def create_synthetic_tweets_csv(path='tweets.csv', n=1000, seed=42):
    """Create a balanced synthetic tweets.csv with n samples (n should be even)."""
    np.random.seed(seed)
    n = int(n // 2) * 2  # ensure even
    pos_templates = [
        "I love this! {}",
        "Amazing product {}, would buy again!",
        "This made my day {}",
        "So happy with the results {}",
        "Absolutely fantastic experience {}",
        "Great job, very satisfied {}",
        "Highly recommend this {}",
        "Five stars! {}",
        "So good, impressed {}",
        "Loved every bit of it {}"
    ]
    neg_templates = [
        "Terrible experience {}, never again.",
        "Very disappointed {}",
        "Worst purchase ever {}",
        "This ruined my day {}",
        "Completely useless {}, do not buy.",
        "Awful service {}, not recommended.",
        "Regret buying this {}",
        "One star {}, poor quality.",
        "Frustrating and buggy {}",
        "Not worth the money {}"
    ]
    rows = []
    for i in range(n // 2):
        rows.append({"tweet": pos_templates[i % len(pos_templates)].format(""), "label": 1})
        rows.append({"tweet": neg_templates[i % len(neg_templates)].format(""), "label": 0})
    df_sample = pd.DataFrame(rows)
    df_sample = df_sample.sample(frac=1, random_state=seed).reset_index(drop=True)
    df_sample.to_csv(path, index=False)
    return df_sample

# Try to load dataset with helpful fallback and clear error messages
df = None

if os.path.exists(DATA_PATH):
    print(f"✔ Found dataset at DATA_PATH: {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)
elif os.path.exists(FALLBACK_NAME):
    print(f"✔ Found dataset at fallback path: {FALLBACK_NAME}")
    DATA_PATH = FALLBACK_NAME
    df = pd.read_csv(FALLBACK_NAME)
else:
    # If running in Google Colab, allow upload
    try:
        from google.colab import files
        print("No dataset found at the configured paths.")
        print("If you'd like to upload a file from your computer, run the code below in a new cell:")
        print("from google.colab import files\nuploaded = files.upload()\n# then set DATA_PATH to the uploaded filename")
    except Exception:
        # not Colab or files.upload not available - ignore
        pass

    if AUTO_CREATE_SAMPLE:
        print("No dataset found. Auto-creating a synthetic 'tweets.csv' for testing (balanced, 1000 rows).")
        df = create_synthetic_tweets_csv(path=FALLBACK_NAME, n=1000, seed=42)
        DATA_PATH = FALLBACK_NAME
        print(f"Synthetic dataset saved to: {FALLBACK_NAME}")
    else:
        raise FileNotFoundError(
            f"Data file not found at '{DATA_PATH}' or '{FALLBACK_NAME}'.\n"
            "Either upload a CSV file (use files.upload() in Colab), or set DATA_PATH to the correct path."
        )

# Validate required columns
REQUIRED_COLS = ['tweet', 'label']
for col in REQUIRED_COLS:
    if col not in df.columns:
        raise ValueError(f"Dataset missing required column '{col}'. Found columns: {list(df.columns)}")

print(f"Loaded {len(df)} rows from {DATA_PATH}")
# optional: show head
print(df.head())

# Now continue with your pipeline; e.g.:
# df['clean'] = df['tweet'].astype(str).apply(clean_text)

No dataset found at the configured paths.
If you'd like to upload a file from your computer, run the code below in a new cell:
from google.colab import files
uploaded = files.upload()
# then set DATA_PATH to the uploaded filename
No dataset found. Auto-creating a synthetic 'tweets.csv' for testing (balanced, 1000 rows).
Synthetic dataset saved to: tweets.csv
Loaded 1000 rows from tweets.csv
                                tweet  label
0  Terrible experience , never again.      0
1              Frustrating and buggy       0
2                       I love this!       1
3                       I love this!       1
4    Awful service , not recommended.      0
