In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


In [2]:
# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

In [3]:
# Fake comments templates (grouped by pseudo-topics)
templates = {
    'quality_positive': [
        "The product quality is excellent, far better than expected. Highly recommend for daily use.",
        "Amazing build quality! It feels premium and works flawlessly.",
        "Great quality materials. No complaints here, worth every penny."
    ],
    'delivery_issue': [
        "Delivery was late by a week, very disappointing. Product is fine but service needs improvement.",
        "Package arrived damaged due to poor shipping. Had to return it.",
        "Late delivery again. This is the second time with your company."
    ],
    'service_complaint': [
        "Customer service was unhelpful and rude. Won't buy again.",
        "Took forever to get a response from support. Frustrating experience.",
        "Billing error not resolved quickly. Poor service overall."
    ],
    'recommendation': [
        "Love it! Would recommend to friends and family.",
        "Best purchase this year. Fast and easy.",
        "Outstanding value. Five stars all the way."
    ]
}

In [5]:
# Generate 100 comments
comments = []
ratings = []
dates = []
ids = list(range(1, 101))

for i in range(100):
    # Random topic
    topic = random.choice(list(templates.keys()))
    template = random.choice(templates[topic])
    # Add variation: typos, length
    if random.random() < 0.1:  # 10% NaN
        comment = np.nan
        rating = random.randint(1, 3)  # Low for issues
    else:
        # Minor variation
        comment = template + f" (Test {i+1})"
        if random.random() < 0.2:  # 20% longer
            comment += " Plus, the color options are fantastic."
        rating = 5 if 'positive' in topic or 'recommendation' in topic else random.randint(1, 3)
    
    comments.append(comment)
    ratings.append(rating)
    
    # Random dates (last year)
    start_date = datetime(2024, 1, 1)
    random_days = random.randint(0, 365)
    date = (start_date + timedelta(days=random_days)).strftime('%Y-%m-%d')
    dates.append(date)

# Create DataFrame
df = pd.DataFrame({
    'id': ids,
    'date': dates,
    'rating': ratings,
    'comments': comments
})

# Save to CSV
df.to_csv('data\data.csv', index=False)
print("Fake dataset 'data.csv' created with 100 rows.")
print(df.head())  # Preview
print(f"Sample topics distribution: {pd.Series([k for c in comments if pd.notna(c) for k in templates if any(t in c.lower() for t in templates[k])]).value_counts()}")

Fake dataset 'data.csv' created with 100 rows.
   id        date  rating                                           comments
0   1  2024-06-05       5  Great quality materials. No complaints here, w...
1   2  2024-09-14       5  Amazing build quality! It feels premium and wo...
2   3  2024-07-20       3                                                NaN
3   4  2024-08-09       5  Outstanding value. Five stars all the way. (Te...
4   5  2024-10-19       2  Took forever to get a response from support. F...
Sample topics distribution: Series([], Name: count, dtype: int64)


  df.to_csv('data\data.csv', index=False)


In [6]:
len(df)

100