In [1]:
import pandas as pd

# Load the dataset
file_path = "dataset_reddit-scraper-task_2025-03-28_19-30-24-130.csv"
df = pd.read_csv(file_path)

# Display basic info and first few rows
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 156 entries, body to username
dtypes: float64(20), object(136)
memory usage: 1.2+ MB


(None,
                                                 body categories/0  \
 0                                                NaN          new   
 1  I'm sure many people here would have seen the ...          NaN   
 2  I actually value Apple's approach to making ev...          NaN   
 3  There will never be decent AI on any of these ...          NaN   
 4  There are better and better , tinier and tinie...          NaN   
 
           categories/1 category communityName                 createdAt  \
 0  ?include_over_18=on      NaN           NaN  2008-01-25T03:43:06.000Z   
 1                  NaN      NaN       r/apple  2025-03-27T18:54:22.000Z   
 2                  NaN    apple       r/apple  2025-03-28T12:33:58.000Z   
 3                  NaN    apple       r/apple  2025-03-28T13:48:10.000Z   
 4                  NaN    apple       r/apple  2025-03-28T15:25:45.000Z   
 
     dataType                                        description displayName  \
 0  community  An unofficial comm

In [11]:
import pandas as pd

# Selecting relevant columns
columns_to_keep = [
    "body", "title", "communityName", "createdAt", "dataType", "upVotes", 
    "upVoteRatio", "url", "username"
]
#print(columns_to_keep[3])

df_cleaned = df[columns_to_keep].copy(deep=True)
total_upvotes = df_cleaned["upVotes"].sum()
print(f"Total upvotes: {total_upvotes}")



# Convert createdAt to datetime, checking for issues
df_cleaned["createdAt"] = pd.to_datetime(df_cleaned["createdAt"], errors="coerce")
print(f"Invalid dates converted to NaT: {df_cleaned['createdAt'].isna().sum()}")

# Convert upVotes and upVoteRatio to numeric, handling errors
df_cleaned["upVotes"] = pd.to_numeric(df_cleaned["upVotes"], errors="coerce").fillna(0).astype(int)
df_cleaned["upVoteRatio"] = pd.to_numeric(df_cleaned["upVoteRatio"], errors="coerce").fillna(0)

# Fill missing values in text columns with empty strings
text_columns = ["body", "title", "communityName", "username"]
for col in text_columns:
    df_cleaned[col] = df_cleaned[col].replace({"None": "", "nan": "", "null": ""}, regex=True).fillna("")
    
# Replace empty usernames with "Unknown"
df_cleaned["username"] = df_cleaned["username"].replace("", "Unknown")

# Display the cleaned dataset info and first few rows
df_cleaned.info(), df_cleaned.head()
df.head()

Total upvotes: 86779.0
Invalid dates converted to NaT: 0
<class 'pandas.core.frame.DataFrame'>
Index: 955 entries, 1 to 998
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   body           955 non-null    object             
 1   title          955 non-null    object             
 2   communityName  955 non-null    object             
 3   createdAt      955 non-null    datetime64[ns, UTC]
 4   dataType       955 non-null    object             
 5   upVotes        955 non-null    int64              
 6   upVoteRatio    955 non-null    float64            
 7   url            955 non-null    object             
 8   username       955 non-null    object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(6)
memory usage: 74.6+ KB


Unnamed: 0,body,categories/0,categories/1,category,communityName,createdAt,dataType,description,displayName,flair,...,day_of_week,has_image,text_length,hour_sin,hour_cos,day_sin,day_cos,sentiment,log_upVotes,community_mean_upvotes
1,I'm sure many people here would have seen the ...,,,,r/apple,2025-03-27 18:54:22+00:00,post,,,Apple Intelligence,...,3,0,1156,-1.0,-1.83697e-16,0.433884,-0.900969,0.138446,0.0,90.868063
2,I actually value Apple's approach to making ev...,,,apple,r/apple,2025-03-28 12:33:58+00:00,comment,,,,...,4,0,572,1.224647e-16,-1.0,-0.433884,-0.900969,-0.032887,2.564949,90.868063
3,There will never be decent AI on any of these ...,,,apple,r/apple,2025-03-28 13:48:10+00:00,comment,,,,...,4,0,135,-0.258819,-0.9659258,-0.433884,-0.900969,0.083333,1.098612,90.868063
4,"There are better and better , tinier and tinie...",,,apple,r/apple,2025-03-28 15:25:45+00:00,comment,,,,...,4,0,542,-0.7071068,-0.7071068,-0.433884,-0.900969,0.141667,0.693147,90.868063
5,If it doesn't work properly then it's not wort...,,,apple,r/apple,2025-03-28 13:53:50+00:00,comment,,,,...,4,0,51,-0.258819,-0.9659258,-0.433884,-0.900969,-0.075,0.693147,90.868063


In [5]:
# Selecting relevant columns
columns_to_keep = [
    "body", "title", "communityName", "createdAt", "dataType", "upVotes", 
    "upVoteRatio", "url", "username"
]
df_cleaned = df[columns_to_keep].copy()

# Convert createdAt to datetime
df_cleaned["createdAt"] = pd.to_datetime(df_cleaned["createdAt"], errors="coerce")

df_cleaned["upVotes"] = pd.to_numeric(df_cleaned["upVotes"], errors="coerce").fillna(0).astype(int)
df_cleaned["upVoteRatio"] = pd.to_numeric(df_cleaned["upVoteRatio"], errors="coerce").fillna(0)

# Filling missing values in text columns with empty strings
df_cleaned["body"] = df_cleaned["body"].fillna("")
df_cleaned["title"] = df_cleaned["title"].fillna("")
df_cleaned["communityName"] = df_cleaned["communityName"].fillna("")
df_cleaned["username"] = df_cleaned["username"].fillna("Unknown")

# Display the cleaned dataset info and first few rows
df_cleaned.info(), df_cleaned.head()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   body           1000 non-null   object             
 1   title          1000 non-null   object             
 2   communityName  1000 non-null   object             
 3   createdAt      1000 non-null   datetime64[ns, UTC]
 4   dataType       1000 non-null   object             
 5   upVotes        1000 non-null   int64              
 6   upVoteRatio    1000 non-null   float64            
 7   url            1000 non-null   object             
 8   username       1000 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(6)
memory usage: 70.4+ KB


(None,
                                                 body  \
 0                                                      
 1  I'm sure many people here would have seen the ...   
 2  I actually value Apple's approach to making ev...   
 3  There will never be decent AI on any of these ...   
 4  There are better and better , tinier and tinie...   
 
                                                title communityName  \
 0                r/Apple: Unofficial Apple Community                 
 1  OpenAI's new image generation model is what Ge...       r/apple   
 2                                                          r/apple   
 3                                                          r/apple   
 4                                                          r/apple   
 
                   createdAt   dataType  upVotes  upVoteRatio  \
 0 2008-01-25 03:43:06+00:00  community        0         0.00   
 1 2025-03-27 18:54:22+00:00       post        0         0.34   
 2 2025-03-28 12:33:58+00:

In [7]:
import pandas as pd
import re

# Load dataset
file_path = "dataset_reddit-scraper-task_2025-03-28_19-30-24-130.csv"
df = pd.read_csv(file_path)

# Keep only relevant columns
columns_to_keep = ["body", "title", "communityName", "createdAt", "dataType", "upVotes", "upVoteRatio", "username"]
df = df[columns_to_keep]

# Convert createdAt to datetime format
df["createdAt"] = pd.to_datetime(df["createdAt"], errors="coerce")

# Convert numerical columns
df["upVotes"] = pd.to_numeric(df["upVotes"], errors="coerce").fillna(0).astype(int)
df["upVoteRatio"] = pd.to_numeric(df["upVoteRatio"], errors="coerce").fillna(0.0).astype(float)

# Fill missing text values with empty strings
text_columns = ["body", "title", "communityName", "username"]
df[text_columns] = df[text_columns].fillna("")

# Remove URLs from 'body' and 'title'
url_pattern = r"http[s]?://\S+"
df["body"] = df["body"].apply(lambda x: re.sub(url_pattern, "", x))
df["title"] = df["title"].apply(lambda x: re.sub(url_pattern, "", x))

# Save cleaned dataset
cleaned_file_path = "cleaned_reddit_dataset.csv"
df.to_csv(cleaned_file_path, index=False)

# Display sample rows
df.head()


Unnamed: 0,body,title,communityName,createdAt,dataType,upVotes,upVoteRatio,username
0,,r/Apple: Unofficial Apple Community,,2008-01-25 03:43:06+00:00,community,0,0.0,
1,I'm sure many people here would have seen the ...,OpenAI's new image generation model is what Ge...,r/apple,2025-03-27 18:54:22+00:00,post,0,0.34,krikrija
2,I actually value Apple's approach to making ev...,,r/apple,2025-03-28 12:33:58+00:00,comment,12,0.0,precipiceblades
3,There will never be decent AI on any of these ...,,r/apple,2025-03-28 13:48:10+00:00,comment,2,0.0,sherbert-stock
4,"There are better and better , tinier and tinie...",,r/apple,2025-03-28 15:25:45+00:00,comment,1,0.0,MrBread134


In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from textblob import TextBlob
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import spacy
from openai import OpenAI
import os
from dotenv import load_dotenv
from transformers import AutoModelForSequenceClassification, AutoTokenizer

load_dotenv()

# Initialize NLP models
nlp = spacy.load("en_core_web_sm")
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    revision="d7645e1"
)
api_key = os.getenv("YOUR_OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Load data
df = pd.read_csv("cleaned_reddit_dataset.csv")  # Replace with your API feedback data

# Preprocessing
df['body'] = df['body'].fillna('')
df['createdAt'] = pd.to_datetime(df['createdAt'], errors='coerce')

# === FEATURE ENGINEERING ===
# 1. Pricing Strategy
pricing_keywords = ["price", "cost", "tier", "expensive", "free"]
df["pricing_mentions"] = df["body"].apply(lambda x: 1 if any(word in x.lower() for word in pricing_keywords) else 0)
df["pricing_sentiment"] = df["body"].apply(lambda x: classifier(x, candidate_labels=["fair", "expensive", "competitive"])['labels'][0])

# 2. New Feature Demands
model = SentenceTransformer('paraphrase-MiniLM-L3-v2') 
feature_requests = df[df["body"].str.contains("feature request")]["body"].tolist()
df["feature_embedding"] = model.encode(df["body"].tolist()).tolist()  # For clustering
df["votes"] = np.random.randint(0, 100, size=len(df))  # Mock voting system

# 3. Existing Features Feedback
def tag_feedback(text):
    if re.search(r"documentation|guide", text, re.IGNORECASE):
        return "documentation"
    elif re.search(r"error|bug", text, re.IGNORECASE):
        return "bug"
    return "general"
df["feedback_type"] = df["body"].apply(tag_feedback)

# 4. Complaints Management
def classify_complaint(text):
    doc = nlp(text)
    if any(ent.label_ == "MONEY" for ent in doc.ents):
        return "billing"
    return "technical"
df["complaint_type"] = df["body"].apply(classify_complaint)

def suggest_solution(complaint_type):
    return "Check payment method" if complaint_type == "billing" else "Review API status page"
df["solution"] = df["complaint_type"].apply(suggest_solution)

# 5. Competition Monitoring
competitors = ["Competitor X", "Competitor Y"]
df["competitor_mention"] = df["body"].apply(lambda x: 1 if any(c in x for c in competitors) else 0)

# Cyclical encoding for time features
df['hour_sin'] = np.sin(2 * np.pi * df['createdAt'].dt.hour / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['createdAt'].dt.hour / 24)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(max_features=500), "body"),
        ("num", StandardScaler(), ["pricing_mentions", "votes", "hour_sin", "hour_cos"]),
        ("cat", OneHotEncoder(), ["pricing_sentiment", "feedback_type", "complaint_type"])
    ],
    remainder='drop'
)

# Model pipeline (predicting API adoption score instead of upvotes)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(n_estimators=200))
])

# Train/test split
X = df[["body", "pricing_mentions", "votes", "hour_sin", "hour_cos", 
        "pricing_sentiment", "feedback_type", "complaint_type"]]
y = df["adoption_score"]  # Replace with your success metric
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train model
model.fit(X_train, y_train)

# === DEVREL/SALES WORKFLOWS ===
# Auto-generate battle cards
def generate_battle_card(competitor):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": f"SWOT analysis: {competitor} vs our API"}]
    )
    return response.choices[0].message.content

# Alert system
def check_complaints():
    if len(df[df["complaint_type"] == "technical"]) > 10:
        print("ALERT: 10+ technical complaints detected")  # Replace with Slack API

# Sales enablement
def generate_pricing_recommendation():
    if df["pricing_sentiment"].value_counts().idxmax() == "expensive":
        return "Recommend introducing a free tier"
    return "Current pricing is competitive"

# Example usage
print("Top Complaint Solutions:", df[["complaint_type", "solution"]].head())
print("Competitor Battle Card:", generate_battle_card("Competitor X"))
print("Pricing Recommendation:", generate_pricing_recommendation())

: 