# Data Collection for Sentiment Analysis

This notebook is dedicated to collecting raw social media data for sentiment analysis. We will utilize APIs or web scraping techniques to gather the data.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import os

# Define absolute path for data directory
directory = r"C:\Users\ankit\Desktop\LORU\sentiment-analysis-app\data\raw"


# Define absolute path for data file

file_path = os.path.join(directory, "social_media_data.csv")

# Ensure the directory exists
os.makedirs(directory, exist_ok=True)

def save_to_csv(df, file_path):
    if not df.empty:  # Ensure there is data to save
        df.to_csv(file_path, index=False, mode="a", header=not os.path.exists(file_path))
        print(f"Appended {len(df)} rows to {file_path}")
    else:
        print("No data to append.")


In [2]:
# Function to scrape posts from a subreddit
def scrape_reddit(subreddit, max_posts=500):
    base_url = f"https://www.reddit.com/r/{subreddit}/new/.rss"
    headers = {"User-Agent": "Mozilla/5.0"}
    
    posts = []
    
    while len(posts) < max_posts:
        response = requests.get(base_url, headers=headers)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        
        soup = BeautifulSoup(response.content, "xml")
        entries = soup.find_all("entry")

        for entry in entries:
            if len(posts) >= max_posts:
                break
            posts.append({
                "title": entry.title.text,
                "author": entry.author.find("name").text if entry.author else "Unknown",
                "link": entry.link["href"],
                "published": entry.published.text,
                "content": entry.content.text if entry.content else ""
            })

        time.sleep(1)  # Avoid too many requests in a short time
    
    return pd.DataFrame(posts)

# List of subreddits to scrape

subreddits = [
            #     "politics",
            #    "technology",
            #      "science",
            #        "WutheringWaves",
            #          "Brawlstars",
                       "Happy",
                        #  "UpliftingNews",
                        #    "Depression",
                        #      "tifu",
                        #        "OffMyChest",
                        #          "TodayILearned"
                                 ]

# DataFrame to store all posts
all_posts = pd.DataFrame()

# Loop through each subreddit and scrape posts
for subreddit in subreddits:
    print(f"Scraping subreddit: {subreddit}")
    df = scrape_reddit(subreddit, 500)  # Ensure scrape_reddit returns a DataFrame
    save_to_csv(df, file_path)

print("Scraping and saving completed.")

# Display the first few rows of the combined DataFrame
print(all_posts.head())

Scraping subreddit: Happy
Appended 500 rows to C:\Users\ankit\Desktop\LORU\sentiment-analysis-app\data\raw\social_media_data.csv
Scraping and saving completed.
Empty DataFrame
Columns: []
Index: []


## Initial Data Exploration

In this section, we will perform some initial exploration of the collected data to understand its structure and contents.

In [3]:
# Display basic information about the DataFrame
if 'df' in locals():
    print(df.info())
    print(df.describe())
else:
    print('DataFrame not available for exploration.')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      500 non-null    object
 1   author     500 non-null    object
 2   link       500 non-null    object
 3   published  500 non-null    object
 4   content    500 non-null    object
dtypes: object(5)
memory usage: 19.7+ KB
None
                                                    title     author  \
count                                                 500        500   
unique                                                 25         21   
top     I got to go home recently and spend time with ...  /u/tacozy   
freq                                                   20         80   

                                                     link  \
count                                                 500   
unique                                                 25   
top     https://www.reddit.com/r/

## Save the Collected Data

Finally, we will save the collected data to the raw data directory for further processing.

In [4]:
# import os

# # Define absolute path for data directory
# directory = r"C:\Users\ankit\Desktop\HYperion\sentiment-analysis-app\data\raw"
# file_path = os.path.join(directory, "social_media_data.csv")

# # Ensure the directory exists
# os.makedirs(directory, exist_ok=True)

# # Save the DataFrame if it exists
# if "df" in locals():
#     df.to_csv(file_path, index=False, mode="w")
#     print(f"Data saved at: {file_path}")
# else:
#     print("No data to save.")


In [5]:

# def save_to_csv(df, file_path):
#     if not df.empty:  # Ensure there is data to save
#         df.to_csv(file_path, index=False, mode="a", header=not os.path.exists(file_path))
#         print(f"Appended {len(df)} rows to {file_path}")
#     else:
#         print("No data to append.")

In [6]:
df.shape

(500, 5)