In [10]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

# Set the working directory
os.chdir("/Users/aishu/Library/CloudStorage/OneDrive-UniversityofNorthFlorida/Lakshmi's Research/Fall 2023/Datasets/bitcoin")

# List all files in the current directory
file_names = os.listdir()

# Filter files with "_reddit" in their names
reddit_files = [file for file in file_names if "_reddit" in file]

# Initialize an empty list to store DataFrames
data_frames = []

# Print the list of Reddit files for confirmation
print("Reddit Files:")
for reddit_file in reddit_files:
    print(reddit_file)

# Read each Reddit file into a DataFrame and append to the list
for reddit_file in reddit_files:
    df = pd.read_csv(reddit_file)  # Assuming the files are in CSV format
    data_frames.append(df)

# Merge the separate DataFrames into one
reddit_data = pd.concat(data_frames, ignore_index=True)

# Extract relevant columns
columns_to_keep = ['subreddit', 'selftext', 'created', 'author']
filtered_df = reddit_data[columns_to_keep]

# Convert 'created' column to datetime
filtered_df['created'] = pd.to_datetime(filtered_df['created'], unit='s')

# Remove rows with 'selftext' containing 'removed' and NaN values
filtered_df = filtered_df.dropna(subset=['selftext'])
filtered_df = filtered_df[~filtered_df['selftext'].str.contains('removed', case=False, na=False)]
filtered_df = filtered_df[~filtered_df['selftext'].str.contains('deleted', case=False, na=False)]
filtered_df = filtered_df[~filtered_df['selftext'].str.contains('thread', case=False, na=False)]

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text processing
def process_text(text):
    if isinstance(text, str):  # Check for NaN values
        # Remove special characters using regular expressions
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
        # Tokenize the text
        words = word_tokenize(text)
        # Remove stop words and lemmatize the remaining words
        filtered_words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
        # Reconstruct the text from the processed words
        processed_text = ' '.join(filtered_words)
        return processed_text
    else:
        return ''

# Apply text processing to the 'selftext' column
filtered_df['processed_text'] = filtered_df['selftext'].apply(process_text)

Reddit Files:
bitcoinmarkets_reddit.csv
btc_reddit.csv
bitcoinbeginners_reddit.csv
bitcoin_reddit.csv
bitcoin_mining_reddit.csv
bitcoincash_reddit.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['created'] = pd.to_datetime(filtered_df['created'], unit='s')
[nltk_data] Downloading package punkt to /Users/aishu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/aishu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aishu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer and get a set of stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to process text
def process_text(text):
    # Remove special characters using regular expressions
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    
    # Tokenize the text
    words = word_tokenize(text)

    # Remove stop words and lemmatize the remaining words
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]

    # Reconstruct the text from the processed words
    processed_text = ' '.join(filtered_words)

    return processed_text

# Apply text processing to the 'text' column
filtered_df['processed_text'] = filtered_df['selftext'].apply(process_text)

# Function to get sentiment using VADER
def get_sentiment_vader(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

# Function to get sentiment using TextBlob
def get_sentiment_textblob(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Apply sentiment analysis using VADER
filtered_df['vader_sentiment'] = filtered_df['processed_text'].apply(get_sentiment_vader)

# Apply sentiment analysis using TextBlob
filtered_df['textblob_sentiment'] = filtered_df['processed_text'].apply(get_sentiment_textblob)

# Classify sentiment based on compound VADER score
def classify_sentiment_vader(compound):
    if compound >= 0.05:
        return 'Positive'
    elif compound <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Classify sentiment based on TextBlob score
def classify_sentiment_textblob(score):
    if score >= 0.1:
        return 'Positive'
    elif score <= -0.1:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment classification
filtered_df['vader_sentiment_class'] = filtered_df['vader_sentiment'].apply(classify_sentiment_vader)
filtered_df['textblob_sentiment_class'] = filtered_df['textblob_sentiment'].apply(classify_sentiment_textblob)

[nltk_data] Downloading package punkt to /Users/aishu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/aishu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aishu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Save the updated DataFrame to a new CSV file
filtered_df.to_csv('BTC_final_reddit_sentiments_combined.csv', index=False)
