In [22]:
import kagglehub
import os
import pandas as pd

# Download the latest version of the dataset
path = kagglehub.dataset_download("kazanova/sentiment140")

# Find the CSV file in the directory
file_path = None
for file in os.listdir(path):
    if file.endswith('.csv'):  # Looking for .csv files
        file_path = os.path.join(path, file)
        break
        
column_names = ['ID', 'Timestamp', 'Date', 'Query', 'User', 'Tweet', 'Text']
df = pd.read_csv(file_path, encoding='latin1', header=None, names=column_names)  # Use 'latin1' encoding if there are any encoding issues
df.head()

Unnamed: 0,ID,Timestamp,Date,Query,User,Tweet,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",


In [33]:
import re
# Separate plain words from hashtags and mentions
plain_terms = ['Tesla', 'Lucid Motors', 'Rivian', 'Nikola', 'BYD', 'Polestar', 'ChargePoint',
               'Electric Cars', 'Electric Vehicle', 'EV market']
hashtag_terms = ['#Tesla', '#Lucid', '#Rivian', '#Nikola', '#BYDCompany', '#Polestar', '#ChargePoint', '#EV ', 
                 '#ElectricCar']
mention_terms = ['@Tesla', '@Rivian', '@LucidMotors', '@ChargePoint', 
                 '@BYDCompany', '@NIOGlobal', '@ElectricVehicles']

# Add word boundaries to plain terms
plain_terms = [r'\b' + re.escape(term) + r'\b' for term in plain_terms]

# Escape hashtags and mentions (they don't need word boundaries)
hashtag_terms = [re.escape(term) for term in hashtag_terms]
mention_terms = [re.escape(term) for term in mention_terms]

# Combine all terms into a single regex pattern
all_terms = plain_terms + hashtag_terms + mention_terms
search_pattern = '|'.join(all_terms)

# Filter the DataFrame
df_filtered = df[df['Tweet'].str.contains(search_pattern, case=False, na=False, regex=True)]

# Preview the results
df_filtered.head()

Unnamed: 0,ID,Timestamp,Date,Query,User,Tweet,Text
150302,0,1898405077,Sat May 23 18:06:08 PDT 2009,NO_QUERY,Marcy_M,@TeslaGirl360 This could have been us. http...,
166174,0,1961175165,Fri May 29 09:05:52 PDT 2009,NO_QUERY,graceengle,No launch today. Teacher changed plans and we ...,
172317,0,1963531905,Fri May 29 12:51:54 PDT 2009,NO_QUERY,alaskamiller,No one is at the tesla dealership,
254812,0,1984340771,Sun May 31 14:55:05 PDT 2009,NO_QUERY,chimz,@nluchs Hell yes. They had a lot of cool thin...,
261218,0,1985862397,Sun May 31 17:54:51 PDT 2009,NO_QUERY,66Gia66,The Tesla coil in action made the boy start to...,


In [35]:
df_filtered['Tweet'].unique()

array(['@TeslaGirl360 This could have been us.    http://bit.ly/oWZzF',
       'No launch today. Teacher changed plans and we watched a stupid move about tesla ',
       'No one is at the tesla dealership ',
       "@nluchs Hell yes.  They had a lot of cool things at Maker Faire SF yesterday but the singing tesla coils weren't there. ",
       'The Tesla coil in action made the boy start to scream &amp; cry  &amp; we had to leave. http://yfrog.com/591bjj',
       "@teslaman2003 Haven't seen it.  Haven't really spent much time looking around online ",
       'In Japan, Testing the Market for All-Electric Cars http://bit.ly/Qq2HW (via @markidea) they need to extend their range..for farther trip! ',
       '@teslagold if only i had a car. your still moving to az??? ',
       '@Teslanaut so most things then ',
       "Bernoulli &amp; Tesla: why do Bunsen get to be the picture?  we'z cute too!",
       '@teslaaa your such a procrastinator. the end. p.s i better see you before you leave! ',


In [ ]:
import praw

# Set up Reddit API credentials
reddit = praw.Reddit(
    client_id='WLlWYMAIAM2lYVJhucU9iQ',  # replace with your client_id
    client_secret='wgsE_SCaXLtmwyefnFT1sOOZx0kkHA',  # replace with your client_secret
    user_agent='EV Scraper by Zoe Tomlinson (contact: zoetomlinson@example.com)'  # more descriptive user-agent
)

# Define a single subreddit and simplified query to test
subreddits = ['r/teslamotors']  # Test with just one subreddit first
query = "Tesla"  # Simplified query for testing

# List to store scraped posts
posts = []

# Scrape posts from the subreddit
for subreddit in subreddits:
    for submission in reddit.subreddit(subreddit).search(query, sort='new', limit=10):  # Limit to 10 posts for testing
        # Check if 'selftext' is not empty
        text = submission.selftext if submission.selftext else "No text available"
        posts.append([submission.title, text, submission.score, submission.url, submission.subreddit, submission.created_utc])

# Create a DataFrame from the scraped data
df = pd.DataFrame(posts, columns=['Title', 'Text', 'Score', 'URL', 'Subreddit', 'Created UTC'])

# Convert timestamps to readable date format
df['Created UTC'] = pd.to_datetime(df['Created UTC'], unit='s')

# Optionally, save the data to a CSV file
df.to_csv('ev_posts.csv', index=False)
df