In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
df = pd.read_csv("stock_tweets.csv")

In [5]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization
    tokens = [stemmer.stem(token) for token in tokens if token.isalnum()]  # Stemming
    tokens = [token for token in tokens if token not in stop_words]  # Removing stop words
    return ' '.join(tokens)


In [6]:
df['Processed Tweet'] = df['Tweet'].apply(preprocess_text)

In [7]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Processed Tweet'])

In [8]:
def search(query, top_n=5):
    query_vector = vectorizer.transform([preprocess_text(query)])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    related_indices = cosine_similarities.argsort()[::-1][:top_n]
    return df.iloc[related_indices]

In [9]:
query = "stock market"
search_results = search(query)
print(search_results[['Date', 'Tweet', 'Stock Name', 'Company Name']])

                            Date  \
3417   2022-08-16 18:41:55+00:00   
19222  2022-03-09 15:19:45+00:00   
60047  2021-12-13 14:36:59+00:00   
18288  2022-03-21 16:52:17+00:00   
32993  2021-11-02 14:49:52+00:00   

                                                   Tweet Stock Name  \
3417                      What is up with markets. $TSLA       TSLA   
19222                  These markets are bi-polar. $TSLA       TSLA   
60047      $AAPL is about to be the entire stock market.       AAPL   
18288  I personally love it when $TSLA goes against t...       TSLA   
32993  Why even bother having a stock market when it'...       TSLA   

      Company Name  
3417   Tesla, Inc.  
19222  Tesla, Inc.  
60047   Apple Inc.  
18288  Tesla, Inc.  
32993  Tesla, Inc.  
