<a href="https://colab.research.google.com/github/juhi-11/Project4/blob/main/SentimentAnalysis_InfoRetrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


# Load the news-aggregator CSV from the Colab working directory.
data = pd.read_csv('/content/uci-news-aggregator.csv')

# The CSV header row already supplies the column names (ID, TITLE, URL,
# PUBLISHER, CATEGORY, STORY, HOSTNAME, TIMESTAMP), so they are not overridden.

# Tokenizer models for nltk.word_tokenize. Newer NLTK releases (3.9 and later)
# look up 'punkt_tab' instead of the pickled 'punkt' models, so fetch both to
# keep the notebook runnable on a fresh kernel with either old or new NLTK.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Inspect the column names that read_csv picked up from the CSV header.
print(data.columns)


Index(['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
       'TIMESTAMP'],
      dtype='object')


In [12]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stopword corpus (reported as up-to-date when already present).
nltk.download('stopwords')

# English stopwords kept in a set; preprocess_text tests every token against it.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a piece of text for TF-IDF vectorization.

    Non-word characters are replaced by spaces, whitespace runs are collapsed,
    the text is lower-cased, tokenized with ``nltk.word_tokenize`` and every
    token found in the module-level ``stop_words`` set is dropped.

    Args:
        text: The raw text. Missing values (``None`` / NaN / ``pd.NA``) are
            treated as empty text, and any other non-string scalar is
            converted with ``str()`` so that applying this function to a
            DataFrame column with gaps does not raise ``TypeError``.

    Returns:
        str: The remaining tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        # re.sub only accepts str/bytes; a missing cell would otherwise crash .apply().
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    tokens = nltk.word_tokenize(text)
    filtered_words = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_words)

# Apply preprocessing to the TITLE column; the normalized headlines are stored
# in a new 'cleaned_text' column, which the TF-IDF cell later vectorizes.
data['cleaned_text'] = data['TITLE'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
!pip install scikit-learn nltk textblob



In [16]:
from textblob import TextBlob
def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Returns:
        str: 'POSITIVE' when the polarity is above zero, 'NEGATIVE' when it is
        below zero, and 'NEUTRAL' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0:
        return 'POSITIVE'
    if polarity < 0:
        return 'NEGATIVE'
    return 'NEUTRAL'

# Apply sentiment analysis on the raw TITLE column (not on 'cleaned_text');
# the resulting labels are stored in a new 'sentiment' column.
data['sentiment'] = data['TITLE'].apply(get_sentiment)


In [17]:
# Use TF-IDF to vectorize the cleaned TITLE data.
# max_features=5000 caps the vocabulary size (scikit-learn keeps the terms with
# the highest corpus-wide term frequency). fit_transform yields one matrix row
# per entry of data['cleaned_text'], in the same order as the DataFrame rows.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_text'])

In [18]:
def search_query(query, sentiment_filter, top_n=5):
    """Return the headlines most similar to ``query`` that carry a given sentiment.

    The query is normalized with ``preprocess_text``, vectorized with the
    fitted module-level ``tfidf_vectorizer`` and compared against every row of
    ``tfidf_matrix`` by cosine similarity. Only rows of ``data`` whose
    'sentiment' equals ``sentiment_filter`` are ranked.

    Args:
        query: Free-text search string.
        sentiment_filter: Sentiment label to keep ('POSITIVE', 'NEGATIVE' or
            'NEUTRAL'). A label that matches no row yields an empty result.
        top_n: Maximum number of rows to return (default 5, the original
            hard-coded value).

    Returns:
        pandas.DataFrame: Up to ``top_n`` rows with the columns 'TITLE',
        'PUBLISHER', 'sentiment' and 'URL', ordered from most to least similar.
        Rows are not filtered by a minimum score, so zero-similarity rows can
        appear when few headlines match the query.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Preprocess the query the same way the indexed headlines were preprocessed
    query = preprocess_text(query)

    # Transform the query to a vector in the fitted TF-IDF space
    query_vec = tfidf_vectorizer.transform([query])

    # Compute cosine similarity between the query and every document
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Filter by sentiment with a positional boolean mask. Indexing the
    # similarity array with the DataFrame's index labels is only correct while
    # `data` has a default 0..n-1 index; the mask works for any index.
    # Assumes row i of tfidf_matrix still corresponds to row i of `data`.
    sentiment_mask = (data['sentiment'] == sentiment_filter).to_numpy()
    filtered_data = data[sentiment_mask]
    filtered_similarities = similarities[sentiment_mask]

    # Highest similarity first. Reversing before slicing gives the same order
    # as argsort()[-top_n:][::-1] but also behaves for top_n == 0.
    top_indices = filtered_similarities.argsort()[::-1][:top_n]
    results = filtered_data.iloc[top_indices]

    return results[['TITLE', 'PUBLISHER', 'sentiment', 'URL']]

# Example usage: retrieve the headlines most similar to the query among those
# labelled with the chosen sentiment, then print the selected columns.
query = "tech innovations"
sentiment_filter = "POSITIVE"  # Can be 'POSITIVE', 'NEGATIVE', or 'NEUTRAL'
results = search_query(query, sentiment_filter)
print(results)

                                                    TITLE  \
413835  For VMware, keep friendly tech close (and enem...   
136807  Fearing Tech: Americans report fear and love f...   
125156       Google Glass: the New Symbol of Tech Disgust   
335991                        The Great Tech Lull of 2014   
282638  TECH STOCKS: Intel, OpenTable Lead Tech Sector...   

                           PUBLISHER sentiment  \
413835                       Fortune  POSITIVE   
136807                    Tech Times  POSITIVE   
125156                  Equities.com  POSITIVE   
335991                  DailyFinance  POSITIVE   
282638  Capital.gr \(press release\)  POSITIVE   

                                                      URL  
413835  http://fortune.com/2014/08/25/vmware-vmworld-2...  
136807  http://www.techtimes.com/articles/5736/2014041...  
125156  http://www.equities.com/editors-desk/stocks/te...  
335991  http://www.dailyfinance.com/2014/06/29/the-gre...  
282638     http://english.capita

In [19]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (fixed seed, so the split is reproducible).
# NOTE(review): data['sentiment'] is not a ground-truth label -- it was produced
# by get_sentiment on the raw TITLE column. The report below therefore measures
# how often TextBlob gives the same label for the cleaned text as for the raw
# title, not accuracy against human annotations.
# X_train / y_train are created by the split but not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['sentiment'], test_size=0.2, random_state=42)

# Re-run the same rule-based labeller on the preprocessed text of the held-out rows
y_pred = [get_sentiment(text) for text in X_test]

# Per-class precision/recall/F1 of the cleaned-text labels vs. the raw-title labels
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    NEGATIVE       0.93      0.90      0.92     11475
     NEUTRAL       0.95      0.98      0.97     49661
    POSITIVE       0.95      0.92      0.93     23348

    accuracy                           0.95     84484
   macro avg       0.95      0.93      0.94     84484
weighted avg       0.95      0.95      0.95     84484

