# Lab work №6

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from textblob import TextBlob
import random

# Load the review dataset from movie1.csv
data = pd.read_csv("movie1.csv")

# The "text" column is the model input and the "label" column the target;
# 20% of the rows are held out for testing, with a fixed seed so the split
# is reproducible between runs
X, y = data["text"], data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit TF-IDF on the training texts only, then reuse the fitted vectorizer
# to transform the held-out texts
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit logistic regression on the training matrix and predict the test set
classifier = LogisticRegression()
classifier.fit(X_train_vectorized, y_train)
y_pred = classifier.predict(X_test_vectorized)

# Report the confusion matrix and accuracy on the held-out set
print("Confusion Matrix for Logistic Regression:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy for Logistic Regression:", accuracy_score(y_test, y_pred))


# Sentiment analysis using TextBlob
def analyze_sentiment(text, threshold=0.1):
    """Classify *text* as positive (1) or negative (0) from its TextBlob polarity.

    Args:
        text: The text to score.
        threshold: Lowest polarity that still counts as positive.
            Defaults to 0.1.

    Returns:
        1 if the polarity is at least ``threshold``, otherwise 0.
    """
    polarity = TextBlob(text).sentiment.polarity
    # Strict two-way split: whatever does not reach the threshold is negative,
    # so the function always returns an int and never falls through to None.
    return 1 if polarity >= threshold else 0
    

# Evaluating sentiment using TextBlob
# Apply the TextBlob-based rule to every held-out review
y_pred_textblob = list(map(analyze_sentiment, X_test))

# Report the confusion matrix and accuracy of the TextBlob predictions
print("\nConfusion Matrix for TextBlob:")
print(confusion_matrix(y_test, y_pred_textblob))
print("Accuracy for TextBlob:", accuracy_score(y_test, y_pred_textblob))

# Selecting three random entries and displaying sentiment evaluation results
# Draw up to three distinct row positions; capping at len(data) keeps
# random.sample from raising ValueError on a dataset with fewer than 3 rows
random_samples = random.sample(range(len(data)), min(3, len(data)))
for idx in random_samples:
    # idx is a position, not an index label, so use positional .iloc lookups;
    # this stays correct even if `data` does not have a default RangeIndex
    review = data["text"].iloc[idx]
    actual_sentiment = data["label"].iloc[idx]
    # The vectorizer expects an iterable of documents, hence the one-item list
    review_vectorized = vectorizer.transform([review])
    predicted_sentiment_logistic = classifier.predict(review_vectorized)[0]
    predicted_sentiment_textblob = analyze_sentiment(review)
    print("\nReview:", review)
    print("Actual Sentiment:", actual_sentiment)
    print("Predicted Sentiment (Logistic Regression):", predicted_sentiment_logistic)
    print("Predicted Sentiment (TextBlob):", predicted_sentiment_textblob)


Confusion Matrix for Logistic Regression:
[[1770  261]
 [ 214 1755]]
Accuracy for Logistic Regression: 0.88125

Confusion Matrix for TextBlob:
[[1528  503]
 [ 442 1527]]
Accuracy for TextBlob: 0.76375

Review: Picking this up along with the rest of the Marx Brothers box set, I found myself disappointed by most everything beyond A Night at the Opera. This stinker is prolly the worst I've seen of them so far, with the clever lines left out and the characterization is woeful. The playwright is so obscenely stupid in this play it's hard not to tackle the television and try and strangle him.<br /><br />As it is, the Marxes seem to do better as outsiders brought in to wreak havoc, and are much much better when they have a good gag or two at least. The material here is all obviously written for anyone, and it really wastes the Marx's talent. Avoid. <br /><br />Rating: 3/10
Actual Sentiment: 0
Predicted Sentiment (Logistic Regression): 0
Predicted Sentiment (TextBlob): 0

Review: All those who

In [2]:
import spacy

# Load the spaCy pipeline "en_core_web_sm" once; the task functions below all
# run their input text through this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Task a)
def find_non_stopwords(text):
    """Return the alphabetic tokens of *text* that are not flagged as stop words.

    The text is processed with the module-level ``nlp`` pipeline; tokens are
    returned as strings in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_alpha and not tok.is_stop:
            kept.append(tok.text)
    return kept

# Task b)
def find_verbs(text):
    """Return the text of every token in *text* whose part-of-speech tag is "VERB".

    The text is processed with the module-level ``nlp`` pipeline; tokens are
    returned as strings in document order.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == "VERB"]

# Task c)
def find_numbers_and_persons(text):
    """Collect the named entities of *text* labelled "CARDINAL" and "PERSON".

    The text is processed with the module-level ``nlp`` pipeline.

    Returns:
        A ``(numbers, persons)`` tuple of two lists of entity strings, each in
        document order.
    """
    numbers, persons = [], []
    # One pass over the entities, routing each to the list matching its label
    for ent in nlp(text).ents:
        if ent.label_ == "CARDINAL":
            numbers.append(ent.text)
        elif ent.label_ == "PERSON":
            persons.append(ent.text)
    return numbers, persons

# Reading text from the file lab6-3.txt
with open("lab6-3.txt", encoding="utf-8") as file:
    text = file.read()

# Run the three tasks on the loaded text
non_stopwords = find_non_stopwords(text)
verbs = find_verbs(text)
numbers, persons = find_numbers_and_persons(text)

# Print each result list after its task caption
for caption, found in (
    ("a) Words that are not stopwords:", non_stopwords),
    ("b) Verbs in the text:", verbs),
    ("c1) Numbers in the text:", numbers),
    ("c2) Persons in the text:", persons),
):
    print(caption, found)


a) Words that are not stopwords: ['Gordon', 'Brown', 'issued', 'rallying', 'cry', 'telling', 'supporters', 'stakes', 'high', 'stay', 'home', 'protest', 'vote', 'forthcoming', 'general', 'election', 'chancellor', 'said', 'vote', 'expected', 'fall', 'clear', 'fundamental', 'choice', 'Labour', 'investment', 'Conservative', 'cuts', 'Speaking', 'Labour', 'spring', 'conference', 'Gateshead', 'Mr', 'Brown', 'claimed', 'NHS', 'safe', 'Conservative', 'hands', 'said', 'Tory', 'plans', 'cut', 'tax', 'cut', 'deep', 'public', 'service', 'packed', 'audience', 'Gateshead', 'Sage', 'Centre', 'chancellor', 'said', 'cuts', 'proposed', 'shadow', 'chancellor', 'Oliver', 'Letwin', 'equivalent', 'sacking', 'teacher', 'GP', 'nurse', 'country', 'told', 'activists', 'Laying', 'Conservative', 'record', 'government', 'said', 'promise', 'Labour', 'Britain', 'return', 'mistakes', 'ERM', 'inflation', 'interest', 'rates', 'lost', 'reserves', 'repossessed', 'million', 'negative', 'equity', 'million', 'unemployed', 'T