# In [2]:
import os
import pandas as pd
import string
from nltk.corpus import stopwords, opinion_lexicon
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import joblib
import nltk

# Load the class labelling model and vectorizer
# Both artifacts are read from hard-coded absolute paths when the module is
# loaded, so loading the module fails if these files are missing.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
classifier_model = joblib.load('/Users/meetsmacbook/Downloads/trained_models/classlabel_regression_model.pkl')
vectorizer = joblib.load('/Users/meetsmacbook/Downloads/trained_models/classlabel_vectorizer.pkl')

# Define constants for sentiment analysis (the labels returned by get_sentiment)
# NOTE: negative is 0 and neutral is -1, so the values are not ordered by polarity.
POSITIVE = 1
NEGATIVE = 0
NEUTRAL = -1

# Define lemmatizer and stop words for text preprocessing (read by preprocess_text)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Initialize the sentiment analyzer
# A single VADER instance is shared by get_sentiment and
# calculate_subjectivity_and_sentiment.
analyzer = SentimentIntensityAnalyzer()

# Define function to preprocess the text data
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lowercased, stripped of every character in
    ``string.punctuation``, tokenized, filtered against the module-level
    English stop-word set, and lemmatized with the shared lemmatizer.

    Args:
        text: The string to normalize.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Build the deletion table once, then lowercase and strip in one pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    kept_lemmas = []
    for token in word_tokenize(cleaned):
        # Stop words are dropped before lemmatization, not after.
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

# Define function to classify articles
def classify_articles(data):
    """Predict a class for every row and store it in ``data['Class Index']``.

    The ``processed_text`` column is vectorized with the module-level
    ``vectorizer`` and passed to the module-level ``classifier_model``.

    Args:
        data: DataFrame containing a ``processed_text`` column.

    Returns:
        The same DataFrame, modified in place with the new column.
    """
    features = vectorizer.transform(data['processed_text'])
    predicted_classes = classifier_model.predict(features)
    data['Class Index'] = predicted_classes
    return data

# Define function to get sentiment labels using VADER
def get_sentiment(text):
    """Map a text to a sentiment label using VADER's compound score.

    Args:
        text: The string to score.

    Returns:
        ``POSITIVE`` when the compound score is at least 0.05, ``NEGATIVE``
        when it is at most -0.05, and ``NEUTRAL`` for anything in between.
    """
    compound = analyzer.polarity_scores(text)['compound']

    if compound >= 0.05:
        return POSITIVE
    if compound <= -0.05:
        return NEGATIVE
    return NEUTRAL

# Define function to apply sentiment analysis
def apply_sentiment_analysis(data):
    """Label every row with a sentiment class in ``data['sentiment_label']``.

    VADER relies on capitalization, punctuation and negation words to score
    a sentence. ``preprocess_text`` lowercases the text, strips punctuation
    and removes stop words (NLTK's English list includes negations such as
    "not" and "no"), so scoring ``processed_text`` can weaken or invert the
    result. The raw ``text`` column is therefore used when it is present;
    ``processed_text`` remains the fallback for frames that only carry the
    preprocessed column.

    Args:
        data: DataFrame containing a ``text`` and/or ``processed_text`` column.

    Returns:
        The same DataFrame, modified in place with the new column.
    """
    source_column = 'text' if 'text' in data.columns else 'processed_text'
    data['sentiment_label'] = data[source_column].apply(get_sentiment)
    return data

# Define function to detect bias
def detect_bias(
    data,
    *,
    subjectivity_threshold_high=7.0,
    subjectivity_threshold_moderate=4.0,
    sentiment_threshold_extreme=0.7,
    sentiment_threshold_moderate=0.3,
):
    """Label every row with a bias level in ``data['bias_label']``.

    Each ``processed_text`` value is scored with
    ``calculate_subjectivity_and_sentiment`` and the two scores are compared
    against the thresholds:

    * ``'High bias'``: subjectivity >= high and |sentiment| >= extreme.
    * ``'Moderate bias'``: subjectivity >= moderate and |sentiment| >= moderate.
    * ``'Low bias'``: subjectivity < moderate and |sentiment| < moderate.
    * ``'Unknown bias'``: any other combination.

    Args:
        data: DataFrame containing a ``processed_text`` column.
        subjectivity_threshold_high: Minimum subjectivity score for high bias.
        subjectivity_threshold_moderate: Minimum subjectivity score for
            moderate bias.
        sentiment_threshold_extreme: Minimum absolute sentiment score for
            high bias.
        sentiment_threshold_moderate: Minimum absolute sentiment score for
            moderate bias.

    Returns:
        The same DataFrame, modified in place with the new column.
    """
    def classify_text(processed_text):
        """Return the bias label for a single preprocessed text."""
        subjectivity_score, sentiment_score = calculate_subjectivity_and_sentiment(processed_text)
        sentiment_strength = abs(sentiment_score)

        if (subjectivity_score >= subjectivity_threshold_high
                and sentiment_strength >= sentiment_threshold_extreme):
            return 'High bias'
        if (subjectivity_score >= subjectivity_threshold_moderate
                and sentiment_strength >= sentiment_threshold_moderate):
            return 'Moderate bias'
        if (subjectivity_score < subjectivity_threshold_moderate
                and sentiment_strength < sentiment_threshold_moderate):
            return 'Low bias'
        return 'Unknown bias'

    # Only one column is needed, so apply over that Series instead of row-wise
    # over the frame; this also yields an (empty) Series for an empty frame,
    # which can still be assigned to a single column.
    data['bias_label'] = data['processed_text'].apply(classify_text)

    return data

# Define function to calculate subjectivity and sentiment scores
def _get_subjectivity_lexicon():
    """Return the opinion-word set, loading it on first use."""
    global subjectivity_lexicon
    try:
        return subjectivity_lexicon
    except NameError:
        # The script entry point normally assigns this global; build it here
        # so the function also works when the module is imported.
        subjectivity_lexicon = set(opinion_lexicon.words())
        return subjectivity_lexicon


def calculate_subjectivity_and_sentiment(text):
    """Score a text for subjectivity and sentiment.

    Args:
        text: The string to score.

    Returns:
        A ``(subjectivity_score, sentiment_score)`` tuple, where
        ``subjectivity_score`` is the number of tokens found in NLTK's
        opinion lexicon (a raw count, not normalized by text length) and
        ``sentiment_score`` is VADER's compound score for ``text``.
    """
    lexicon = _get_subjectivity_lexicon()
    subjectivity_score = sum(1 for word in word_tokenize(text) if word in lexicon)

    # NOTE(review): the sentiment is computed on exactly the text passed in;
    # detect_bias passes preprocessed text, which no longer contains the
    # punctuation or stop words of the original article.
    sentiment_score = analyzer.polarity_scores(text)['compound']

    return subjectivity_score, sentiment_score

# Define main function to process the input CSV file
def main(input_file_path, output_file_path):
    """Run the full labelling pipeline on one CSV file.

    The input is read with pandas, rows containing any missing value are
    dropped, and the ``Title`` and ``Description`` columns are joined into
    ``text``. The preprocessed text is then classified, sentiment-labelled
    and bias-labelled before the frame is written out without its index.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path of the CSV file to write.
    """
    frame = pd.read_csv(input_file_path)
    frame.dropna(axis=0, how='any', inplace=True)

    # Title and description are scored together as a single document.
    frame['text'] = frame['Title'] + ' ' + frame['Description']
    frame['processed_text'] = frame['text'].apply(preprocess_text)

    # Each stage adds its label column and hands the frame to the next one.
    for stage in (classify_articles, apply_sentiment_analysis, detect_bias):
        frame = stage(frame)

    frame.to_csv(output_file_path, index=False)
    print(f'Results saved to: {output_file_path}')

if __name__ == "__main__":
    
    # Build the opinion-word set once. calculate_subjectivity_and_sentiment
    # reads this module-level name, so it is assigned before main() runs.
    # NOTE(review): this assignment only happens when the file is run as a script.
    subjectivity_lexicon = set(opinion_lexicon.words())

    
    # Specify the input and output file paths
    input_csv_file_path = '/Users/meetsmacbook/Desktop/Code/TY CCNLP Project/news_articles.csv'  # Specify the input CSV file path
    output_csv_file_path = '/Users/meetsmacbook/Desktop/Code/TY CCNLP Project/result.csv'  # Specify the output CSV file path
    
    # Run the main function
    main(input_csv_file_path, output_csv_file_path)


# Cell output: Results saved to: /Users/meetsmacbook/Desktop/Code/TY CCNLP Project/result.csv
