# In [None]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.sentiment import SentimentIntensityAnalyzer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

# Load the scraped news articles.
df = pd.read_csv("news_scraping_results.csv")

# Data cleaning: the code in this script only reads the 'content' column, so
# rows are dropped only when that column is missing. A bare dropna() would also
# throw away articles whose text is present but some unrelated column is empty.
df = df.dropna(subset=['content']).reset_index(drop=True)

def clean_text(text):
    """Normalize raw article text for vectorization.

    Steps, in order: strip URLs, turn every whitespace character into a plain
    space, drop every character that is not an ASCII letter or a space, and
    lowercase the result.

    Args:
        text: Raw text string.

    Returns:
        The cleaned string, containing only ``a-z`` and spaces.
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Map newlines, tabs and other whitespace to spaces first; otherwise the
    # filter below deletes them and glues neighbouring words together.
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Remove special characters
    return text.lower()

# Store a normalized copy of each article body next to the raw text.
df['clean_content'] = df['content'].map(clean_text)

# Sentiment Labeling
# Fetch the VADER lexicon, then build the analyzer that get_sentiment() uses.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Classify *text* as 'positive', 'negative' or 'neutral'.

    The decision is based on the ``compound`` value returned by the
    module-level analyzer ``sia``: above 0.05 is positive, below -0.05 is
    negative, and anything in between (inclusive) is neutral.

    Args:
        text: String to score.

    Returns:
        One of the strings ``'positive'``, ``'negative'`` or ``'neutral'``.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound > 0.05:
        return 'positive'
    if compound < -0.05:
        return 'negative'
    return 'neutral'

# Attach a sentiment class to every article, derived from its cleaned text.
df['sentiment'] = df['clean_content'].map(get_sentiment)

# Splitting dataset: 20% held out for testing, seeded so the split is repeatable.
article_texts = df['clean_content']
article_labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    article_texts,
    article_labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF Vectorization: the vectorizer is fitted on the training split only and
# then applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Model 1: Support Vector Machine with GridSearchCV
# 5-fold cross-validated search over the regularization strength and kernel.
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}
svm_model = GridSearchCV(estimator=SVC(), param_grid=svm_params, cv=5)
svm_model.fit(X_train_tfidf, y_train)

# Score the search object on the held-out TF-IDF features.
svm_preds = svm_model.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_preds)
print("SVM Best Params:", svm_model.best_params_)
print("SVM Accuracy:", svm_accuracy)

# Model 2: Random Forest
# Hyperparameters are fixed here; no search is run for this model.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
)
rf_model.fit(X_train_tfidf, y_train)

# Score on the held-out TF-IDF features.
rf_preds = rf_model.predict(X_test_tfidf)
rf_accuracy = accuracy_score(y_test, rf_preds)
print("Random Forest Accuracy:", rf_accuracy)

# Model 3: Improved LSTM Deep Learning
max_words = 10000  # vocabulary size given to both the Tokenizer and the Embedding
max_len = 200  # every sequence is padded/truncated to this length

# The tokenizer is fitted on the training texts only, then applied to both splits.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# Integer-encode the labels with ONE fixed mapping shared by both splits.
# Calling factorize() separately on y_train and y_test numbers the classes by
# order of first appearance within each split, so the same id could stand for
# different sentiments during training and evaluation.
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
# astype() raises if a label is missing from the mapping (map() yields NaN),
# so an unexpected class fails loudly instead of training on bad targets.
y_train_ids = y_train.map(label_to_id).astype('int32').to_numpy()
y_test_ids = y_test.map(label_to_id).astype('int32').to_numpy()

model = Sequential([
    Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    LSTM(64),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(3, activation='softmax')  # one output unit per entry in label_to_id
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0005),
    metrics=['accuracy'],
)
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics and the final score are on the same samples.
model.fit(
    X_train_seq,
    y_train_ids,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_seq, y_test_ids),
)

# Model evaluation
lstm_accuracy = model.evaluate(X_test_seq, y_test_ids, verbose=0)[1]
print("LSTM Accuracy:", lstm_accuracy)
