In [None]:
# Sentiment Analysis Model Development

In [None]:
## Loading

In [None]:
# Notebook-level imports used by the loading, Bag of Words and plotting cells,
# which run before the model sections' own import cells. Without them a fresh
# kernel fails with NameError on `pd`, `plt`, `sns` and the vectorizers.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the dataset; later cells read its 'cleaned_text' and 'class' columns.
df = pd.read_csv('eda_data.csv')

In [None]:
# Function to create and return the Bag of Words DataFrame

In [None]:
def create_bow_representation(texts, use_tfidf=False, max_features=100):
    """Vectorize a collection of documents into a dense Bag of Words table.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize, one string per document.
    use_tfidf : bool, default False
        If True, weight terms with ``TfidfVectorizer``; otherwise use raw
        term counts from ``CountVectorizer``.
    max_features : int, default 100
        Passed to the vectorizer as its ``max_features`` vocabulary cap.

    Returns
    -------
    df_bow : pandas.DataFrame
        One row per document and one column per vocabulary term.
    feature_names : array of str
        Vocabulary terms, in the same order as the columns of ``df_bow``.
    """
    vectorizer_cls = TfidfVectorizer if use_tfidf else CountVectorizer
    vectorizer = vectorizer_cls(max_features=max_features)

    term_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    # fit_transform returns a sparse matrix; densify it so it can be wrapped in
    # a regular DataFrame (only sensible while max_features stays small).
    df_bow = pd.DataFrame(term_matrix.toarray(), columns=feature_names)
    return df_bow, feature_names

In [None]:
# Build a raw-count (non TF-IDF) Bag of Words with the vocabulary capped at 50 terms.
df_bow, feature_names = create_bow_representation(
    df['cleaned_text'],
    use_tfidf=False,
    max_features=50,
)

In [None]:
def visualize_bow(df_bow, feature_names, annotate=False):
    """Draw the Bag of Words matrix as a heatmap and display it.

    Parameters
    ----------
    df_bow : pandas.DataFrame
        Document-term matrix; each row is drawn as one heatmap row.
    feature_names : sequence of str
        Labels for the x-axis ticks, one per column of ``df_bow``.
    annotate : bool, default False
        If True, write each cell's value inside the heatmap cell.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Use an explicit Axes rather than pyplot's implicit "current axes" so the
    # heatmap, title and axis labels are guaranteed to land on the same plot.
    _, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        df_bow,
        annot=annotate,
        cmap='viridis',
        cbar=False,
        xticklabels=feature_names,
        ax=ax,
    )
    ax.set_title('Bag of Words Matrix')
    ax.set_xlabel('Features')
    ax.set_ylabel('Documents')
    plt.show()

In [None]:
# Plot the document-term matrix built above; cell values are not annotated.
visualize_bow(df_bow=df_bow, feature_names=feature_names, annotate=False)

In [None]:
# Logistic Regression Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
# Encode the class labels as integers 0..n_classes-1.
y = df['class']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training documents only. Fitting the vectorizer
# on the full corpus leaks test-set statistics into the training features.
# The train/test row partition is the same as before: it depends only on the
# number of samples and random_state.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y_encoded, test_size=0.2, random_state=35476648
)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix under the train-fitted vocabulary, kept so the name `X`
# stays available to any cell that still expects it.
X = vectorizer.transform(df['cleaned_text'])

In [None]:
# Train the logistic regression classifier on the TF-IDF training features;
# max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Hard predictions on the held-out split, summarised per class.
predictions = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, predictions))

# Class-membership probabilities, one column per class.
probabilities = model.predict_proba(X_test)

# roc_auc_score accepts the full (n_samples, n_classes) matrix only for
# multiclass targets. For a binary target it expects the scores of the
# positive class, so passing both columns would raise a ValueError.
if probabilities.shape[1] == 2:
    auc_score = roc_auc_score(y_test, probabilities[:, 1])
else:
    auc_score = roc_auc_score(y_test, probabilities, multi_class='ovr')

print(f"ROC AUC Score: {auc_score:.2f}")

In [None]:
# SVM Model

In [None]:
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Encode the class labels as integers 0..n_classes-1.
y = df['class']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Split the raw text BEFORE vectorizing so the unigram+bigram TF-IDF vocabulary
# and IDF weights are learned from the training documents only. Fitting on the
# full corpus leaks test-set statistics into the training features.
# The train/test row partition is the same as before: it depends only on the
# number of samples and random_state.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y_encoded, test_size=0.2, random_state=35476648
)

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix under the train-fitted vocabulary, kept so the name `X`
# stays available to any cell that still expects it.
X = vectorizer.transform(df['cleaned_text'])

In [None]:
# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation. Fixing random_state makes predict_proba
# (and therefore the ROC AUC reported for this model) reproducible across runs.
model = SVC(class_weight='balanced', probability=True, random_state=42)

# Candidate regularisation strengths and kernels: 6 combinations in total.
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# 5-fold cross-validated search scored on macro-averaged F1.
# n_jobs=-1 runs the candidate fits in parallel; it does not change the result.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best estimator found by the grid search, evaluated on the held-out split.
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)
probs = best_model.predict_proba(X_test)

# roc_auc_score accepts the full (n_samples, n_classes) matrix only for
# multiclass targets. For a binary target it expects the scores of the
# positive class, so passing both columns would raise a ValueError.
if probs.shape[1] == 2:
    auc_score = roc_auc_score(y_test, probs[:, 1])
else:
    auc_score = roc_auc_score(y_test, probs, multi_class='ovr')

print("Classification Report:")
print(classification_report(y_test, predictions))
print(f"ROC AUC Score: {auc_score:.2f}")

In [None]:
# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the class labels as integers 0..n_classes-1.
y = df['class']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary (capped at
# 1000 terms) and IDF weights are learned from the training documents only.
# Fitting on the full corpus leaks test-set statistics into the features.
# The train/test row partition is the same as before: it depends only on the
# number of samples and random_state.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y_encoded, test_size=0.2, random_state=35476648
)

vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix under the train-fitted vocabulary, kept so the name `X`
# stays available to any cell that still expects it.
X = vectorizer.transform(df['cleaned_text'])

In [None]:
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# 3 x 3 x 3 x 3 = 81 parameter combinations; with cv=3 that is 243 fits,
# which is why the search runs in parallel (n_jobs=-1).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Select on macro-averaged F1 rather than accuracy: the forest is trained with
# class_weight='balanced', and accuracy would reward favouring the majority
# class. This also matches the scoring used by the SVM grid search.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best estimator found by the grid search, applied to the held-out split:
# class probabilities first, then hard class predictions.
best_model = grid_search.best_estimator_
probs = best_model.predict_proba(X_test)
predictions = best_model.predict(X_test)