In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

In [None]:
# Parallel accumulators filled by the loading loop further down:
# reviews[i] holds the raw text of a review and sentiments[i] its label.
reviews, sentiments = [], []

In [None]:
# Root folder of the dataset. The loading loop reads files from
# <base_dir>/<train|test>/<pos|neg>/*.txt.
# Set the SENTIMENT_DATA_DIR environment variable to point at a local copy
# without editing the notebook; when it is unset (or empty) the original
# placeholder path is used.
base_dir = os.environ.get("SENTIMENT_DATA_DIR") or "path_to_your_dataset"

In [None]:
# Sub-folder names walked by the loader: the dataset split comes first,
# then the sentiment label nested inside each split.
dataset_types, sentiments_list = ['train', 'test'], ['pos', 'neg']

In [None]:
# Load every review file into the parallel `reviews` / `sentiments` lists.
# The lists are rebuilt from scratch here so that re-running this cell does
# not append a second copy of the whole dataset.
reviews = []
sentiments = []

for dataset_type in dataset_types:
    for sentiment in sentiments_list:
        folder_path = os.path.join(base_dir, dataset_type, sentiment)
        if not os.path.isdir(folder_path):
            raise FileNotFoundError(
                f"Expected review folder not found: {folder_path!r} "
                "(check base_dir)"
            )
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded train/test split) reproducible.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                review = file.read()
            reviews.append(review)
            # The label is the name of the folder the file was found in.
            sentiments.append(sentiment)

In [None]:
# Sanity check: the two lists are filled in lock-step, so both counts
# should be equal.
print(
    f"Number of reviews: {len(reviews)}",
    f"Number of sentiments: {len(sentiments)}",
    sep="\n",
)

In [None]:
# Assemble the raw corpus: one row per review, holding its text and its label.
df = pd.DataFrame(dict(review=reviews, sentiment=sentiments))

### Removing HTML tags and special characters

In [None]:
# Removing HTML tags and special characters
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return only its text content.

    Text fragments that were separated by a tag are joined with a single
    space. Without the separator, markup such as ``end.<br /><br />Next``
    collapses to ``end.Next`` and, once punctuation is removed, the two words
    fuse into a single bogus token.

    Args:
        text: Raw string that may contain HTML.

    Returns:
        The text of ``text`` with all tags removed.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

In [None]:
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Matching characters are deleted outright rather than replaced by a space,
    so ``"don't"`` becomes ``"dont"``.
    """
    unwanted = re.compile(r'[^A-Za-z0-9\s]')
    return unwanted.sub('', text)

In [None]:
# Cleaning pipeline, applied to each review in turn: strip HTML, drop
# punctuation/symbols, then lowercase the result.
text_without_markup = df['review'].apply(remove_html_tags)
alnum_only = text_without_markup.apply(remove_special_characters)
df['cleaned_review'] = alnum_only.str.lower()

In [None]:
# Persist the DataFrame built so far; the index is left out of the file.
csv_path = 'sentiment_analysis_dataset.csv'
df.to_csv(csv_path, index=False, date_format='%Y-%m-%d')

### TF-IDF Vectorization

In [None]:
# Bigram-only TF-IDF features with the vocabulary capped at 50,000 entries.
# NOTE(review): the vectorizer is fitted on every row of `df`, i.e. before the
# train/test split further down, so test-set text influences the vocabulary
# and the IDF weights.
vectorizer_options = {'ngram_range': (2, 2), 'max_features': 50000}
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_review'])

In [None]:
# Reducing dimensionality: project the TF-IDF matrix onto 200 components.
# TruncatedSVD's default solver is randomized, so the seed is pinned (same
# value as the train/test split) to make the reduced features repeatable
# from one run to the next.
svd = TruncatedSVD(n_components=200, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)
tfidf_reduced_df = pd.DataFrame(tfidf_reduced)

In [None]:
# Encode the string labels as integers: 'pos' -> 1, 'neg' -> 0.
# Series.map is used instead of Series.replace because replacing strings with
# ints relies on pandas' deprecated silent downcasting (FutureWarning since
# pandas 2.2) and can otherwise leave an object-dtype target column.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run once the
# column is already encoded, and the explicit cast fails loudly if any label
# is unrecognised (it would map to NaN).
label_codes = {'pos': 1, 'neg': 0, 1: 1, 0: 0}
df['sentiment'] = df['sentiment'].map(label_codes).astype('int64')
y = df['sentiment']  # Target variable is sentiment

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the partition
# repeatable.
split_parts = train_test_split(
    tfidf_reduced_df,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

## Logistic Regression

In [None]:
# Baseline: logistic regression with scikit-learn's default settings.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions with each metric in turn.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")

## Random Forest

In [None]:
# Random Forest baseline with 100 trees.
# Tree construction is randomized, so the seed is pinned (same value as the
# train/test split) to make the reported accuracy reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}")

## SVM 

In [None]:
# Support Vector Machine baseline: linear kernel, regularisation C = 1.0.
svm_model = SVC(kernel='linear', C=1.0).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print(f"SVM accuracy: {accuracy_score(y_test, y_pred_svm)}")

## XGBoost 

In [None]:

# XGBoost baseline using the library's default hyperparameters.
# Note: `y_pred` and `accuracy` are re-bound here, replacing the values the
# logistic-regression cell stored under the same names.
xgb_model = XGBClassifier().fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"XGBoost Accuracy: {accuracy}")

## Hyperparameter Tuning & Cross Validation

In [None]:
# Hyperparameter search spaces, one per model family. Every combination in a
# grid is tried by the corresponding GridSearchCV.

# Logistic regression: regularisation strength and optimiser.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['newton-cg', 'lbfgs', 'liblinear'],
)

# SVM: regularisation strength and kernel type.
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# XGBoost: ensemble size, tree depth and learning rate.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Random forest: ensemble size and tree depth (None = unlimited depth).
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

In [None]:
# Untuned estimators handed to GridSearchCV; the searched hyperparameters are
# filled in from the parameter grids.
model_lr = LogisticRegression()
model_svm = SVC()
model_xgb = XGBClassifier()
# Forest construction is randomized; pinning the seed keeps the grid-search
# ranking (and therefore the selected parameters) reproducible.
model_rf = RandomForestClassifier(random_state=42)

In [None]:
# GridSearchCV for each model: all four searches share the same protocol,
# 5-fold cross-validation ranked by F1 score.
search_options = {'cv': 5, 'scoring': 'f1'}

grid_search_lr = GridSearchCV(model_lr, param_grid_lr, **search_options)
grid_search_svm = GridSearchCV(model_svm, param_grid_svm, **search_options)
grid_search_xgb = GridSearchCV(model_xgb, param_grid_xgb, **search_options)
grid_search_rf = GridSearchCV(model_rf, param_grid_rf, **search_options)

In [None]:
# Fit models: run every grid search on the training split only, so the test
# split stays untouched until the final evaluation.
grid_search_lr.fit(X_train, y_train)
grid_search_svm.fit(X_train, y_train)
grid_search_xgb.fit(X_train, y_train)
# The closing parenthesis was missing here, which made the cell a SyntaxError.
grid_search_rf.fit(X_train, y_train)

In [None]:
# Pull the winning hyperparameters and their cross-validated score out of
# each fitted search, one model at a time.
best_params_lr, best_score_lr = grid_search_lr.best_params_, grid_search_lr.best_score_
best_params_svm, best_score_svm = grid_search_svm.best_params_, grid_search_svm.best_score_
best_params_xgb, best_score_xgb = grid_search_xgb.best_params_, grid_search_xgb.best_score_
best_params_rf, best_score_rf = grid_search_rf.best_params_, grid_search_rf.best_score_

In [None]:
# Report the grid-search outcome: for each model, the selected parameters
# followed by the cross-validated F1 score they achieved.
print(
    f"Best parameters for Logistic Regression: {best_params_lr}",
    f"Best F1-score for Logistic Regression: {best_score_lr}",
    sep="\n",
)
print(
    f"Best parameters for SVM: {best_params_svm}",
    f"Best F1-score for SVM: {best_score_svm}",
    sep="\n",
)
print(
    f"Best parameters for XGBoost: {best_params_xgb}",
    f"Best F1-score for XGBoost: {best_score_xgb}",
    sep="\n",
)
print(
    f"Best parameters for Random Forest: {best_params_rf}",
    f"Best F1-score for Random Forest: {best_score_rf}",
    sep="\n",
)

## Final model evaluation on test set

In [None]:
# Rebuild logistic regression with the grid-search winner, train it on the
# training split and score it on the held-out test split.
final_model_lr = LogisticRegression(**best_params_lr).fit(X_train, y_train)
y_pred_lr = final_model_lr.predict(X_test)

accuracy_lr, precision_lr, recall_lr, f1_lr = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Logistic Regression - Accuracy: {accuracy_lr}, Precision: {precision_lr}, Recall: {recall_lr}, F1 Score: {f1_lr}")

In [None]:
# Rebuild the SVM with the grid-search winner, train it on the training split
# and score it on the held-out test split.
final_model_svm = SVC(**best_params_svm).fit(X_train, y_train)
y_pred_svm = final_model_svm.predict(X_test)

accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
precision_svm = precision_score(y_true=y_test, y_pred=y_pred_svm)
recall_svm = recall_score(y_true=y_test, y_pred=y_pred_svm)
f1_svm = f1_score(y_true=y_test, y_pred=y_pred_svm)

print(f"SVM - Accuracy: {accuracy_svm}, Precision: {precision_svm}, Recall: {recall_svm}, F1 Score: {f1_svm}")

In [None]:
# Rebuild XGBoost with the grid-search winner, train it on the training split
# and score it on the held-out test split.
final_model_xgb = XGBClassifier(**best_params_xgb).fit(X_train, y_train)
y_pred_xgb = final_model_xgb.predict(X_test)

accuracy_xgb, precision_xgb, recall_xgb, f1_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"XGBoost - Accuracy: {accuracy_xgb}, Precision: {precision_xgb}, Recall: {recall_xgb}, F1 Score: {f1_xgb}")

In [None]:
# Rebuild the random forest with the grid-search winner, train it on the
# training split and score it on the held-out test split.
# Forest construction is randomized, so the seed is pinned (same value as the
# train/test split) to make the reported test metrics reproducible. The
# parameter grid does not search `random_state`, so there is no clash with
# the unpacked best parameters.
final_model_rf = RandomForestClassifier(**best_params_rf, random_state=42)
final_model_rf.fit(X_train, y_train)
y_pred_rf = final_model_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print(f"Random Forest - Accuracy: {accuracy_rf}, Precision: {precision_rf}, Recall: {recall_rf}, F1 Score: {f1_rf}")