# Sentiment analysis on extracted dataset - classic ML approach

Applying simple SVM and Max Entropy (Logistic Regression) on the extracted dataset. Using 10-fold cross-validation.

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pickle

### Loading data

In [9]:
#load from excel
def load_from_excel(file_name, text_header, label_header, pos_label, neg_label, neu_label, sheet_name=0):
    df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=[text_header, label_header])
    df.rename(columns={text_header: 'texts', label_header: 'labels'}, inplace=True)

    label_mapping = {pos_label: 'p', neg_label: 'n', neu_label: '0'}
    df['labels'] = df['labels'].replace(label_mapping)
    df = df[df['labels'].isin([pos_label, neg_label, neu_label])]
    
    print('data loaded')
    
    return df

In [33]:
data = load_from_excel('extracted_dataset.xlsx', 'Text', 'Immer', 'p', 'n', '0')

data loaded


### Methods for getting and summarizing the results (report for each fold and then average for all)

In [26]:
def get_report(label_col, pred_col, output_dict=False):
    report = classification_report(label_col, pred_col, output_dict=output_dict)
    return report

def avg_reports(*args):
    mean_dict = dict()
    for label in reports[0].keys():
        dictionary = dict()

        if label in 'accuracy':
            mean_dict[label] = sum(d[label] for d in reports) / len(reports)
            continue

        for key in reports[0][label].keys():
            dictionary[key] = sum(d[label][key] for d in reports) / len(reports)
        mean_dict[label] = dictionary

    return mean_dict

### Using SVC

In [27]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True, random_state=1)
reports = []
vectorizer = TfidfVectorizer()

for train_index, test_index in kf.split(data):
    train_df = data.iloc[train_index]
    test_df = data.iloc[test_index]
    
    X_train = train_df.iloc[:, 1]
    y_train = train_df.iloc[:, 0]
    X_test = test_df.iloc[:, 1]
    y_test = test_df.iloc[:, 0]
    
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    model = SVC(kernel='linear')
    model.fit(X_train_vec, y_train)
    
    y_pred = model.predict(X_test_vec)

    report = get_report(y_pred, y_test, output_dict=True)
    reports.append(report)

avg_report = avg_reports(reports)
print(avg_report)

{'0': {'precision': 0.5697392408435108, 'recall': 0.5571626304981014, 'f1-score': 0.5630748941046918, 'support': 197.4}, 'n': {'precision': 0.6138254324705915, 'recall': 0.6173941679854986, 'f1-score': 0.6151772128034447, 'support': 192.4}, 'p': {'precision': 0.5829469751868663, 'recall': 0.5925999725543032, 'f1-score': 0.5873166715453804, 'support': 190.3}, 'accuracy': 0.5890385186064455, 'macro avg': {'precision': 0.5888372161669896, 'recall': 0.5890522570126344, 'f1-score': 0.5885229261511723, 'support': 580.1}, 'weighted avg': {'precision': 0.5896890516823137, 'recall': 0.5890385186064455, 'f1-score': 0.5889417528312559, 'support': 580.1}}


### Using Max Entropy (Logistic Regression)

In [28]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True, random_state=1)
reports = []
vectorizer = TfidfVectorizer()

for train_index, test_index in kf.split(data):
    train_df = data.iloc[train_index]
    test_df = data.iloc[test_index]
    
    X_train = train_df.iloc[:, 1]
    y_train = train_df.iloc[:, 0]
    X_test = test_df.iloc[:, 1]
    y_test = test_df.iloc[:, 0]
    
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    model = LogisticRegression(max_iter=1000) 
    model.fit(X_train_vec, y_train)
    
    y_pred = model.predict(X_test_vec)

    report = get_report(y_pred, y_test, output_dict=True)
    reports.append(report)

avg_report = avg_reports(reports)
print(avg_report)

{'0': {'precision': 0.557613285780765, 'recall': 0.5611477317641295, 'f1-score': 0.5590075138228174, 'support': 191.8}, 'n': {'precision': 0.6213881499036613, 'recall': 0.6122218671847743, 'f1-score': 0.6164671201594818, 'support': 196.2}, 'p': {'precision': 0.5866798615862718, 'recall': 0.5912620337085766, 'f1-score': 0.5882186185403022, 'support': 192.1}, 'accuracy': 0.5885215739806516, 'macro avg': {'precision': 0.5885604324235659, 'recall': 0.58821054421916, 'f1-score': 0.5878977508408672, 'support': 580.1}, 'weighted avg': {'precision': 0.5898918851713522, 'recall': 0.5885215739806516, 'f1-score': 0.588713665902263, 'support': 580.1}}


In [35]:
import os
import pandas as pd
from sklearn.model_selection import KFold

# Define label mapping to folder names
label_mapping = {'p': 'positive', 'n': 'negative', '0': 'neutral'}

# Create KFold splitter
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Base output directory
output_dir = "output_folds"
os.makedirs(output_dir, exist_ok=True)

# Create folds
for fold, (train_idx, test_idx) in enumerate(kf.split(data), 1):
    fold_dir = os.path.join(output_dir, f"fold_{fold}")
    for split, indices in [("train", train_idx), ("test", test_idx)]:
        for label, folder in label_mapping.items():
            split_dir = os.path.join(fold_dir, split, folder)
            os.makedirs(split_dir, exist_ok=True)
        
        # Save texts to corresponding folders
        for idx in indices:
            text = data.iloc[idx]["texts"]
            label = data.iloc[idx]["labels"]
            label_folder = os.path.join(fold_dir, split, label_mapping[label])
            # Write text to a .txt file
            text_file_path = os.path.join(label_folder, f"text_{idx}.txt")
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(text)

print(f"Data has been split into 10 folds and saved in '{output_dir}'")


Data has been split into 10 folds and saved in 'output_folds'
