# Sentiment analysis on synthetic dataset - classic ML approach

Applying simple SVM and Max Entropy (Logistic Regression) on the synthetic dataset. Using 10-fold cross-validation.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pickle

### Loading data

In [2]:
#loads from txt files that seperately habe positive, negative and neutral texts
def load_from_separate_txts(pos_file, neg_file, neu_file, encoding='utf-8'):
    with open(pos_file, 'r', encoding=encoding) as file:
        pos_texts = file.readlines()
        
    with open(neg_file, 'r', encoding=encoding) as file:
        neg_texts = file.readlines()
        
    with open(neu_file, 'r', encoding=encoding) as file:
        neu_texts = file.readlines()

    texts=[text.strip() for text in pos_texts]+[text.strip() for text in neg_texts]+[text.strip() for text in neu_texts]
    labels=['p']*len(pos_texts)+['n']*len(neg_texts)+['0']*len(neu_texts)

    data = {
        'texts': texts,
        'labels': labels
    }
    
    df = pd.DataFrame(data)

    print('data loaded')
    
    return df


In [3]:
data = load_from_separate_txts('datasets/synthetic_dataset/positive.txt', 'datasets/synthetic_dataset/negative.txt', 'datasets/synthetic_dataset/neutral.txt')

data loaded


In [4]:
print(data)

                                                  texts labels
0     Vláda schválila balíček opatření, které povzbu...      p
1     Podle posledních dat se očekává, že export zem...      p
2     Automobilový průmysl očekává růst prodejů v ná...      p
3     Růst oblíbenosti místních restaurací podporuje...      p
4     VÝSTAVBA NOVÝCH PODNIKATELSKÝCH ZÓN NAVÝŠÍ HOS...      p
...                                                 ...    ...
7495  Vláda oznámila plán restrukturalizace veřejnéh...      0
7496  Moderní technologie propojily lidstvo do globá...      0
7497  Všechny členské státy se zúčastní nadcházející...      0
7498  V České republice je koncem roku 2021 registro...      0
7499  Podniky využívají různé strategie k maximaliza...      0

[7500 rows x 2 columns]


### Methods for getting and summarizing the results (report for each fold and then average for all)

In [4]:
def get_report(label_col, pred_col, output_dict=False):
    report = classification_report(label_col, pred_col, output_dict=output_dict)
    return report

def avg_reports(*args):
    mean_dict = dict()
    for label in reports[0].keys():
        dictionary = dict()

        if label in 'accuracy':
            mean_dict[label] = sum(d[label] for d in reports) / len(reports)
            continue

        for key in reports[0][label].keys():
            dictionary[key] = sum(d[label][key] for d in reports) / len(reports)
        mean_dict[label] = dictionary

    return mean_dict

### Using SVC

In [7]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True, random_state=1)
reports = []
vectorizer = TfidfVectorizer()

for train_index, test_index in kf.split(data):
    train_df = data.iloc[train_index]
    test_df = data.iloc[test_index]
    
    X_train = train_df.iloc[:, 0]
    y_train = train_df.iloc[:, 1]
    X_test = test_df.iloc[:, 0]
    y_test = test_df.iloc[:, 1]
    
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    model = SVC(kernel='linear')
    model.fit(X_train_vec, y_train)
    
    y_pred = model.predict(X_test_vec)

    report = get_report(y_pred, y_test, output_dict=True)
    reports.append(report)

avg_report = avg_reports(reports)
print(avg_report)

{'0': {'precision': 0.9558303369550455, 'recall': 0.9514017661082143, 'f1-score': 0.9534245051988417, 'support': 251.1}, 'n': {'precision': 0.9557658094737264, 'recall': 0.9496268557895527, 'f1-score': 0.9526157082430974, 'support': 251.5}, 'p': {'precision': 0.9410615658310417, 'recall': 0.95042734459317, 'f1-score': 0.9455634883124986, 'support': 247.4}, 'accuracy': 0.9506666666666665, 'macro avg': {'precision': 0.9508859040866044, 'recall': 0.9504853221636456, 'f1-score': 0.9505345672514792, 'support': 750.0}, 'weighted avg': {'precision': 0.9509390087828805, 'recall': 0.9506666666666665, 'f1-score': 0.9506524270078234, 'support': 750.0}}


### Using Max Entropy (Logistic Regression)

In [9]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True, random_state=1)
reports = []
vectorizer = TfidfVectorizer()

for train_index, test_index in kf.split(data):
    train_df = data.iloc[train_index]
    test_df = data.iloc[test_index]
    
    X_train = train_df.iloc[:, 0]
    y_train = train_df.iloc[:, 1]
    X_test = test_df.iloc[:, 0]
    y_test = test_df.iloc[:, 1]
    
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    model = LogisticRegression(max_iter=1000) 
    model.fit(X_train_vec, y_train)
    
    y_pred = model.predict(X_test_vec)

    report = get_report(y_pred, y_test, output_dict=True)
    reports.append(report)

avg_report = avg_reports(reports)
print(avg_report)

{'0': {'precision': 0.9605487317782136, 'recall': 0.9320340840188935, 'f1-score': 0.9458520115642578, 'support': 257.6}, 'n': {'precision': 0.941254518408547, 'recall': 0.9458076044594333, 'f1-score': 0.943379649893205, 'support': 248.6}, 'p': {'precision': 0.926399300189464, 'recall': 0.9494465467396276, 'f1-score': 0.9376857861762794, 'support': 243.8}, 'accuracy': 0.9425333333333334, 'macro avg': {'precision': 0.9427341834587416, 'recall': 0.9424294117393182, 'f1-score': 0.9423058158779142, 'support': 750.0}, 'weighted avg': {'precision': 0.9430740643764647, 'recall': 0.9425333333333334, 'f1-score': 0.9425236294201721, 'support': 750.0}}


In [6]:
import os
import pandas as pd
from sklearn.model_selection import KFold


df = data

# Define label mapping to folder names
label_mapping = {'p': 'positive', 'n': 'negative', '0': 'neutral'}

# Create KFold splitter
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Base output directory
output_dir = "output_folds_synth"
os.makedirs(output_dir, exist_ok=True)

# Create folds
for fold, (train_idx, test_idx) in enumerate(kf.split(df), 1):
    fold_dir = os.path.join(output_dir, f"fold_{fold}")
    for split, indices in [("train", train_idx), ("test", test_idx)]:
        for label, folder in label_mapping.items():
            split_dir = os.path.join(fold_dir, split, folder)
            os.makedirs(split_dir, exist_ok=True)
        
        # Save texts to corresponding folders
        for idx in indices:
            text = df.iloc[idx]["texts"]
            label = df.iloc[idx]["labels"]
            label_folder = os.path.join(fold_dir, split, label_mapping[label])
            # Write text to a .txt file
            text_file_path = os.path.join(label_folder, f"text_{idx}.txt")
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(text)

print(f"Data has been split into 10 folds and saved in '{output_dir}'")

Data has been split into 10 folds and saved in 'output_folds_synth'


Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "C:\Python311\Lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Python311\Lib\asyncio\selector_events.py", line 119, in _read_from_self
    data = self._ssock.recv(4096)
           ^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "C:\Python311\Lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Python311\Lib\asyncio\selector_events.py", line 119, in _read_from_self
    data = self._ssock.recv(4096)
           ^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection w