# Library import

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
import spacy
from collections import Counter
import re
spacy.cli.download('pl_core_news_lg')
nlp = spacy.load('pl_core_news_lg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, explained_variance_score, max_error, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,  LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.utils import compute_class_weight

✔ Download and installation successful
You can now load the package via spacy.load('pl_core_news_lg')


# Functions

In [None]:
def get_bows(data_tokens_train, data_tokens_test, data_lemmas_train, data_lemmas_test):
  """Build unigram bag-of-words count matrices for tokens and for lemmas.

  A separate CountVectorizer is fitted on each training collection; the
  matching test collection is then encoded with that fitted vocabulary.

  Parameters
  ----------
  data_tokens_train, data_tokens_test
      Train / test documents in token form (anything CountVectorizer accepts
      as an iterable of raw documents).
  data_lemmas_train, data_lemmas_test
      Train / test documents in lemma form.

  Returns
  -------
  tuple
      (bow_uni_train, bow_uni_test, bowl_uni_train, bowl_uni_test) as dense
      arrays; "bow" is token-based, "bowl" is lemma-based.
  """
  # Token unigrams
  token_vectorizer = CountVectorizer(ngram_range=(1, 1))
  bow_uni_train = token_vectorizer.fit_transform(data_tokens_train).toarray()
  bow_uni_test = token_vectorizer.transform(data_tokens_test).toarray()

  # Lemma unigrams
  lemma_vectorizer = CountVectorizer(ngram_range=(1, 1))
  bowl_uni_train = lemma_vectorizer.fit_transform(data_lemmas_train).toarray()
  # The lemma vocabulary must be applied to the lemma test documents;
  # encoding the token test documents here would evaluate the lemma model
  # on texts it was never meant to see.
  bowl_uni_test = lemma_vectorizer.transform(data_lemmas_test).toarray()

  return bow_uni_train, bow_uni_test, bowl_uni_train, bowl_uni_test

def prep_features(X_train, X_test):
  """Standardise the features, then reduce them with PCA.

  The scaler and the PCA are both fitted on the training matrix only and
  afterwards applied to the training and the test matrix. PCA is created
  with n_components=0.9; the number of components it ends up keeping is
  printed.

  Returns
  -------
  tuple
      (X_train_pca, X_test_pca)
  """
  standardiser = StandardScaler().fit(X_train)
  train_scaled = standardiser.transform(X_train)
  test_scaled = standardiser.transform(X_test)

  reducer = PCA(n_components = 0.9)
  reducer.fit(train_scaled)

  train_reduced = reducer.transform(train_scaled)
  test_reduced = reducer.transform(test_scaled)

  print(reducer.n_components_)

  return train_reduced, test_reduced


def compare_models(X_train, X_test, y_train, y_test, cm_name, best_classifier=None, random_state=None):
  """Fit several classifiers, print their reports and plot one confusion matrix.

  Every model is trained on (X_train, y_train) and scored on (X_test, y_test)
  with classification_report. The predictions of one model, selected by its
  key, are drawn as a confusion-matrix heat map which is saved to cm_name.

  Parameters
  ----------
  X_train, X_test : array-like
      Feature matrices.
  y_train, y_test : array-like
      Class labels.
  cm_name : str
      File the confusion-matrix figure is saved to.
  best_classifier : str, optional
      Key of the model to plot: 'svc', 'lsvc', 'nb', 'rf', 'mlp', 'dt', 'ab'
      or 'reg'. When None (default) the key is asked for with input() after
      all reports have been printed.
  random_state : int, optional
      Seed passed to the estimators with randomised training (LinearSVC,
      random forest, MLP, decision tree, AdaBoost). None (default) leaves
      them unseeded.

  Raises
  ------
  KeyError
      If best_classifier is passed explicitly and is not a known key.

  Notes
  -----
  The heat-map tick labels are hard-coded to 'Coalition' / 'Opposition', so
  the plot assumes a two-class problem in that label order.
  """
  classes = np.unique(y_train)
  class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
  # Key the weights by the actual class labels; keying by position only
  # works when the labels happen to be 0..k-1.
  class_weight_dict = dict(zip(classes.tolist(), class_weights.tolist()))
  print(class_weight_dict)

  # (key, report title, estimator). GaussianNB, MLPClassifier and
  # AdaBoostClassifier take no class_weight argument and are trained unweighted.
  candidates = [
    ('svc', 'svc', SVC(class_weight = class_weight_dict)),
    ('lsvc', 'lsvc', LinearSVC(class_weight = class_weight_dict, random_state = random_state)),
    ('nb', 'NB', GaussianNB()),
    ('rf', 'RF', RandomForestClassifier(class_weight = class_weight_dict, random_state = random_state)),
    ('mlp', 'mlp', MLPClassifier(alpha = 0.05, random_state = random_state)),
    ('dt', 'dt', DecisionTreeClassifier(class_weight = class_weight_dict, random_state = random_state)),
    ('ab', 'AB', AdaBoostClassifier(random_state = random_state)),
    ('reg', 'REG', LogisticRegression(max_iter = 10000, class_weight = class_weight_dict)),
  ]
  valid_keys = [key for key, _, _ in candidates]

  # Fail before the (slow) training when an explicit key is wrong.
  if best_classifier is not None and best_classifier not in valid_keys:
    raise KeyError(f'unknown classifier {best_classifier!r}; expected one of {valid_keys}')

  best_classifiers = {}
  for key, title, model in candidates:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f'{title} classification results')
    print(classification_report(y_test, predictions))
    print('_'*200)
    best_classifiers[key] = predictions

  if best_classifier is None:
    best_classifier = input('pick best classifier: ').strip()
    # Ask again on a typo instead of discarding every trained model.
    while best_classifier not in best_classifiers:
      best_classifier = input(f'unknown classifier, pick one of {valid_keys}: ').strip()

  cm = confusion_matrix(y_test, best_classifiers[best_classifier])
  fig, ax = plt.subplots(figsize = (10, 10))
  sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size': 20}, cmap = sns.color_palette("Blues", as_cmap=True), ax = ax)
  ax.xaxis.set_ticklabels(['Coalition', 'Opposition'], fontsize = 20)
  ax.yaxis.set_ticklabels(['Coalition', 'Opposition'], fontsize = 20)
  ax.set_xlabel('Predicted label', fontsize = 20)
  ax.set_ylabel('True label', fontsize = 20)
  fig.savefig(cm_name)
  plt.show()


# Data


In [None]:
# Mount Google Drive at /content/drive so the data file can be read from it.
# NOTE(review): google.colab is presumably only available inside the Colab
# runtime; this cell will not run elsewhere - confirm before reusing locally.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder the input CSV is read from.
# NOTE(review): relative path - it presumably points at the Drive mounted at
# /content/drive only while the working directory is /content; confirm.
path = './drive/MyDrive/Master/thesis_codes/'

In [None]:
# Load the corpus CSV from the data folder.
# NOTE(review): the variable says "sejm" while the file name says "senat" -
# confirm which chamber this file actually holds.
data_sejm = pd.read_csv(f'{path}data_senat_spacy.csv')

Removing speakers who are neither coalition nor opposition, and rows with an empty lemma list

In [None]:
# Keep only rows that have a party status.
data_sejm = data_sejm.dropna(subset=['Party_status'])
# Drop rows whose lemma list is empty (its text form is '[]'). Only the
# 'lemmas' column is cast to str; casting the whole frame for a one-column
# test is needlessly expensive.
has_lemmas = data_sejm['lemmas'].astype(str) != '[]'
data_sejm = data_sejm[has_lemmas]

Extracting the subset

In [None]:
# Draw a random 10,000-row subset. The draw is seeded so that a fresh
# "Restart & Run All" reproduces the same subset; sample() already returns
# the rows in random order, so no second shuffle is needed.
sample_data = data_sejm.sample(n = 10000, random_state = 42).reset_index(drop=True)
# Preview only the first rows instead of rendering the whole frame.
sample_data.head()

Assigning class labels

In [None]:
# Encode the party status as an integer class label: 0 = Coalition, 1 = Opposition.
party_codes = {'Coalition': 0, 'Opposition': 1}
party_numbers = sample_data['Party_status'].map(party_codes)

# Any status outside the mapping would otherwise be dropped silently and
# only show up later as a length mismatch; report it explicitly instead.
if party_numbers.isna().any():
  unknown = sample_data.loc[party_numbers.isna(), 'Party_status'].unique().tolist()
  raise ValueError(f'unexpected Party_status values: {unknown}')

sample_data['Party_tag'] = party_numbers.astype(int)

Splitting the dataset into training and test sets

In [None]:
# Stratified 80/20 split of the token texts.
X_train, X_test, y_train, y_test = train_test_split(
    sample_data['tokens'],
    sample_data['Party_tag'],
    test_size=0.2,
    stratify=sample_data['Party_tag'],
    random_state=42,
)

In [None]:
# Stratified 80/20 split of the lemma texts. Labels, test_size, stratify and
# random_state are the same as in the token split.
X_trainl, X_testl, y_trainl, y_testl = train_test_split(
    sample_data['lemmas'],
    sample_data['Party_tag'],
    test_size=0.2,
    stratify=sample_data['Party_tag'],
    random_state=42,
)

# Calculating the features

In [None]:
# Unigram count matrices: "bow" from tokens, "bowl" from lemmas.
bow_uni_train, bow_uni_test, bowl_uni_train, bowl_uni_test = get_bows(
    data_tokens_train=X_train,
    data_tokens_test=X_test,
    data_lemmas_train=X_trainl,
    data_lemmas_test=X_testl,
)

BOW

In [None]:
# Scale and PCA-reduce the token-based count matrices.
X_train_pca, X_test_pca = prep_features(
    X_train=bow_uni_train,
    X_test=bow_uni_test,
)

BOWL

In [None]:
# Scale and PCA-reduce the lemma-based count matrices.
X_trainl_pca, X_testl_pca = prep_features(
    X_train=bowl_uni_train,
    X_test=bowl_uni_test,
)

# Classification

In [None]:
# Compare the classifiers on the token-based features.
compare_models(
    X_train=X_train_pca,
    X_test=X_test_pca,
    y_train=y_train,
    y_test=y_test,
    cm_name='bow_morf_left_right.jpg',
)

In [None]:
# Compare the classifiers on the lemma-based features. The figure is saved
# under its own 'bowl_' file name so it does not overwrite the token-based
# confusion matrix written to 'bow_morf_left_right.jpg'.
compare_models(X_trainl_pca, X_testl_pca, y_trainl, y_testl, 'bowl_morf_left_right.jpg')

# Balanced set


In [None]:
# Balanced subset: 5,000 randomly drawn rows per class, stacked into one
# frame. Both draws are seeded so a fresh run reproduces the same subset.
data_pis = data_sejm.loc[data_sejm['Party_status'] == 'Coalition'].sample(n = 5000, random_state = 42)
data_ko = data_sejm.loc[data_sejm['Party_status'] == 'Opposition'].sample(n = 5000, random_state = 42)
sample_data = pd.concat([data_pis, data_ko], ignore_index = True)

In [None]:
# Shuffle the rows so the two classes are interleaved. The shuffle is seeded
# so a fresh "Restart & Run All" yields the same row order.
sample_data = sample_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Integer class label per row: 0 = Coalition, 1 = Opposition.
# Rows with any other status contribute no entry.
status_codes = {'Coalition': 0, 'Opposition': 1}
party_numbers = [
    status_codes[status]
    for status in sample_data['Party_status']
    if status in status_codes
]

In [None]:
# Attach the integer class labels as the 'Party_tag' column.
# party_numbers must have exactly one entry per row of sample_data.
sample_data['Party_tag'] = party_numbers

Splitting the dataset into training and test sets

In [None]:
# Stratified 80/20 split of the token texts (balanced subset).
X_train, X_test, y_train, y_test = train_test_split(
    sample_data['tokens'],
    sample_data['Party_tag'],
    test_size=0.2,
    stratify=sample_data['Party_tag'],
    random_state=42,
)

In [None]:
# Stratified 80/20 split of the lemma texts (balanced subset). Labels,
# test_size, stratify and random_state are the same as in the token split.
X_trainl, X_testl, y_trainl, y_testl = train_test_split(
    sample_data['lemmas'],
    sample_data['Party_tag'],
    test_size=0.2,
    stratify=sample_data['Party_tag'],
    random_state=42,
)

# Calculating the features

In [None]:
# Unigram count matrices for the balanced subset: "bow" from tokens,
# "bowl" from lemmas.
bow_uni_train, bow_uni_test, bowl_uni_train, bowl_uni_test = get_bows(
    data_tokens_train=X_train,
    data_tokens_test=X_test,
    data_lemmas_train=X_trainl,
    data_lemmas_test=X_testl,
)

BOW

In [None]:
# Scale and PCA-reduce the token-based count matrices (balanced subset).
X_train_pca, X_test_pca = prep_features(
    X_train=bow_uni_train,
    X_test=bow_uni_test,
)

BOWL

In [None]:
# Scale and PCA-reduce the lemma-based count matrices (balanced subset).
X_trainl_pca, X_testl_pca = prep_features(
    X_train=bowl_uni_train,
    X_test=bowl_uni_test,
)

# Classification

BOW

In [None]:
# Compare the classifiers on the token-based features of the balanced subset.
# NOTE(review): this file name is also used by the token run on the
# unbalanced sample earlier in the notebook, so that figure gets replaced -
# confirm this is intended.
compare_models(
    X_train=X_train_pca,
    X_test=X_test_pca,
    y_train=y_train,
    y_test=y_test,
    cm_name='bow_morf_left_right.jpg',
)

BOWL

In [None]:
# Compare the classifiers on the lemma-based features of the balanced subset.
compare_models(
    X_train=X_trainl_pca,
    X_test=X_testl_pca,
    y_train=y_trainl,
    y_test=y_testl,
    cm_name='bowl_morf_left_right.jpg',
)