In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the sentence table; the first CSV column becomes the row index.
df = pd.read_csv('data/sentences.csv', index_col=0)

# Keep only the rows that carry a gold policy-area label.
# `.copy()` makes the filtered result an independent frame, so adding
# columns to `df` in later cells does not raise SettingWithCopyWarning.
df = df[df['policy_area_gold'].notna()].copy()

df.head()

Unnamed: 0,sentenceid,manifestoid,party,year,sentence_text_old,sentence_nchars,sentence_nwords,sentence_mean_word_length,total_syllables,n_complex_words,...,verbs,adjs,advs,propnouns,truncated,sentence_text,policy_area_gold,econ_scale_gold,soc_scale_gold,X_gold
4,10000031,Con 1987,Conservatives,1987,We have risen to fresh challenges at home and ...,53,10,4.4,14,1,...,2,1,1,0,,We have risen to fresh challenges at home and ...,1.0,,,True
16,10000151,Con 1987,Conservatives,1987,A Conservative dream is at last becoming a rea...,51,9,4.777778,17,3,...,2,2,0,0,,A Conservative dream is at last becoming a rea...,1.0,,,True
17,10000161,Con 1987,Conservatives,1987,This Manifesto points the way forward.,38,6,5.5,10,1,...,1,0,1,1,,This Manifesto points the way forward.,1.0,,,True
18,10000171,Con 1987,Conservatives,1987,THE BRITISH REVIVAL. This manifesto sets out o...,218,36,5.083333,60,6,...,2,1,0,2,,The British Revival. This manifesto sets out o...,1.0,,,True
22,10000211,Con 1987,Conservatives,1987,That the leader of the Transport and General W...,113,21,4.428571,30,2,...,2,1,2,4,,That the leader of the Transport and General W...,2.0,1.0,,True


In [11]:
# Load the pretrained sentence encoder. No device is forced here, so the
# library selects one itself (a GPU when available, otherwise the CPU) and
# the cell also runs on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L12-v2')

# Encode every sentence, in row order, so that row i of `embeddings`
# corresponds to row i of `df`.
embeddings = model.encode(df['sentence_text'].tolist(), show_progress_bar=True)
embeddings.shape

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

(497, 384)

In [12]:
# Derive one categorical stance label per sentence from the two gold scale
# columns. Every row starts as 'Neutral'; the rules below are applied in
# order, so a later match overwrites an earlier one (a social-scale match
# therefore takes precedence over an economic-scale match).
df['position'] = 'Neutral'

position_rules = (
    ('econ_scale_gold', 1, 'Econ right'),
    ('econ_scale_gold', -1, 'Econ left'),
    ('soc_scale_gold', 1, 'Social con'),
    ('soc_scale_gold', -1, 'Social lib'),
)
for scale_col, scale_value, label in position_rules:
    df.loc[df[scale_col] == scale_value, 'position'] = label

df.head()

Unnamed: 0,sentenceid,manifestoid,party,year,sentence_text_old,sentence_nchars,sentence_nwords,sentence_mean_word_length,total_syllables,n_complex_words,...,adjs,advs,propnouns,truncated,sentence_text,policy_area_gold,econ_scale_gold,soc_scale_gold,X_gold,position
4,10000031,Con 1987,Conservatives,1987,We have risen to fresh challenges at home and ...,53,10,4.4,14,1,...,1,1,0,,We have risen to fresh challenges at home and ...,1.0,,,True,Neutral
16,10000151,Con 1987,Conservatives,1987,A Conservative dream is at last becoming a rea...,51,9,4.777778,17,3,...,2,0,0,,A Conservative dream is at last becoming a rea...,1.0,,,True,Neutral
17,10000161,Con 1987,Conservatives,1987,This Manifesto points the way forward.,38,6,5.5,10,1,...,0,1,1,,This Manifesto points the way forward.,1.0,,,True,Neutral
18,10000171,Con 1987,Conservatives,1987,THE BRITISH REVIVAL. This manifesto sets out o...,218,36,5.083333,60,6,...,1,0,2,,The British Revival. This manifesto sets out o...,1.0,,,True,Neutral
22,10000211,Con 1987,Conservatives,1987,That the leader of the Transport and General W...,113,21,4.428571,30,2,...,1,2,4,,That the leader of the Transport and General W...,2.0,1.0,,True,Econ right


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report

# Hold out half of the labelled sentences for evaluation. Stratifying on the
# label keeps the class proportions the same in both halves, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df['position'],
    test_size=0.5,
    stratify=df['position'],
    random_state=42,
)

# Logistic regression with built-in cross-validation; max_iter is raised well
# above the default to give the solver room to converge.
reg = LogisticRegressionCV(max_iter=10_000).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out half.
y_pred = reg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Econ left       0.83      0.69      0.75        29
  Econ right       0.81      0.79      0.80        78
     Neutral       0.81      0.96      0.88        92
  Social con       0.90      0.78      0.84        23
  Social lib       0.80      0.59      0.68        27

    accuracy                           0.82       249
   macro avg       0.83      0.76      0.79       249
weighted avg       0.82      0.82      0.81       249



In [22]:
# Probe the classifier with one hand-written sentence. Encoding a one-element
# list yields a single-row 2-D array, which is the shape `predict` expects.
# NOTE(review): the recorded output labels this sentence 'Social lib', which
# looks like a misclassification - worth investigating before relying on
# single-sentence predictions.
reg.predict(model.encode(["We will ban homosexuality"]))

array(['Social lib'], dtype=object)