In [None]:
#https://www.nltk.org/api/nltk.translate.meteor_score.html
#https://www.nltk.org/_modules/nltk/translate/bleu_score.html
#https://www.geeksforgeeks.org/python-how-to-make-a-terminal-progress-bar-using-tqdm/
#https://stackoverflow.com/questions/70947892/best-smoothing-function-to-use-in-nltk-corpus-bleu-method
#https://stackoverflow.com/questions/68926574/i-compare-two-identical-sentences-with-bleu-nltk-and-dont-get-1-0-why
#https://stackoverflow.com/questions/77043285/nltk-sentence-bleu-returns-0-while-evaluating-chinese-sentences


In [1]:
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [2]:
data_dirs = list(set(glob('/home/marilu/training_dfs/*.csv')) - 
                 set(glob('/home/marilu/training_dfs/*_ft.csv')) - 
                 set(glob('/home/marilu/training_dfs/*balanced.csv')))
data_dirs

['/home/marilu/training_dfs/feeling_thinking.csv',
 '/home/marilu/training_dfs/extrovert_introvert.csv',
 '/home/marilu/training_dfs/nationality.csv',
 '/home/marilu/training_dfs/birth_year.csv',
 '/home/marilu/training_dfs/gender.csv',
 '/home/marilu/training_dfs/political_leaning.csv',
 '/home/marilu/training_dfs/sensing_intuitive.csv',
 '/home/marilu/training_dfs/judging_perceiving.csv']

In [3]:
data = pd.read_csv(data_dirs[5])
data

Unnamed: 0,auhtor_ID,post,political_leaning
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right
4,t2_7ramzeng,This article's intention is clear that they wa...,right
...,...,...,...
114458,t2_vi35s,hard as I have to go out of my way to find med...,center
114459,t2_vi35s,"WORLD WILL BE MINE! Well if you read it, then ...",center
114460,t2_vyu81f9,Wow super passing there sir. I’m jelly. Aesthe...,left
114461,t2_vyu81f9,compliment your face. Okay fair enough. I supp...,left


In [4]:
vec = TfidfVectorizer()
X_post = vec.fit_transform(data['post'])

In [5]:
y_political = data['political_leaning']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_post, y_political, test_size=0.3, random_state=42)

In [7]:
classifier = LogisticRegression(max_iter=6000)

In [8]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=6000)

In [9]:
#y_pred
predictions = classifier.predict(X_test)

In [10]:
from sklearn.metrics import classification_report

In [11]:
print("Classification report:")
print(classification_report(y_test, predictions))

Classification report:
              precision    recall  f1-score   support

      center       0.65      0.77      0.71     14160
        left       0.69      0.61      0.65      9625
       right       0.73      0.63      0.68     10554

    accuracy                           0.68     34339
   macro avg       0.69      0.67      0.68     34339
weighted avg       0.69      0.68      0.68     34339



In [12]:
#Tokenize the posts
tokenized_posts = data['post'].apply(lambda x: x.split())

In [13]:
#Create word-to-number mapping
word_to_num = {}
current_num = 1 

In [14]:
for post in tokenized_posts:
    for word in post:
        if word not in word_to_num:
            word_to_num[word] = current_num
            current_num += 1

In [15]:
#Replace words with numbers in the dataset
obfuscated_data = data.copy()
obfuscated_data['obfuscated_post'] = tokenized_posts.apply(lambda post: ' '.join(str(word_to_num[word]) for word in post))

In [16]:
vec_new = TfidfVectorizer()
X_post_new = vec.fit_transform(obfuscated_data['obfuscated_post'])
y_political_new = obfuscated_data['political_leaning']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_post_new, y_political_new, test_size=0.3, random_state=42)

In [18]:
classifier = LogisticRegression(max_iter=6000)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=6000)

In [19]:
predictions_w2n = classifier.predict(X_test)

In [20]:
print("Classification report:")
print(classification_report(y_test, predictions_w2n))

Classification report:
              precision    recall  f1-score   support

      center       0.66      0.78      0.71     14160
        left       0.70      0.62      0.66      9625
       right       0.73      0.63      0.68     10554

    accuracy                           0.69     34339
   macro avg       0.70      0.67      0.68     34339
weighted avg       0.69      0.69      0.68     34339



In [21]:
from nltk.translate import meteor_score
from tqdm import tqdm

In [22]:
original_texts = data['post'].tolist()
obfuscated_texts = obfuscated_data['obfuscated_post'].tolist()

In [27]:
#choose sample size of 10%
sample_size = int(0.1 * len(original_texts)) 

sample_original_texts = original_texts[:sample_size]
sample_obfuscated_texts = obfuscated_texts[:sample_size]

#METEOR-score of sample size, comparing obfuscated and original text       
meteor_scores = []
with tqdm(total=sample_size) as pbar:
    for orig_text, geo_text in zip(sample_original_texts, sample_obfuscated_texts):
        meteor_scores.append(meteor_score.single_meteor_score(orig_text, geo_text))
        pbar.update(1)

100%|██████████| 11446/11446 [1:28:37<00:00,  2.15it/s]


In [28]:
meteor_score = sum(meteor_scores) / len(meteor_scores)
print("METEOR-score:", meteor_score)

METEOR-score: 0.0021900821457238244


In [35]:
from nltk.translate.bleu_score import corpus_bleu

In [36]:
sample_references = [[text.split()] for text in sample_original_texts]

In [37]:
sample_obfuscated = [text.split() for text in sample_obfuscated_texts]

In [38]:
with tqdm(total=1) as pbar:
    sample_blue_score = corpus_bleu(sample_references, sample_obfuscated)
    pbar.update(1)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 1/1 [01:27<00:00, 87.42s/it]


In [39]:
print("BLUE-score:", sample_blue_score)

BLUE-score: 3.802398089452196e-232


In [40]:
from nltk.translate.bleu_score import SmoothingFunction

sample_references = [[text.split()] for text in sample_original_texts]

sample_obfuscated = [text.split() for text in sample_obfuscated_texts]

In [41]:
#smoothing function
smoother = SmoothingFunction().method1

In [42]:
with tqdm(total=1) as pbar:
    sample_blue_score = corpus_bleu(sample_references, sample_obfuscated, smoothing_function=smoother)
    pbar.update(1)

100%|██████████| 1/1 [01:24<00:00, 84.51s/it]


In [43]:
print("BLUE-score:", sample_blue_score)

BLUE-score: 1.4053529525523834e-07
