In [2]:
#sources code
#https://www.analyticsvidhya.com/blog/2019/07/how-get-started-nlp-6-unique-ways-perform-tokenization/
#https://huggingface.co/docs/tokenizers/pipeline
#https://stackoverflow.com/questions/41912083/nltk-tokenize-faster-way
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#https://www.nltk.org/howto/wordnet.html
#https://subscription.packtpub.com/book/data/9781782167853/1/ch01lvl1sec15/looking-up-lemmas-and-synonyms-in-wordnet
#https://www.kaggle.com/code/roblexnana/nlp-with-nltk-tokenizing-text-and-wordnet-basics

In [1]:
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report

In [2]:
data_dirs = list(set(glob('/home/marilu/training_dfs/*.csv')) - 
                 set(glob('/home/marilu/training_dfs/*_ft.csv')) - 
                 set(glob('/home/marilu/training_dfs/*balanced.csv')))
data_dirs

['/home/marilu/training_dfs/feeling_thinking.csv',
 '/home/marilu/training_dfs/judging_perceiving.csv',
 '/home/marilu/training_dfs/nationality.csv',
 '/home/marilu/training_dfs/gender.csv',
 '/home/marilu/training_dfs/birth_year.csv',
 '/home/marilu/training_dfs/sensing_intuitive.csv',
 '/home/marilu/training_dfs/extrovert_introvert.csv',
 '/home/marilu/training_dfs/political_leaning.csv']

In [4]:
data = pd.read_csv(data_dirs[7])
data

Unnamed: 0,auhtor_ID,post,political_leaning
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right
4,t2_7ramzeng,This article's intention is clear that they wa...,right
...,...,...,...
114458,t2_vi35s,hard as I have to go out of my way to find med...,center
114459,t2_vi35s,"WORLD WILL BE MINE! Well if you read it, then ...",center
114460,t2_vyu81f9,Wow super passing there sir. I’m jelly. Aesthe...,left
114461,t2_vyu81f9,compliment your face. Okay fair enough. I supp...,left


In [5]:
vec = TfidfVectorizer()
X_post = vec.fit_transform(data['post'])

In [6]:
y_political = data['political_leaning']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_post, y_political, test_size=0.3, random_state=42)

In [8]:
classifier = LogisticRegression(max_iter=6000)

In [9]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=6000)

In [10]:
#y_pred
predictions = classifier.predict(X_test)

In [11]:
print("Classification report:")
print(classification_report(y_test, predictions))

Classification report:
              precision    recall  f1-score   support

      center       0.65      0.77      0.71     14160
        left       0.69      0.61      0.65      9625
       right       0.73      0.63      0.68     10554

    accuracy                           0.68     34339
   macro avg       0.69      0.67      0.68     34339
weighted avg       0.69      0.68      0.68     34339



In [12]:
#Tokenize the posts
tokenized_posts = data['post'].apply(lambda x: x.split())

In [13]:
#Create word-to-number mapping
word_to_num = {}
current_num = 1 

In [14]:
for post in tokenized_posts:
    for word in post:
        if word not in word_to_num:
            word_to_num[word] = current_num
            current_num += 1

In [15]:
#Replace words with numbers in the dataset
obfuscated_data = data.copy()
obfuscated_data['obfuscated_post'] = tokenized_posts.apply(lambda post: ' '.join(str(word_to_num[word]) for word in post))

In [16]:
vec_new = TfidfVectorizer()
X_post_new = vec.fit_transform(obfuscated_data['obfuscated_post'])
y_political_new = obfuscated_data['political_leaning']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_post_new, y_political_new, test_size=0.3, random_state=42)

In [22]:
classifier = LogisticRegression(max_iter=6000)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=6000)

In [23]:
predictions = classifier.predict(X_test)

In [24]:
print("Classification report:")
print(classification_report(y_test, predictions))

Classification report:
              precision    recall  f1-score   support

      center       0.66      0.78      0.71     14160
        left       0.70      0.62      0.66      9625
       right       0.73      0.63      0.68     10554

    accuracy                           0.69     34339
   macro avg       0.70      0.67      0.68     34339
weighted avg       0.69      0.69      0.68     34339



In [25]:
obfuscated_data

Unnamed: 0,auhtor_ID,post,political_leaning,obfuscated_post
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right,1 2 3 4 5 6 7 8 9 10 11 4 12 13 14 15 16 17 18...
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right,146 760 59 761 762 763 764 765 766 767 768 769...
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right,1295 1296 192 569 1297 1227 611 1298 1047 203 ...
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right,1690 59 94 1527 1730 1731 70 72 1399 94 1732 1...
4,t2_7ramzeng,This article's intention is clear that they wa...,right,387 2145 2146 19 2147 70 164 760 59 2148 94 21...
...,...,...,...,...
114458,t2_vi35s,hard as I have to go out of my way to find med...,center,1631 856 17 93 59 132 99 52 178 195 59 538 100...
114459,t2_vi35s,"WORLD WILL BE MINE! Well if you read it, then ...",center,48885 29707 32168 2045829 7407 211 106 1570 10...
114460,t2_vyu81f9,Wow super passing there sir. I’m jelly. Aesthe...,left,30091 7916 621 27 45274 707 20277 568473 33419...
114461,t2_vyu81f9,compliment your face. Okay fair enough. I supp...,left,79019 216 10651 24245 8183 1632 17 1083 44 146...


In [26]:
import nltk
from nltk.corpus import wordnet
import random

In [27]:
#Function to replace words with their synonyms
def replace_synonyms(sentence):
    tokens = nltk.word_tokenize(sentence)
    new_tokens = []
    for token in tokens:
        syns = wordnet.synsets(token)
        if syns:
            synonym = syns[0].lemmas()[0].name()
            new_tokens.append(synonym)
        else:
            new_tokens.append(token)
    return ' '.join(new_tokens)

In [28]:
#Apply obfuscation to the posts
obfuscated_data['paraphrased_post'] = data['post'].apply(replace_synonyms)

In [29]:
X_paraphrased = obfuscated_data['paraphrased_post']
y_paraphrased = obfuscated_data['political_leaning']
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_paraphrased, y_paraphrased, test_size=0.3, random_state=42)

In [30]:
vec_paraphrased = TfidfVectorizer()
X_train_vec_p = vec_paraphrased.fit_transform(X_train_p)
X_test_vec_p = vec_paraphrased.transform(X_test_p)

In [31]:
classifier_p = LogisticRegression(max_iter=6000)
classifier_p.fit(X_train_vec_p, y_train_p)

LogisticRegression(max_iter=6000)

In [32]:
predictions_p = classifier_p.predict(X_test_vec_p)

In [33]:
print("Classification report for paraphrased posts:")
print(classification_report(y_test_p, predictions_p))

Classification report for paraphrased posts:
              precision    recall  f1-score   support

      center       0.65      0.76      0.70     14160
        left       0.68      0.61      0.64      9625
       right       0.72      0.63      0.67     10554

    accuracy                           0.68     34339
   macro avg       0.68      0.66      0.67     34339
weighted avg       0.68      0.68      0.67     34339



In [34]:
X_paraphrased

0         You can `` bargain '' the show and stream info...
1         Maine privation to play Q * bert holy_place cr...
2         Should n't trust on any external services Oreg...
3         praseodymium to angstrom particular person . n...
4         This article 's purpose be clear that they pri...
                                ...                        
114458    difficult arsenic iodine rich_person to go out...
114459    universe volition beryllium mine ! well if you...
114460    belly_laugh superintendent pass there sir . io...
114461    compliment your face . O.K. carnival enough . ...
114462    and attempt to populate yours fifty you spend ...
Name: paraphrased_post, Length: 114463, dtype: object