## Bots and Gender Profiling
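Task description: [PAN @ CLEF 2019 – Author Profiling](https://pan.webis.de/clef19/pan19-web/author-profiling.html). This notebook covers only the bot-vs-human part of the task.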

### loading the dataset from pickle and training a simple baseline

In [3]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

#### unpickling the dataframe

In [4]:
# Load the pre-cleaned train and test tweet frames and report their shapes.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
df_train = pd.read_pickle("./pan19_df_clean_train_no_uris.pkl")
df_test = pd.read_pickle("./pan19_df_clean_test_no_uris.pkl")

train_shape, test_shape = df_train.shape, df_test.shape
print(f"train size: {train_shape}, test size: {test_shape}")

train size: (412000, 4), test size: (264000, 4)


In [5]:
# Peek at the first five training rows (raw tweet next to its cleaned form).
df_train.head(n=5)

Unnamed: 0,author,tweet,bot,clean_tweet
0,7fbb9ceb600ebc6fcadc9ee235cda580,"During a brief chat, Hope Hicks accidentally c...",bot,brief chat hope hick accidentally convinces pr...
1,7fbb9ceb600ebc6fcadc9ee235cda580,"Sighing deeply, Lamar Smith informs President ...",bot,sighing deeply lamar smith informs president t...
2,7fbb9ceb600ebc6fcadc9ee235cda580,"At a playground, Stephen Miller mistakenly tel...",bot,playground stephen miller mistakenly tell bets...
3,7fbb9ceb600ebc6fcadc9ee235cda580,"On the South Lawn, Louie Gohmert persuades Jef...",bot,south lawn louie gohmert persuades jeff sessio...
4,7fbb9ceb600ebc6fcadc9ee235cda580,"In the West Wing, Sean Spicer boldly informs K...",bot,west wing sean spicer boldly informs kellyanne...


#### transforming target variables

In [6]:
# Learn the label -> integer encoding from the training targets only.
# Despite the variable name this is a plain LabelBinarizer on a two-class
# column, so transform() yields a single 0/1 column rather than a
# multi-label indicator matrix.
multilabel_binarizer = LabelBinarizer().fit(df_train["bot"])

# Show the learned classes in encoding order.
list(multilabel_binarizer.classes_)

['bot', 'human']

In [7]:
# Encode the 'bot' column of both splits with the binarizer fitted on the
# training data; each result is an (n_samples, 1) array.
ytrain = multilabel_binarizer.transform(df_train['bot'])
ytest = multilabel_binarizer.transform(df_test['bot'])

# Report the array shape directly: prints the same text as the former
# (len(y), len(y[0])) tuple but does not raise IndexError on an empty split.
print(f"train dimensions: {ytrain.shape}")
print(f"test dimensions: {ytest.shape}")

train dimensions: (412000, 1)
test dimensions: (264000, 1)


In [8]:
# Model inputs are the pre-cleaned tweet strings.
xtrain = df_train.clean_tweet
xtest = df_test.clean_tweet

# The first number is the number of tweets; the second is the character
# length of the FIRST tweet in the split (a sanity check, not a feature
# dimension). .iloc[0] is used so the lookup is positional and does not
# depend on the index containing the label 0.
print(f"train dimensions: {len(xtrain), len(xtrain.iloc[0])}")
print(f"test dimensions: {len(xtest), len(xtest.iloc[0])}")

train dimensions: (412000, 118)
test dimensions: (264000, 30)


In [9]:
# Display the first cleaned training tweet; .iloc makes the access
# positional instead of a lookup of the index label 0.
xtrain.iloc[0]

'brief chat hope hick accidentally convinces president trump nazi heavily armed adam sandler get tackled secret service'

In [10]:
# TF-IDF features: the vocabulary and IDF weights are learned from the
# training tweets only and then re-used unchanged for the test tweets.
# max_df=0.8 drops terms present in more than 80% of the training tweets;
# max_features=10000 caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8)
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xtest_tfidf = tfidf_vectorizer.transform(xtest)

print(xtrain_tfidf.shape, xtest_tfidf.shape)

(412000, 10000) (264000, 10000)


#### training

In [11]:
# Gaussian naive Bayes baseline trained on the densified TF-IDF matrix.
# NOTE(review): .toarray() materialises the full matrix; at the shape
# printed for xtrain_tfidf (412000 x 10000) that is roughly 33 GB if the
# values are float64 -- confirm the host has that much memory.
NB_model = GaussianNB()
NB_model.fit(
    X=xtrain_tfidf.toarray(),
    y=ytrain.ravel(),  # flatten the (N, 1) label column to (N,)
)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None, var_smoothing=1e-09)

#### prediction

In [12]:
# Predict a 0/1 label for every test tweet.
# NOTE(review): as with training, .toarray() densifies the whole test
# matrix in one go (264000 x 10000 per the shape printed earlier).
y_predict_nb = NB_model.predict(X=xtest_tfidf.toarray())

In [14]:
# with url_token
# NOTE(review): this cell and the following one carry the same execution
# count and score the same variables, yet their recorded outputs differ.
# Presumably the output below stems from an earlier run on a dataset
# variant that kept a URL token -- confirm, and re-run on a fresh kernel.
nb_accuracy = accuracy_score(ytest, y_predict_nb)
nb_f1_macro = f1_score(ytest, y_predict_nb, average="macro")
print(f"accuracy_score: {nb_accuracy}")
print(f"f1-macro: {nb_f1_macro}")

accuracy_score: 0.6679393939393939
f1-macro: 0.6501582179259409


In [14]:
# Test-set accuracy plus micro- and macro-averaged F1 for the NB baseline.
print(f"accuracy_score: {accuracy_score(ytest, y_predict_nb)}")
for avg in ("micro", "macro"):
    print(f"f1-{avg}: {f1_score(ytest, y_predict_nb, average=avg)}")

accuracy_score: 0.6566212121212122
f1-micro: 0.6566212121212122
f1-macro: 0.6494156046860575


In [16]:
# Optional, intentionally disabled: snapshot the whole kernel session to disk
# with dill. Uncomment both lines to use it.
# import dill
# dill.dump_session('nb_naive_bayes.db')