In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelBinarizer

import plotly.graph_objects as go

In [11]:
# Load the pre-computed train/test frames (cleaned tweets plus hand-crafted
# feature columns).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project's own preprocessing step.
df_train = pd.read_pickle("./pan19_df_clean_train_full_features.pkl")
df_test = pd.read_pickle("./pan19_df_clean_test_full_features.pkl")

# Optional subsampling for quick experiments.
# WARNING: keep this at None for the real run. Setting it to e.g. 20_000 keeps
# the first 5 * num_examples training rows and the first num_examples test
# rows, so every downstream shape and score changes accordingly.
num_examples = None
if num_examples is not None:
    # Positional slicing, so it works regardless of the index labels.
    df_train = df_train.iloc[: 5 * num_examples]
    df_test = df_test.iloc[:num_examples]

print(f"train size: {df_train.shape}, test size: {df_test.shape}")

train size: (412000, 20), test size: (264000, 20)


In [3]:
# Names of the hand-crafted feature columns, grouped by theme.
word_features = [
    "word_count",
    "char_count",
    "word_density",
    "total_length",
    "capitals",
    "caps_vs_length",
]
punctuation_features = [
    "num_exclamation_marks",
    "num_question_marks",
    "num_punctuation",
    "num_symbols",
]
uniques_features = [
    "num_unique_words",
    "words_vs_unique",
    "word_unique_percent",
]
means_features = [
    "num_retweet",
    "num_url",
    "num_number",
]

# Flat list of every hand-crafted feature, in group order.
all_new_features = [
    *word_features,
    *punctuation_features,
    *uniques_features,
    *means_features,
]

In [4]:
# Encode the 'bot' column as numeric labels. The encoder is fitted on the
# training labels only and then reused for the test labels.
multilabel_binarizer = LabelBinarizer()
multilabel_binarizer.fit(df_train['bot'])
print(f"labels: {list(multilabel_binarizer.classes_)}")
print()

ytrain = multilabel_binarizer.transform(df_train['bot'])
ytest = multilabel_binarizer.transform(df_test['bot'])
# .shape reports (rows, columns) directly and does not fail on an empty array.
print(f"train label dimensions: {ytrain.shape}")
print(f"test label dimensions: {ytest.shape}")

xtrain = df_train.clean_tweet
xtest = df_test.clean_tweet
# The second number is the length of the FIRST entry (presumably the character
# count of the first cleaned tweet), not a column count. .iloc[0] is
# positional, so it still works when the index has no label 0 (e.g. after
# filtering or shuffling the frame).
print(f"train dimensions: {len(xtrain), len(xtrain.iloc[0])}")
print(f"test dimensions: {len(xtest), len(xtest.iloc[0])}")

labels: ['bot', 'human']

train label dimensions: (100000, 1)
test label dimensions: (20000, 1)
train dimensions: (100000, 220)
test dimensions: (20000, 24)


In [5]:
# Hyperparameters used by the cells below.

# TF-IDF vectorizer settings
tfidf_ngram_range = (1, 3)
tfidf_max_features = 10_000
tfidf_max_df = 0.8

# Feature-selection / classifier settings
clf_random = 31337   # seed handed to the classifier's random_state
k_features = 500     # ANOVA feature count (k of the univariate filter)

In [6]:
# Build the TF-IDF representation; the vectorizer is fitted on the training
# tweets and only applied (not re-fitted) to the test tweets.
tfidf_vectorizer = TfidfVectorizer(
    max_df=tfidf_max_df,
    max_features=tfidf_max_features,
    ngram_range=tfidf_ngram_range,
)

xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xtest_tfidf = tfidf_vectorizer.transform(xtest)
print(f"train shape: {xtrain_tfidf.shape}\ntest shape: {xtest_tfidf.shape}")

X_train = xtrain_tfidf
X_test = xtest_tfidf
y_train = ytrain
y_test = ytest

# Quick look at what a chi2-based SelectKBest returns: its shape, its type,
# and the shape/type after converting the result to a dense array.
print()
print(X_train.shape)
chi2_selector = SelectKBest(chi2, k=20)
X_train_new = chi2_selector.fit_transform(X_train, y_train)
X_train_new_dense = X_train_new.toarray()
print(X_train_new.shape)
print(type(X_train_new))
print(type(X_train_new_dense))
print(X_train_new_dense.shape)

train shape: (100000, 10000)
test shape: (20000, 10000)

(100000, 10000)
(100000, 20)
<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>
(100000, 20)


In [7]:
## Classification without hand-crafted features

from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = xtrain_tfidf, xtest_tfidf, ytrain, ytest

# ANOVA SVM-C
# 1) univariate filter: keep the k_features best-scoring TF-IDF columns
# 2) linear SVM trained on the selected columns
# NOTE(review): f_regression is kept so the scores stay comparable with the
# recorded results; f_classif is the conventional scorer for a classification
# target -- consider switching both pipelines together.
anova_filter = SelectKBest(f_regression, k=k_features)
clf = svm.LinearSVC(max_iter=5000, dual=False, random_state=clf_random)

anova_svm = make_pipeline(anova_filter, clf)
# Fit on the sparse matrix directly, exactly as predict() below receives it.
# Converting to a dense array first allocates rows x columns floats, which
# does not fit in memory for the full training set.
# y_train is a column vector; ravel() flattens it to the 1-D shape fit() expects.
anova_svm.fit(X_train, y_train.ravel())

y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))

# Map the SVM weights from the selected columns back onto the original
# feature space; columns dropped by the filter come back as zeros.
coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_)
print(coef)

Automatically created module for IPython interactive environment
              precision    recall  f1-score   support

           0       0.75      0.78      0.76     10000
           1       0.77      0.74      0.76     10000

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000

[[0. 0. 0. ... 0. 0. 0.]]


In [8]:
# Accuracy and macro-averaged F1 of the current y_pred against the test labels.
# Original note for this run: no mentions, hashtags, urls, numbers
acc = accuracy_score(ytest, y_pred)
f1_macro = f1_score(ytest, y_pred, average='macro')
print(f"accuracy_score: {acc}")
print(f"f1-macro: {f1_macro}")

accuracy_score: 0.76
f1-macro: 0.7599362174536153


In [9]:
import scipy as sp
# Import the submodule explicitly: `import scipy` alone does not guarantee
# that `scipy.sparse` is loaded on every SciPy version.
from scipy import sparse


def _append_handcrafted(tfidf_matrix, frame, columns):
    """Return ``tfidf_matrix`` with ``frame[columns]`` appended as extra columns.

    Args:
        tfidf_matrix: sparse matrix with one row per row of ``frame``.
        frame: DataFrame holding the hand-crafted feature columns.
        columns: list of column names to append, in order.

    Returns:
        A CSR matrix of shape (n_rows, tfidf_columns + len(columns)).
    """
    # Cast to float up front so mixed int/float columns give one numeric dtype.
    extra = sparse.csr_matrix(frame[columns].to_numpy(dtype=float))
    return sparse.hstack((tfidf_matrix, extra), format="csr")


# NOTE(review): the hand-crafted columns are appended unscaled next to the
# TF-IDF weights -- confirm this is intended for the linear SVM.
X_train_full = _append_handcrafted(xtrain_tfidf, df_train, all_new_features)
X_test_full = _append_handcrafted(xtest_tfidf, df_test, all_new_features)
print (f"train shape: {X_train_full.shape}\ntest shape: {X_test_full.shape}")
print(type(X_train_full))
# Inspect one stacked row (the second sample) and its shape.
print(X_train_full[1])
print(X_train_full[1].shape)

train shape: (100000, 10016)
test shape: (20000, 10016)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 231)	0.1070060091441348
  (0, 244)	0.1329811514246474
  (0, 492)	0.14275085349911712
  (0, 965)	0.0946049494358762
  (0, 984)	0.15361995118469748
  (0, 985)	0.16269323971366992
  (0, 1312)	0.16642715988597387
  (0, 1822)	0.10859843992917596
  (0, 1829)	0.1396898145391235
  (0, 1830)	0.1403264374150171
  (0, 1952)	0.09513522262420086
  (0, 2387)	0.12139924566861027
  (0, 2388)	0.1457718014524301
  (0, 2389)	0.14713823460049275
  (0, 2621)	0.10014302858402735
  (0, 2631)	0.1457718014524301
  (0, 2632)	0.1457718014524301
  (0, 2637)	0.13001022806493032
  (0, 2638)	0.16269323971366992
  (0, 3030)	0.09858266331338617
  (0, 3036)	0.11112700913670026
  (0, 3038)	0.14533846197443526
  (0, 4015)	0.16146262086256608
  (0, 4024)	0.10212074040138394
  (0, 4064)	0.1369401189789827
  :	:
  (0, 8753)	0.13762395491559562
  (0, 8910)	0.13153348473043952
  (0, 9028)	0.078053783936493
  (0, 9039)	0.1453384

In [10]:
# Univariate filter + linear SVM pipeline, fitted on the stacked
# (TF-IDF + hand-crafted) feature matrix.
anova_filter = SelectKBest(score_func=f_regression, k=k_features)
clf = svm.LinearSVC(random_state=clf_random, dual=False, max_iter=5000)

# y_train is a column vector; ravel() flattens it for fit().
anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X_train_full, y_train.ravel())

y_pred = anova_svm.predict(X_test_full)

# Per-class report first, then the two headline numbers.
report = classification_report(y_test, y_pred)
print(report)
acc = accuracy_score(ytest, y_pred)
print(f"accuracy_score: {acc}")
f1_macro = f1_score(ytest, y_pred, average='macro')
print(f"f1-macro: {f1_macro}")

              precision    recall  f1-score   support

           0       0.77      0.78      0.77     10000
           1       0.77      0.76      0.77     10000

    accuracy                           0.77     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.77      0.77      0.77     20000

accuracy_score: 0.76985
f1-macro: 0.7698436563157771
