In [155]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [156]:
data = pd.read_json('chat-export.json')
data = pd.json_normalize(data['messages'])

data = data.drop(data.columns.difference(['content.body', 'sender']), axis=1)
data = data.dropna()

np.savetxt('messages.txt', data.to_numpy(), fmt='%s')

data

Unnamed: 0,sender,content.body
5,@joey_:matrix.org,ich küss eure augen mashallah
6,@sonnentod:matrix.org,image.png
7,@sonnentod:matrix.org,"TLDR: spaces = server, rooms = server channels"
8,@sonnentod:matrix.org,element ist der normie client der aber gut fun...
9,@sonnentod:matrix.org,kannst alle möglichen funktionen durch addons ...
...,...,...
11039,@sonnentod:matrix.org,muss iwie alle restrictions umgehen
11040,@sonnentod:matrix.org,F-droid ist ja mal insane
11041,@sonnentod:matrix.org,da sind richtig coole apps drauf
11042,@sonnentod:matrix.org,NewPipe zB ist eine yt app auf der man zB auch...


In [163]:
# count vocabulary, better method below.

# data['words'] = data['content.body'].str.split()
# words = np.concatenate(data['words'].values)
# unique_words = np.unique(words)
# word_count = len(unique_words)
# word_count

In [161]:
train, test = train_test_split(data)

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', NearestCentroid())
])

train_X = train['content.body'].to_numpy()
train_y = train['sender'].to_numpy()

test_X = test['content.body'].to_numpy()
test_y = test['sender'].to_numpy()

pipeline.fit(train_X, train_y)


In [159]:
# predictions = pipeline.predict(np.array(['''ah nice hatte das damals auf dem alten handy, aber die haben jetzt ja iwie legal stress bekommen und ist schwer zu finden7
# schick plz
# probiere jetzt "SchildiChat" auf dem handy für matrix''']))
predictions = pipeline.predict(test_X)
print(np.count_nonzero(predictions == '@sonnentod:matrix.org'))
print(np.count_nonzero(predictions == '@moneo:matrix.org'))
print(np.count_nonzero(predictions == '@joey_:matrix.org'))
print(np.count_nonzero(predictions == '@JWS:matrix.org'))
np.savetxt('predictions.txt', predictions, fmt='%s')

133
2137
115
0


In [162]:
# test_y
print(classification_report(predictions, test_y))
vocab = pipeline.named_steps['vectorizer'].vocabulary_.keys()
print(len(vocab))
with open('vocab.txt', 'w') as file:
    # Write the string representation of the dictionary to the file
    file.write(str(vocab))

                         precision    recall  f1-score   support

        @JWS:matrix.org       0.00      0.00      0.00         0
@demonzocker:matrix.org       0.13      0.10      0.11       352
      @joey_:matrix.org       0.04      0.10      0.06       115
      @moneo:matrix.org       0.79      0.28      0.41      2137
  @sonnentod:matrix.org       0.05      0.49      0.08       133

               accuracy                           0.26      2737
              macro avg       0.20      0.19      0.13      2737
           weighted avg       0.64      0.26      0.34      2737

49904


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
