Skip to content

Commit

Permalink
Improved SPAM filter and added levenshtein based threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
Francesco Cosentino committed Dec 18, 2023
1 parent f0940cc commit 458e0f7
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 20 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ application:
recoverer_delay: 60
# Enable the anti-spam feature
anti_spam_enabled: True
# Time window in seconds during which a message similar to a previous one is treated as a duplicate
anti_spam_similarity_timeframe: 60
# Anti-spam similarity threshold: a value from 0 to 1, where 1 means the messages are identical
anti_spam_similarity_threshold: 0.8

# Management API configuration
api:
Expand Down
2 changes: 2 additions & 0 deletions api/routers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ async def get_config(self) -> ConfigSchema:
healthcheck_interval=config.application.healthcheck_interval,
recoverer_delay=config.application.recoverer_delay,
anti_spam_enabled=config.application.anti_spam_enabled,
anti_spam_similarity_timeframe=config.application.anti_spam_similarity_timeframe,
anti_spam_similarity_threshold=config.application.anti_spam_similarity_threshold,
)

api_config = APIConfig(
Expand Down
30 changes: 20 additions & 10 deletions bridge/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
from typing import Dict, List, Optional

import yaml

# from pydantic import RootModel # pylint: disable=import-error
# from pydantic import SecretStr # pylint: disable=import-error
from pydantic import BaseModel, StrictInt, model_validator, validator

_instances: Dict[str, "Config"] = {}
Expand Down Expand Up @@ -44,9 +41,6 @@ class Config:
"""Forwarder config."""

str_max_length = 64
# error_msg_templates = {
# "value_error.any_str.max_length": "max_length:{limit_value}",
# }

@model_validator(mode="before")
def forward_everything_validator(cls, values):
Expand Down Expand Up @@ -237,10 +231,6 @@ class TelegramConfig(BaseModel): # pylint: disable=too-few-public-methods
class Config:
"""Telegram config."""

# json_encoders = {
# SecretStr: lambda val: val.get_secret_value(),
# }

@validator("api_hash")
def api_hash_alphanumeric(cls, val):
"""API hash alphanumeric validator."""
Expand Down Expand Up @@ -301,6 +291,8 @@ class ApplicationConfig(BaseModel): # pylint: disable=too-few-public-methods
recoverer_delay: float = 60.0
internet_connected: bool = False
anti_spam_enabled: bool = False
anti_spam_similarity_timeframe: float = 60.0
anti_spam_similarity_threshold: float = 1.0

@validator("version")
def version_validator(cls, val):
Expand Down Expand Up @@ -337,6 +329,24 @@ def recoverer_delay_validator(cls, val):
assert val > 3600, "recoverer_delay must be < 3600"
return val

@validator("anti_spam_similarity_timeframe")
def anti_spam_similarity_timeframe_validator(cls, val):
    """Validate the anti-spam similarity timeframe.

    Args:
        val: The configured timeframe, in seconds.

    Returns:
        The value unchanged when it lies within [10, 3600].

    Raises:
        ValueError: If the value is below 10 or above 3600.
    """
    # Raise explicitly instead of asserting: the former `assert val < 10`
    # nested inside `if val < 10` was always true, so out-of-range values
    # were silently accepted (and asserts vanish entirely under `python -O`).
    if val < 10:
        raise ValueError("anti_spam_similarity_timeframe must be >= 10")
    if val > 3600:
        raise ValueError("anti_spam_similarity_timeframe must be <= 3600")
    return val

@validator("anti_spam_similarity_threshold")
def anti_spam_similarity_threshold_validator(cls, val):
    """Validate the anti-spam similarity threshold.

    Args:
        val: The configured threshold, a ratio where 1 means identical.

    Returns:
        The value unchanged when it lies within [0, 1].

    Raises:
        ValueError: If the value is below 0 or above 1.
    """
    # Raise explicitly instead of asserting: the former `assert val < 0`
    # nested inside `if val < 0` was always true, so out-of-range values
    # were silently accepted (and asserts vanish entirely under `python -O`).
    if val < 0:
        raise ValueError("anti_spam_similarity_threshold must be >= 0")
    if val > 1:
        raise ValueError("anti_spam_similarity_threshold must be <= 1")
    return val


class APIConfig(BaseModel): # pylint: disable=too-few-public-methods
"""API config."""
Expand Down
36 changes: 27 additions & 9 deletions bridge/history/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

import asyncio
import json
import time
from typing import Any, List, Optional

import aiofiles
import Levenshtein
from telethon import TelegramClient
from telethon.tl.types import Message

Expand Down Expand Up @@ -181,19 +183,35 @@ async def fetch_messages_after(
async def is_duplicate_message(
    self, telegram_message: Message, channel_id: int, tgc: TelegramClient
) -> bool:
    """Detect whether a message is similar to one recently sent in the channel.

    The text of ``telegram_message`` is compared against up to 10 messages
    fetched from ``channel_id``. Similarity is the normalized Levenshtein
    ratio ``1 - distance / len(longer text)``, so 1 means identical text.

    Args:
        telegram_message: The incoming message to check.
        channel_id: The channel whose recent messages are inspected.
        tgc: The Telegram client used to fetch those messages.

    Returns:
        True when another message, no older than
        ``config.application.anti_spam_similarity_timeframe`` seconds, has a
        similarity of at least
        ``config.application.anti_spam_similarity_threshold``; otherwise False.
    """
    logger.info("Checking if message is similar to a previous message")
    threshold = config.application.anti_spam_similarity_threshold
    timeframe = config.application.anti_spam_similarity_timeframe
    current_unix_timestamp = time.time()

    # `text` can be None/empty (e.g. media without a caption); with no text
    # there is nothing to measure, and the ratio below would divide by zero.
    current_text = telegram_message.text or ""
    if not current_text:
        logger.debug("Message has no text, skipping similarity check")
        return False

    async for message in tgc.iter_messages(channel_id, limit=10, reverse=False):
        logger.debug("Checking message: %s", message.id)
        logger.debug("Message text: %s", message.text)
        logger.debug("Current message text: %s", telegram_message.text)

        # Never compare the message with itself.
        if message.id == telegram_message.id:
            continue

        # With reverse=False the history is walked newest-first, so once a
        # message falls outside the timeframe every later one does as well.
        if current_unix_timestamp - message.date.timestamp() > timeframe:
            break

        previous_text = message.text or ""
        # current_text is non-empty, so the denominator is always >= 1.
        similarity = 1 - Levenshtein.distance(previous_text, current_text) / max(
            len(previous_text), len(current_text)
        )
        logger.debug("Similarity: %f", similarity)

        # `>=` so that a threshold of 1.0 (the default) still matches
        # identical messages; a strict `>` could never be satisfied.
        if similarity >= threshold:
            logger.warning(
                "Message with ID %s is similar to a message previously sent with ID %s",
                telegram_message.id,
                message.id,
            )
            return True

    logger.debug("Message is not similar to previous messages")
    return False
9 changes: 8 additions & 1 deletion config-example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ application:
healthcheck_interval: 10
# The time in seconds to wait before forwarding each missed message
recoverer_delay: 60
# Enable the anti-spam feature
anti_spam_enabled: True
# Time window in seconds during which a message similar to a previous one is treated as a duplicate
anti_spam_similarity_timeframe: 60
# Anti-spam similarity threshold: a value from 0 to 1, where 1 means the messages are identical
anti_spam_similarity_threshold: 0.8

# Management API configuration
api:
Expand All @@ -31,7 +37,8 @@ logger:
# format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
format: "%(asctime)s %(levelprefix)s %(message)s"
date_format: "%Y-%m-%d %H:%M:%S"
console: False # set to true to enable console logging and disable file based logging
# Whether to log to console or not
console: True # set to true to enable console logging and disable file based logging

# Telegram configuration
telegram:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ uvicorn[standard]==0.24.0.post1
ulid-py==1.1.0
python-magic==0.4.27
python-multipart==0.0.6
levenshtein==0.23.0

0 comments on commit 458e0f7

Please sign in to comment.