In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
data = pd.read_csv('train.csv') 
# Initialize stopwords, lemmatizer, and punctuation removal
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation_table = str.maketrans('', '', string.punctuation)

# Function to preprocess text robustly
def robust_preprocessing(text):
    # Lowercase the text
    text = text.lower()

    # Remove URLs, email addresses, and phone numbers
    text = re.sub(r'http\S+|www\S+|https\S+|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|\b\d{10,}\b', '', text)

    # Remove special characters and punctuation
    text = text.translate(punctuation_table)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

# Example usage on your dataset
data['cleaned_crimeaditionalinfo'] = data['crimeaditionalinfo'].astype(str).apply(robust_preprocessing)

# Display the cleaned data
data[['crimeaditionalinfo', 'cleaned_crimeaditionalinfo']].head()

# Save the cleaned dataset if needed


[nltk_data] Downloading package punkt to C:\Users\Karthick
[nltk_data]     Raja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Karthick
[nltk_data]     Raja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Karthick
[nltk_data]     Raja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NameError: name 'df' is not defined

In [11]:
data = pd.read_csv('train.csv') 
data.to_csv('cleaned_train.csv', index=False)  # Save the cleaned data to a new CSV file


In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch

# Load the preprocessed dataset
df = pd.read_csv('cleaned_train.csv')  # Load the cleaned data

# Fill missing values in 'cleaned_crime_info' with an empty string
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].fillna('')

# ====================
# 1. TF-IDF with N-grams
# ====================
tfidf_vectorizer = TfidfVectorizer(
    max_features=3000,  # Reduce the number of features
    ngram_range=(1, 2),  # Reduce the n-gram range
    analyzer='word',
    min_df=10,           # Increase the minimum document frequency
    max_df=0.5,          # Reduce max document frequency to exclude common words
    stop_words='english'
)


# Fit and transform TF-IDF on the entire dataset
X_tfidf_ngram = tfidf_vectorizer.fit_transform(df['crimeaditionalinfo'])
X_tfidf_ngram_dense = X_tfidf_ngram.toarray()
print("TF-IDF feature shape:", X_tfidf_ngram_dense.shape)

# ====================
# 2. Word2Vec Embeddings
# ====================
nltk.download('punkt')
df['tokenized_text'] = df['crimeaditionalinfo'].apply(nltk.word_tokenize)

# Train Word2Vec model on the entire tokenized corpus
word2vec_model = Word2Vec(
    sentences=df['tokenized_text'],
    vector_size=300, window=5, sg=1, min_count=5, workers=4
)

# Function to generate Word2Vec feature vectors for each document
def get_word2vec_features(tokenized_text, model, vector_size=300):
    feature_vec = np.zeros((vector_size,), dtype="float32")
    n_words = 0
    for word in tokenized_text:
        if word in model.wv:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# Apply Word2Vec embeddings to each document
X_word2vec = np.array([get_word2vec_features(tokens, word2vec_model, 300) for tokens in df['tokenized_text']])
print("Word2Vec feature shape:", X_word2vec.shape)

# ====================
# 3. BERT Embeddings
# ====================
# Load pre-trained BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for each document
def get_bert_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Apply BERT embeddings to all documents
X_bert = np.array([get_bert_embeddings(text, bert_tokenizer, bert_model) for text in df['crimeaditionalinfo']])
print("BERT embeddings shape:", X_bert.shape)

# ====================
# 4. Concatenate All Features
# ====================
# Concatenate TF-IDF, Word2Vec, and BERT features
X_combined = np.hstack((X_tfidf_ngram_dense, X_word2vec, X_bert))
print("Combined feature shape:", X_combined.shape)

# Save combined features for future use
np.save('X_combined_features.npy', X_combined)


TF-IDF feature shape: (93686, 3000)


[nltk_data] Downloading package punkt to C:\Users\Karthick
[nltk_data]     Raja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Word2Vec feature shape: (93686, 300)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB 3.3 MB/s eta 0:00:08
   ---------------------------------------- 0.2/24.0 MB 1.7 MB/s eta 0:00:15
   ---------------------------------------- 0.2/24.0 MB 1.5 MB/s eta 0:00:16
    --------------------------------------- 0.3/24.0 MB 1.5 MB/s eta 0:00:17
    --------------------------------------- 0.5/24.0 MB 1.8 MB/s eta 0:00:14
    --------------------------------------- 0.6/24.0 MB 1.8 MB/s eta 0:00:13
   - -------------------------------------- 0.7/24.0 MB 1.9 MB/s eta 0:00:13
   - -------------------------------------- 0.9/24.0 MB 2.1 MB/


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
from googletrans import Translator
import time

# Load the dataset
file_path = 'train.csv'
data = pd.read_csv(file_path)

# Initialize the translator
translator = Translator()

# Function to translate text to English
def translate_to_english(text):
    try:
        # Detect and translate to English if necessary
        translation = translator.translate(text, src='auto', dest='en')
        return translation.text
    except Exception as e:
        print(f"Error translating: {e}")
        return text  # If an error occurs, return the original text

# Apply translation to the 'crimeaditionalinfo' column and store in new column
data['crimeaditionalinfo_english'] = data['crimeaditionalinfo'].apply(translate_to_english)

# Save the updated DataFrame to a new CSV file
data.to_csv('translated_train.csv', index=False)

print("Translation completed and saved to 'translated_train.csv'")


ImportError: cannot import name 'DEFAULT_CLIENT_SERVICE_URLS' from 'googletrans.constants' (C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\Lib\site-packages\googletrans\constants.py)

In [2]:
pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Using cached httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached h11-0.9.0-py2.py3-none-any.whl.metadata (8.1 kB)
Using cached httpx-0.13.3-py3-none-any.whl (55 kB)
Using cached httpcore-0.9.1-py3-none-any.whl (42 kB)
Using cached h11-0.9.0-py2.py3-none-any.whl (53 kB)
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py): started
  Building wheel for googletrans (setup.py): finished with status 'done'
  Created wheel for googletrans: filename=googletrans-4.0.0rc1-py3-none-an

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datallm 0.2.6 requires httpx>=0.25.1, but you have httpx 0.13.3 which is incompatible.
mostlyai 0.3.13 requires httpx>=0.25.0, but you have httpx 0.13.3 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [4]:
pip install deep-translator


Collecting deep-translatorNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip



  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
   ---------------------------------------- 0.0/42.3 kB ? eta -:--:--
   ---------------------------------------- 42.3/42.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [4]:
import pandas as pd
from deep_translator import GoogleTranslator
import time

# Load the dataset
file_path = 'sample.csv'
data = pd.read_csv(file_path)

# Function to translate text to English
def translate_to_english(text):
    try:
        # Translate text to English
        translation = GoogleTranslator(source='auto', target='en').translate(text)
        return translation
    except Exception as e:
        print(f"Error translating: {e}")
        return text  # If an error occurs, return the original text

# Apply translation to the 'crimeaditionalinfo' column and store in new column
data['crimeaditionalinfo_english'] = data['crimeaditionalinfo'].apply(translate_to_english)

# Save the updated DataFrame to a new CSV file
data.to_csv('translated_train.csv', index=False)

print("Translation completed and saved to 'translated_train.csv'")


Translation completed and saved to 'translated_train.csv'


In [6]:
def translate_to_english(text):
    try:
        # Translate text to English
        translation = GoogleTranslator(source='auto', target='en').translate(text)
        return translation
    except Exception as e:
        print(f"Error translating: {e}")
        return text  # If an error occurs, return the original text

In [8]:
translate_to_english("Call karke bola ki aapka lotary laga ha aru AC no maga aru atm ka numbers aru phir bola pasa aayega lekin pasa nahi aaya uskebad bola ki is  par thousand bhej do to aajayega to hum ne pasa bhej diya phir pata chala ki who fraud tha")

'He called and said that you have won a lottery and asked for the AC number and the ATM number and then said that you will get the money but I did not get the money, after that he said if you send a thousand rupees for this then you will get it, so I sent the money, then I found out who was the fraud.'

In [9]:
import pandas as pd
from deep_translator import GoogleTranslator
import time

# Load the dataset
file_path = 'train.csv'
data = pd.read_csv(file_path)

# Function to translate text to English with error handling
def translate_to_english(text):
    if pd.isna(text):  # Skip NaN values
        return text
    if len(text) > 5000:  # Trim text if too long
        text = text[:5000]
    try:
        translation = GoogleTranslator(source='auto', target='en').translate(text)
        return translation
    except Exception as e:
        print(f"Error translating: {e}")
        return text  # Return original text on failure

# Apply translation with retry for connection issues
def translate_with_retry(text, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return translate_to_english(text)
        except Exception as e:
            print(f"Retry {attempt+1}/{retries} failed: {e}")
            time.sleep(delay)  # Wait before retrying
    return text  # Return original if all retries fail

# Apply translation to 'crimeaditionalinfo' and store in new column
data['crimeaditionalinfo_english'] = data['crimeaditionalinfo'].apply(translate_with_retry)

# Save the updated DataFrame to a new CSV file
data.to_csv('translated_train.csv', index=False)

print("Translation completed and saved to 'translated_train.csv'")


Error translating: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))
Error translating: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=auto&q=He+asked+for+payment+on+whatsapp+and+asked+whether+its+confirmed+I+said+yes+then+he+blocked+me+from+whatsapp+I+had+sent+money+to+him+and+did+not+get+the+money+back+Sent+money+via+paytm+On+website+after+processing+payment+for+product+it+said+your+order+is+cancelled+without+telling+me+reason+or+what+shall+i+do+next (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000125736A9450>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error translating: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=auto&q=Online+Frauds+++Loan+Fraud++Loan+Apps%0D%0AThe+victim+downloaded+a+loan+application+

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))


KeyboardInterrupt: 

In [1]:
import pandas as pd
import requests
import time

# Load the dataset
file_path = 'train.csv'
data = pd.read_csv(file_path)

# URL for the LibreTranslate API
LIBRETRANSLATE_URL = "http://localhost:5000/translate"

# Function to translate text to English using LibreTranslate with error handling
def translate_to_english(text):
    if pd.isna(text):  # Skip NaN values
        return text
    if len(text) > 5000:  # Trim text if too long
        text = text[:5000]
    try:
        response = requests.post(
            LIBRETRANSLATE_URL,
            json={'q': text, 'source': 'auto', 'target': 'en'}
        )
        response.raise_for_status()
        translation = response.json().get('translatedText', text)
        return translation
    except Exception as e:
        print(f"Error translating: {e}")
        return text  # Return original text on failure

# Apply translation with retry for connection issues
def translate_with_retry(text, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return translate_to_english(text)
        except Exception as e:
            print(f"Retry {attempt+1}/{retries} failed: {e}")
            time.sleep(delay)  # Wait before retrying
    return text  # Return original if all retries fail

# Apply translation to 'crimeaditionalinfo' and store in new column
data['crimeaditionalinfo_english'] = data['crimeaditionalinfo'].apply(translate_with_retry)

# Save the updated DataFrame to a new CSV file
data.to_csv('translated_train.csv', index=False)

print("Translation completed and saved to 'translated_train.csv'")


Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error translating: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without resp

KeyboardInterrupt: 

In [1]:
import pandas as pd
import nltk
from nltk.corpus import words

# Download the English words dictionary if needed
nltk.download('words')
english_words = set(words.words())

# Load the original CSV file
file_path = 'train.csv'  # Replace with the actual file path
data = pd.read_csv(file_path)

# Function to check if a sentence contains any non-English word
def contains_non_english(sentence):
    if not isinstance(sentence, str):
        return False
    
    # Split the sentence into words and check if any word is non-English
    words_in_sentence = sentence.split()
    for word in words_in_sentence:
        if word.lower() not in english_words:
            return True  # Found a non-English word
    return False

# Filter rows that contain any non-English words in 'crimeaditionalinfo' column
non_english_rows = data[data['crimeaditionalinfo'].apply(contains_non_english)]

# Remove duplicate rows if any
non_english_rows = non_english_rows.drop_duplicates()

# Save non-English rows to a new CSV file
non_english_file_path = 'non_english_crime_info.csv'  # Save path
non_english_rows.to_csv(non_english_file_path, index=False)

print(f"Rows with non-English words saved to {non_english_file_path}")


[nltk_data] Downloading package words to C:\Users\Karthick
[nltk_data]     Raja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


Rows with non-English words saved to non_english_crime_info.csv


In [2]:
import pandas as pd
data = pd.read_csv('non_english_crime_info.csv')
data = data.drop_duplicates()
data.to_csv('non_english_cleaned.csv', index=False)


In [3]:
misaligned_rows = data[data['category'].apply(lambda x: isinstance(x, str) and 'expected_subcategory' in x)]
print(misaligned_rows)


Empty DataFrame
Columns: [category, sub_category, crimeaditionalinfo]
Index: []


In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
data = pd.read_csv('sample.csv') 
# Initialize stopwords, lemmatizer, and punctuation removal
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation_table = str.maketrans('', '', string.punctuation)

# Function to preprocess text robustly
def robust_preprocessing(text):
    # Lowercase the text
    text = text.lower()

    # Remove URLs, email addresses, and phone numbers
    text = re.sub(r'http\S+|www\S+|https\S+|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|\b\d{10,}\b', '', text)

    # Remove special characters and punctuation
    text = text.translate(punctuation_table)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

# Example usage on your dataset
data['cleaned_crimeaditionalinfo'] = data['crimeaditionalinfo_english'].astype(str).apply(robust_preprocessing)

# Display the cleaned data
data[['crimeaditionalinfo_english', 'cleaned_crimeaditionalinfo']].head()

# Save the cleaned dataset if needed


In [6]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import re

# Load training data
train_data = pd.read_csv('translated_train.csv')

# Function for basic text preprocessing
def simple_preprocess_text(text):
    # Check if the text is valid
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
        tokens = [word for word in text.split() if len(word) > 1]  # Filter out single-letter words
        return ' '.join(tokens)
    else:
        return ''  # Return empty string for invalid entries

# Apply preprocessing
train_data['processed_crime_info'] = train_data['crimeaditionalinfo_english'].apply(simple_preprocess_text)

# Map category labels to integers
train_data['category_label'] = train_data['category'].factorize()[0]
category_mapping = dict(enumerate(train_data['category'].factorize()[1]))

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['processed_crime_info'], train_data['category_label'], test_size=0.2, random_state=42
)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# Define a Dataset class for PyTorch
class CrimeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create Dataset objects for training and validation
train_dataset = CrimeDataset(train_encodings, list(train_labels))
val_dataset = CrimeDataset(val_encodings, list(val_labels))

# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(category_mapping))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./crime_classification_model')
tokenizer.save_pretrained('./crime_classification_model')



Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss
1,No log,1.573208
2,No log,1.531239
3,No log,1.519043


('./crime_classification_model\\tokenizer_config.json',
 './crime_classification_model\\special_tokens_map.json',
 './crime_classification_model\\vocab.txt',
 './crime_classification_model\\added_tokens.json')

In [8]:
### Load and Preprocess Test Data ###
test_data = pd.read_csv('testing.csv')
test_data['processed_crime_info'] = test_data['crimeaditionalinfo'].apply(simple_preprocess_text)
test_encodings = tokenizer(list(test_data['processed_crime_info']), truncation=True, padding=True, max_length=128)

# If test data has true labels, convert to integers as well
if 'category' in test_data.columns:
    test_data['category_label'] = test_data['category'].map(lambda x: category_mapping.get(x, -1))
    test_labels = test_data['category_label'].tolist()
else:
    test_labels = None  # No labels available for test set

# Create test dataset
test_dataset = CrimeDataset(test_encodings, test_labels if test_labels else [0] * len(test_data))

# Evaluate on the test set
test_results = trainer.evaluate(test_dataset)
print(f"Test Evaluation Results: {test_results}")

# Predict on the test set
test_outputs = trainer.predict(test_dataset)
test_predictions = torch.argmax(torch.tensor(test_outputs.predictions), dim=1)

# Map predictions back to category names
predicted_categories = [category_mapping[label.item()] for label in test_predictions]

# Add predictions to the test DataFrame
test_data['predicted_category'] = predicted_categories

# Display predictions
print(test_data[['crimeaditionalinfo', 'category', 'predicted_category']].head())

IndexError: Target -1 is out of bounds.

In [3]:
import pandas as pd

# Load the CSV file
file_path = 'top500.csv'
df = pd.read_csv(file_path)

# Display initial data info to understand structure
print("Initial Data Info:")
print(df.info())

# Constraint 1: If 'sub-category' is empty, check if 'category' is not one of the specified values
specified_categories = [
    'Women/Child Related Crime',
    'Financial Fraud Crimes',
    'Other Cyber Crime'
]

# Apply the constraint
df['sub-category'] = df.apply(
    lambda row: row['category'] if pd.isna(row['sub_category']) and row['category'] not in specified_categories else row['sub_category'],
    axis=1
)

# Display changes
print("Updated Data Preview:")
print(df.head())

# Save the modified DataFrame to a new CSV file
output_file_path = 'top500_updated.csv'
df.to_csv(output_file_path, index=False)
print(f"Updated file saved to {output_file_path}")


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   category            500 non-null    object
 1   sub_category        463 non-null    object
 2   crimeaditionalinfo  500 non-null    object
dtypes: object(3)
memory usage: 11.8+ KB
None
Updated Data Preview:
                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  \
0  I had continue received random call

In [6]:
df['category'].unique()

array(['Online and Social Media Related Crime', 'Online Financial Fraud',
       'Online Gambling  Betting',
       'RapeGang Rape RGRSexually Abusive Content',
       'Any Other Cyber Crime', 'Cyber Attack/ Dependent Crimes',
       'Cryptocurrency Crime', 'Sexually Explicit Act',
       'Sexually Obscene material',
       'Hacking  Damage to computercomputer system etc',
       'Cyber Terrorism',
       'Child Pornography CPChild Sexual Abuse Material CSAM',
       'Online Cyber Trafficking'], dtype=object)

In [10]:
df['sub_category'].unique()

array(['Cyber Bullying  Stalking  Sexting', 'Fraud CallVishing',
       'Online Gambling  Betting', 'Online Job Fraud',
       'UPI Related Frauds', 'Internet Banking Related Fraud', nan,
       'Other', 'Profile Hacking Identity Theft',
       'DebitCredit Card FraudSim Swap Fraud', 'EWallet Related Fraud',
       'Data Breach/Theft', 'Cheating by Impersonation',
       'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks',
       'FakeImpersonating Profile', 'Cryptocurrency Fraud',
       'Malware Attack', 'Business Email CompromiseEmail Takeover',
       'Email Hacking', 'Hacking/Defacement',
       'Unauthorised AccessData Breach', 'SQL Injection',
       'Provocative Speech for unlawful acts', 'Ransomware Attack',
       'Cyber Terrorism', 'Tampering with computer source documents',
       'DematDepository Fraud', 'Online Trafficking',
       'Online Matrimonial Fraud'], dtype=object)