In [10]:
import pandas as pd
from googletrans import Translator

# Load your data
data = pd.read_csv('C:/Users/hr893/Downloads/top500_updated.csv')

# Initialize translator
translator = Translator()

# Define a function to translate text
def translate_text(text):
    if pd.isna(text):  # Check if the entry is None or NaN
        return ""
    try:
        # Translate text to English
        return translator.translate(text, dest='en').text
    except Exception as e:
        print(f"Error translating text: {e}")
        return text  # Return original if translation fails

# Apply the translation function to the "crimeadditionalinfo" column
data['crimeaditionalinfo'] = data['crimeaditionalinfo'].apply(translate_text)

# Save the updated dataframe
data.to_csv('D:/Downloads/sample500_english2.csv', index=False)
print("Translation complete. The file has been saved as 'translated_sample100.csv'.")

Error translating text: sequence item 0: expected str instance, NoneType found



KeyboardInterrupt



In [11]:
import pandas as pd
import re
from nltk.corpus import stopwords

# Load the dataset
data = pd.read_csv('D:/Downloads/sample500_english1.csv')

# Download stopwords if you haven't done so
import nltk
nltk.download('stopwords')

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

# Function for advanced text preprocessing
def advanced_preprocess_text(text):
    # Check if the text is valid
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
        # Remove email addresses
        text = re.sub(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b', '', text)
        # Remove phone numbers
        text = re.sub(r'\+?\d[\d -]{8,12}\d', '', text)  # Adjust regex for phone numbers as necessary
        # Remove special characters and numbers, keeping only alphabetic characters
        text = re.sub(r'[^a-z\s]', '', text)
        # Tokenize and filter out short words and stopwords
        tokens = [word for word in text.split() if len(word) > 1 and word not in stop_words]
        # Join tokens back to a single string
        return ' '.join(tokens)
    else:
        return ''  # Return an empty string for non-text entries

# Apply the advanced preprocessing function
data['processed_crime_info'] = data['crimeaditionalinfo'].apply(advanced_preprocess_text)

# Display the first few rows to check the preprocessing
print(data[['crimeaditionalinfo', 'processed_crime_info']].head())
data.to_csv('D:/Downloads/sample500_english2.csv', index=False)

                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                                processed_crime_info  
0  continue received random calls abusive message...  
1  fraudster continuously messaging asking pay mo...  
2  acting like police demanding money adding sect...  
3  apna job applied job interview telecalling res...  
4  received call lady stating send new phone vivo...  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hr893\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load dataset
data = pd.read_csv('D:/Downloads/sample500_english2.csv')

# Select relevant columns and drop any NaN rows
data = data[['processed_crime_info', 'category', 'sub-category']].dropna()



In [13]:
# Encode labels
label_encoder_cat = LabelEncoder()
label_encoder_subcat = LabelEncoder()
data['category_label'] = label_encoder_cat.fit_transform(data['category'])
data['sub_category_label'] = label_encoder_subcat.fit_transform(data['sub-category'])



In [15]:
# Split data
X_train, X_test, y_train_cat, y_test_cat, y_train_subcat, y_test_subcat = train_test_split(
    data['processed_crime_info'], 
    data['category_label'], 
    data['sub_category_label'], 
    test_size=0.2, 
    random_state=42
)



In [16]:


# Dataset class for PyTorch DataLoader
class CrimeDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        inputs = self.tokenizer(text, padding='max_length', truncation=True, return_tensors="pt", max_length=128)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        label = torch.tensor(self.labels[idx])
        inputs['labels'] = label
        return inputs



In [17]:
# Load BERT models for sequence classification (one for each target)
model_cat = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder_cat.classes_))
model_subcat = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder_subcat.classes_))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
)



In [20]:
# Metric function for evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1': f1}



In [25]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train_cat = y_train_cat.reset_index(drop=True).tolist()
y_test_cat = y_test_cat.reset_index(drop=True).tolist()
y_train_subcat = y_train_subcat.reset_index(drop=True).tolist()
y_test_subcat = y_test_subcat.reset_index(drop=True).tolist()

In [28]:
# Prepare the datasets
train_dataset_cat = CrimeDataset(X_train, y_train_cat)
test_dataset_cat = CrimeDataset(X_test, y_test_cat)
train_dataset_subcat = CrimeDataset(X_train, y_train_subcat)
test_dataset_subcat = CrimeDataset(X_test, y_test_subcat)



In [29]:
# Trainer for category
trainer_cat = Trainer(
    model=model_cat,
    args=training_args,
    train_dataset=train_dataset_cat,
    eval_dataset=test_dataset_cat,
    compute_metrics=compute_metrics
)

# Trainer for sub_category
trainer_subcat = Trainer(
    model=model_subcat,
    args=training_args,
    train_dataset=train_dataset_subcat,
    eval_dataset=test_dataset_subcat,
    compute_metrics=compute_metrics
)



In [30]:
# Training the models
print("Training Category Model...")
trainer_cat.train()
print("Training Sub-category Model...")
trainer_subcat.train()

# Evaluate the models
print("\nCategory classification evaluation:")
cat_eval_results = trainer_cat.evaluate()
print(cat_eval_results)

print("\nSub-category classification evaluation:")
subcat_eval_results = trainer_subcat.evaluate()
print(subcat_eval_results)


Training Category Model...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.18844,0.646465,0.508608
2,No log,1.071212,0.747475,0.674196
3,No log,1.056067,0.686869,0.646713
4,No log,1.278542,0.676768,0.655129
5,No log,1.310532,0.717172,0.670096


Training Sub-category Model...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,2.547632,0.262626,0.176153
2,No log,2.098235,0.40404,0.336918
3,No log,2.007906,0.474747,0.421667
4,No log,1.93438,0.454545,0.409056
5,No log,1.908747,0.515152,0.462578



Category classification evaluation:


{'eval_loss': 1.3105322122573853, 'eval_accuracy': 0.7171717171717171, 'eval_f1': 0.6700962325962326, 'eval_runtime': 20.7079, 'eval_samples_per_second': 4.781, 'eval_steps_per_second': 1.207, 'epoch': 5.0}

Sub-category classification evaluation:


{'eval_loss': 1.908746600151062, 'eval_accuracy': 0.5151515151515151, 'eval_f1': 0.46257753938233875, 'eval_runtime': 20.2321, 'eval_samples_per_second': 4.893, 'eval_steps_per_second': 1.236, 'epoch': 5.0}


In [31]:
# Save the models
model_cat.save_pretrained('D:/Downloads/category_model')
model_subcat.save_pretrained('D:/Downloads/subcategory_model')

# Save the label encoders
import joblib

joblib.dump(label_encoder_cat, 'category_label_encoder.pkl')
joblib.dump(label_encoder_subcat, 'subcategory_label_encoder.pkl')

print("Models and label encoders saved successfully.")


Models and label encoders saved successfully.


In [37]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib

# Load the models
model_cat = BertForSequenceClassification.from_pretrained('D:/Downloads/category_model')
model_subcat = BertForSequenceClassification.from_pretrained('D:/Downloads/subcategory_model')

# Load the label encoders
label_encoder_cat = joblib.load('category_label_encoder.pkl')
label_encoder_subcat = joblib.load('subcategory_label_encoder.pkl')

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to preprocess input text
def preprocess_text(text):
    text = text.lower()  # Lowercase the text
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    tokens = [word for word in text.split() if len(word) > 1]  # Filter out short words
    return ' '.join(tokens)  # Join tokens back to a single string

# Input text
input_text = """
He was called from Maharashtra and created fake video of mine and demanding money from me or else saying that he will post that fake video in internet facebook youtube telegram He demanding many thousands of rupees

"""

# Preprocess the input text
processed_text = preprocess_text(input_text)

# Tokenize the input text
inputs = tokenizer(processed_text, padding='max_length', truncation=True, return_tensors='pt', max_length=128)

# Make predictions for category
model_cat.eval()
with torch.no_grad():
    category_logits = model_cat(**inputs).logits

# Make predictions for sub-category
model_subcat.eval()
with torch.no_grad():
    subcategory_logits = model_subcat(**inputs).logits

# Get predicted category and sub-category labels
predicted_category_index = torch.argmax(category_logits, dim=1).item()
predicted_subcategory_index = torch.argmax(subcategory_logits, dim=1).item()

# Map indices back to labels
predicted_category = label_encoder_cat.inverse_transform([predicted_category_index])[0]
predicted_subcategory = label_encoder_subcat.inverse_transform([predicted_subcategory_index])[0]

# Print the predictions
print(f"Predicted Category: {predicted_category}")
print(f"Predicted Sub-category: {predicted_subcategory}")


Predicted Category: Online and Social Media Related Crime
Predicted Sub-category: Cyber Bullying  Stalking  Sexting


In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('top500_updated.csv')

nltk.download('stopwords')

# Set of stop words in English
stop_words = set(stopwords.words('english'))
# Preprocessing
# Drop rows with missing values in essential columns (e.g., 'crimeaditionalinfo')
data.dropna(subset=['crimeaditionalinfo', 'category', 'sub-category'], inplace=True)

# Text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply text cleaning
data['crimeaditionalinfo'] = data['crimeaditionalinfo'].apply(clean_text)

# Encode categorical labels
label_encoder_category = LabelEncoder()
label_encoder_sub_category = LabelEncoder()
data['category'] = label_encoder_category.fit_transform(data['category'])
data['sub-category'] = label_encoder_sub_category.fit_transform(data['sub-category'])

# Split dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Initialize the tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define function to tokenize text data
def tokenize_text(data, max_length=128):
    return tokenizer(
        data['crimeaditionalinfo'].tolist(),
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="tf"
    )

# Tokenize the training and validation data
train_encodings = tokenize_text(train_data)
val_encodings = tokenize_text(val_data)

# Prepare labels
train_labels_category = train_data['category'].values
train_labels_sub_category = train_data['sub-category'].values
val_labels_category = val_data['category'].values
val_labels_sub_category = val_data['sub-category'].values

# Compute class weights for handling class imbalance
category_class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels_category),
    y=train_labels_category
)
sub_category_class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels_sub_category),
    y=train_labels_sub_category
)

category_class_weights_dict = dict(enumerate(category_class_weights))
sub_category_class_weights_dict = dict(enumerate(sub_category_class_weights))

# Load the BERT model for category and sub-category classification
num_category_labels = len(np.unique(train_labels_category))
num_sub_category_labels = len(np.unique(train_labels_sub_category))

category_model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_category_labels)
sub_category_model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_sub_category_labels)

# Compile the models with Adam optimizer and sparse categorical crossentropy
optimizer = Adam(learning_rate=2e-5)  # Adjust learning rate if needed

category_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

sub_category_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the category model
print("Training Category Model...")
history_category = category_model.fit(
    train_encodings['input_ids'], train_labels_category,
    validation_data=(val_encodings['input_ids'], val_labels_category),
    batch_size=16,  # Adjust based on your GPU capacity
    epochs=5,
    class_weight=category_class_weights_dict,
    callbacks=[early_stopping]
)

# Train the sub-category model
print("Training Sub-Category Model...")
history_sub_category = sub_category_model.fit(
    train_encodings['input_ids'], train_labels_sub_category,
    validation_data=(val_encodings['input_ids'], val_labels_sub_category),
    batch_size=16,  # Adjust based on your GPU capacity
    epochs=5,
    class_weight=sub_category_class_weights_dict,
    callbacks=[early_stopping]
)

# Evaluate the category model
print("Evaluating Category Model...")
category_eval = category_model.evaluate(val_encodings['input_ids'], val_labels_category)
print("Category Model Evaluation:", category_eval)

# Evaluate the sub-category model
print("Evaluating Sub-Category Model...")
sub_category_eval = sub_category_model.evaluate(val_encodings['input_ids'], val_labels_sub_category)
print("Sub-Category Model Evaluation:", sub_category_eval)

# Print model evaluation metrics for each model
print("Final Category Model Evaluation:")
print(f"Loss: {category_eval[0]}, Accuracy: {category_eval[1]}")

print("Final Sub-Category Model Evaluation:")
print(f"Loss: {sub_category_eval[0]}, Accuracy: {sub_category_eval[1]}")


[nltk_data] Downloading package stopwords to C:\Users\Karthick
[nltk_data]     Raja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
No module named 'keras.engine'

In [6]:
pip install -U transformers tensorflow keras


^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from transformers import MBartForConditionalGeneration, MBartTokenizer
from langdetect import detect, DetectorFactory

# Fix seed for reproducibility
DetectorFactory.seed = 0

# Load the mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Function to translate text
def translate(text):
    try:
        # Detect the language of the input text
        detected_lang = detect(text)

        # Map detected language to mBART source language codes
        lang_mapping = {
            'en': 'en_XX',  # English
            'hi': 'hi_IN',  # Hindi
            'es': 'es_XX',  # Spanish
            'fr': 'fr_XX',  # French
            'de': 'de_DE',  # German
            'ja': 'ja_XX',  # Japanese
            # Add other mappings as necessary
        }

        # Check if the detected language is supported
        if detected_lang in lang_mapping:
            source_lang = lang_mapping[detected_lang]
        else:
            return f"Language '{detected_lang}' is not supported for translation."
        
        # Set the target language to English
        target_lang = 'en_XX'

        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", src_lang=source_lang)

        # Generate translation
        translated_tokens = model.generate(inputs["input_ids"], forced_bos_token_id=tokenizer.lang2id[target_lang])

        # Decode the translated tokens
        return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    except Exception as e:
        return str(e)

# Load the CSV file
input_file_path = "top2000.csv"  # Replace with your input CSV file path
df = pd.read_csv(input_file_path)

# Ensure the CSV has the 'text' column for translation
if 'crimeaditionalinfo' not in df.columns:
    raise ValueError("CSV must contain a 'text' column.")

# Translate only the 'text' column
df['translated_text'] = df['crimeaditionalinfo'].apply(translate)

# Save the results to a new CSV file
output_file_path = "translated_texts.csv"  # Path for the output CSV file
df.to_csv(output_file_path, index=False)

print(f"Translations saved to {output_file_path}.")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


In [10]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     - ----------------------------------- 30.7/981.5 kB 660.6 kB/s eta 0:00:02
     - ----------------------------------- 30.7/981.5 kB 660.6 kB/s eta 0:00:02
     - ----------------------------------- 30.7/981.5 kB 660.6 kB/s eta 0:00:02
     ---- ------------------------------- 112.6/981.5 kB 656.4 kB/s eta 0:00:02
     ---- ------------------------------- 122.9/981.5 kB 599.1 kB/s eta 0:00:02
     ------- ---------------------------- 194.6/981.5 kB 737.3 kB/s eta 0:00:02
     -------- --------------------------- 235.5/981.5 kB 801.7 kB/s eta 0:00:01
     -------- --------------------------- 235.5/981.5 kB 801.7 kB/s eta 0:00:01
     ---------- ------------------------- 286.7/981.5 kB 737.3 kB/s eta 0:00:01
     ----------- ------------------------ 317.4/981.5 kB 785.7 kB/s eta 0:00:01
     ------------- ---------------------- 368.6/981.5 kB 7


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [12]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   - ------------------------------------- 30.7/991.5 kB 660.6 kB/s eta 0:00:02
   - ------------------------------------- 41.0/991.5 kB 495.5 kB/s eta 0:00:02
   -- ------------------------------------ 61.4/991.5 kB 469.7 kB/s eta 0:00:02
   -- ------------------------------------ 71.7/991.5 kB 435.7 kB/s eta 0:00:03
   --- ---------------------------------- 102.4/991.5 kB 454.0 kB/s eta 0:00:02
   ---- --------------------------------- 122.9/991.5 kB 450.6 kB/s eta 0:00:02
   ----- -------------------------------- 133.1/991.5 kB 413.7 kB/s eta 0:00:03
   ----- -------------------------------- 153.6/991.5 kB 437.1 kB/s eta 0:00:02
   ------ ------------------------------- 174.1/991.5 kB 456.4 kB/s eta 0:00:02
   ------- ----------------------------


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [15]:
pip install --upgrade transformers


Collecting transformers
  Using cached transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Using cached huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.5-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Using cached tokenizers-0.20.2-cp311-none-win_amd64.whl.metadata (6.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached transformers-4.46.1-py3-none-any.whl (10.0 MB)
Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
   ---------------------------------------- 0.0/447.5 kB ? eta -:--:--
   ----- ---------------------------------- 61.4/447.5 kB 3.2 MB/s eta 0:00:01
   ------ --------------------------------- 71.7/447.5 kB 1.3 MB/s eta 0:00:01
   ---------- -----------

  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Karthick Raja\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip
