In [1]:
import pandas as pd
import nltk
import numpy as np


In [2]:
# Load the datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Display the first few rows
print("Train Dataset:")
print(train_df.head())
print("\nTest Dataset:")
print(test_df.head())

Train Dataset:
                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  
0  I had continue received random calls and abusi...  
1  The above fraudster is continuously messaging ...  
2  He is acting like a police and demanding for m...  
3  In apna Job I have applied for job interview f...  
4  I received a call from lady stating that she w...  

Test Dataset:
                                    category  \
0  RapeGang Rape RGRSexually Abusive Content   
1                     Online Financial Fraud   
2             Cyber A

In [3]:
print("Missing values in Train Dataset:")
print(train_df.isnull().sum())
print("\nMissing values in Test Dataset:")
print(test_df.isnull().sum())


Missing values in Train Dataset:
category                 0
sub_category          6591
crimeaditionalinfo      21
dtype: int64

Missing values in Test Dataset:
category                 0
sub_category          2236
crimeaditionalinfo       7
dtype: int64


In [4]:
# Fill missing values with empty strings (if the column is text-based)
train_df = train_df.fillna("")
test_df = test_df.fillna("")

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Define a text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-z\s]", "", text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and apply stemming
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return " ".join(tokens)

# Apply preprocessing to the `crimeaditionalinfo` column
train_df["cleaned_text"] = train_df["crimeaditionalinfo"].apply(preprocess_text)
test_df["cleaned_text"] = test_df["crimeaditionalinfo"].apply(preprocess_text)

# Check the processed text
print(train_df[["crimeaditionalinfo", "cleaned_text"]].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akbas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akbas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                                        cleaned_text  
0  continu receiv random call abus messag whatsap...  
1  fraudster continu messag ask pay money send fa...  
2  act like polic demand money ad section text me...  
3  apna job appli job interview telecal resourc m...  
4  receiv call ladi state send new phone vivo rec...  


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode categories with error handling for unseen labels
def transform_with_unseen_labels(encoder, values):
    try:
        return encoder.transform(values)
    except ValueError as e:
        # Handle unseen labels
        unseen_labels = [v for v in values if v not in encoder.classes_]
        if unseen_labels:
            print(f"Unseen labels found: {unseen_labels}")
        return np.array([encoder.transform([v])[0] if v in encoder.classes_ else -1 for v in values])

# Initialize the encoders
category_encoder = LabelEncoder()
sub_category_encoder = LabelEncoder()

# Encode the labels for training data
train_df["category_encoded"] = category_encoder.fit_transform(train_df["category"])
train_df["sub_category_encoded"] = sub_category_encoder.fit_transform(train_df["sub_category"])

# Handle unseen labels in test set
test_df["category_encoded"] = transform_with_unseen_labels(category_encoder, test_df["category"])
test_df["sub_category_encoded"] = transform_with_unseen_labels(sub_category_encoder, test_df["sub_category"])

# Check the encoded columns
print(train_df[["category", "category_encoded"]].drop_duplicates())
print(train_df[["sub_category", "sub_category_encoded"]].drop_duplicates())

Unseen labels found: ['Crime Against Women & Children', 'Crime Against Women & Children', 'Crime Against Women & Children', 'Crime Against Women & Children']
Unseen labels found: ['Computer Generated CSAM/CSEM', 'Computer Generated CSAM/CSEM', 'Cyber Blackmailing & Threatening', 'Sexual Harassment']
                                                category  category_encoded
0                  Online and Social Media Related Crime                 9
1                                 Online Financial Fraud                 7
2                               Online Gambling  Betting                 8
8              RapeGang Rape RGRSexually Abusive Content                11
9                                  Any Other Cyber Crime                 0
20                        Cyber Attack/ Dependent Crimes                 3
30                                  Cryptocurrency Crime                 2
39                                 Sexually Explicit Act                13
45                      

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode categories with error handling for unseen labels
def transform_with_unseen_labels(encoder, values):
    try:
        return encoder.transform(values)
    except ValueError as e:
        # Handle unseen labels
        unseen_labels = [v for v in values if v not in encoder.classes_]
        if unseen_labels:
            print(f"Unseen labels found: {unseen_labels}")
        return np.array([encoder.transform([v])[0] if v in encoder.classes_ else -1 for v in values])

# Initialize the encoders
category_encoder = LabelEncoder()
sub_category_encoder = LabelEncoder()

# Encode the labels for training data
train_df["category_encoded"] = category_encoder.fit_transform(train_df["category"])
train_df["sub_category_encoded"] = sub_category_encoder.fit_transform(train_df["sub_category"])

# Handle unseen labels in test set
test_df["category_encoded"] = transform_with_unseen_labels(category_encoder, test_df["category"])
test_df["sub_category_encoded"] = transform_with_unseen_labels(sub_category_encoder, test_df["sub_category"])

# Check the encoded columns
print(train_df[["category", "category_encoded"]].drop_duplicates())
print(train_df[["sub_category", "sub_category_encoded"]].drop_duplicates())

                                                category  category_encoded
0                  Online and Social Media Related Crime                10
1                                 Online Financial Fraud                 8
2                               Online Gambling  Betting                 9
8              RapeGang Rape RGRSexually Abusive Content                12
9                                  Any Other Cyber Crime                 0
20                        Cyber Attack/ Dependent Crimes                 4
30                                  Cryptocurrency Crime                 3
39                                 Sexually Explicit Act                14
45                             Sexually Obscene material                15
81        Hacking  Damage to computercomputer system etc                 6
197                                      Cyber Terrorism                 5
276    Child Pornography CPChild Sexual Abuse Materia...                 1
371                      

In [9]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

# Example: Fill missing values in the 'crimeaditionalinfo' column
train_df['crimeaditionalinfo'].fillna('', inplace=True)
test_df['crimeaditionalinfo'].fillna('', inplace=True)

category                0
sub_category            0
crimeaditionalinfo      0
cleaned_text            0
category_encoded        0
sub_category_encoded    0
dtype: int64
category                0
sub_category            0
crimeaditionalinfo      0
cleaned_text            0
category_encoded        0
sub_category_encoded    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['crimeaditionalinfo'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['crimeaditionalinfo'].fillna('', inplace=True)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Limit the number of features
train_text_features = vectorizer.fit_transform(train_df['crimeaditionalinfo'])
test_text_features = vectorizer.transform(test_df['crimeaditionalinfo'])

In [13]:
# Combine all features
train_features = sp.hstack([
    train_text_features, 
    train_df[['category_encoded', 'sub_category_encoded']]
])

test_features = sp.hstack([
    test_text_features, 
    test_df[['category_encoded', 'sub_category_encoded']]
])

In [14]:
print(train_df.columns)
print(test_df.columns)

Index(['category', 'sub_category', 'crimeaditionalinfo', 'cleaned_text',
       'category_encoded', 'sub_category_encoded'],
      dtype='object')
Index(['category', 'sub_category', 'crimeaditionalinfo', 'cleaned_text',
       'category_encoded', 'sub_category_encoded'],
      dtype='object')


In [15]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [16]:
print(train_df.columns)
print(test_df.columns)
# Now you can safely assign y_train
y_train = train_df['sub_category']
# Load your dataset
test_df = pd.read_csv('test.csv')

Index(['category', 'sub_category', 'crimeaditionalinfo'], dtype='object')
Index(['category', 'sub_category', 'crimeaditionalinfo'], dtype='object')


In [20]:
# Define features and target (labels)
X = train_df.drop('sub_category', axis=1)  # Assuming 'sub_category' is the target
y = train_df['sub_category']
x = test_df.drop('sub_category', axis=1)
y = test_df['sub_category']

In [21]:
print(train_df.dtypes)
print(test_df.dtypes)

category              object
sub_category          object
crimeaditionalinfo    object
dtype: object
category              object
sub_category          object
crimeaditionalinfo    object
dtype: object


In [22]:
from sklearn.preprocessing import LabelEncoder

# Example: Convert 'sub_category' (the target) to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['sub_category'])

# Now proceed with splitting the data as before
X = train_df.drop('sub_category', axis=1)  # features
# Example: Convert 'sub_category' (the target) to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(test_df['sub_category'])

# Now proceed with splitting the data as before
X = test_df.drop('sub_category', axis=1)  # features


In [23]:
# One-hot encode categorical columns in X
X = pd.get_dummies(train_df.drop('sub_category', axis=1))

# Now `X` contains numerical data, you can use it with RandomForestClassifier
y = train_df['sub_category']
# One-hot encode categorical columns in X
X = pd.get_dummies(test_df.drop('sub_category', axis=1))

# Now `X` contains numerical data, you can use it with RandomForestClassifier
y = test_df['sub_category']


In [24]:
# Example: Loading data from a CSV file into X
X = pd.read_csv('train.csv')

# Fill missing values with the most frequent category in each column
X = X.apply(lambda col: col.fillna(col.mode()[0]), axis=0)
# Example: Loading data from a CSV file into X
X = pd.read_csv('test.csv')

# Fill missing values with the most frequent category in each column
X = X.apply(lambda col: col.fillna(col.mode()[0]), axis=0)

In [26]:
chunk_size = 10000  # Adjust based on your memory capacity
for start in range(0, len(X), chunk_size):
    end = start + chunk_size
    chunk = X.iloc[start:end]
    chunk = chunk.apply(lambda col: col.fillna(col.mode()[0]), axis=0)
    # Append the modified chunk back to the original dataset
    X.iloc[start:end] = chunk

In [27]:
# Forward fill the missing values
X = X.fillna(method='ffill')

# Backward fill the missing values
# X = X.fillna(method='bfill')


  X = X.fillna(method='ffill')


In [28]:
import numpy as np

# Create a memory-mapped array if needed for large datasets
X = np.memmap('my_data.dat', dtype='bool', mode='w+', shape=(85028, 93686))

# After that, apply the operation
X = pd.DataFrame(X)
X = X.apply(lambda col: col.fillna(col.mode()[0]), axis=0)

In [29]:
# Check if there are any remaining missing values
missing_values = X.isnull().sum().sum()
print(f"Total missing values: {missing_values}")

# Verify the filled values by inspecting the first few rows
print(X.head())

Total missing values: 0
   0      1      2      3      4      5      6      7      8      9      ...  \
0  False  False  False  False  False  False  False  False  False  False  ...   
1  False  False  False  False  False  False  False  False  False  False  ...   
2  False  False  False  False  False  False  False  False  False  False  ...   
3  False  False  False  False  False  False  False  False  False  False  ...   
4  False  False  False  False  False  False  False  False  False  False  ...   

   93676  93677  93678  93679  93680  93681  93682  93683  93684  93685  
0  False  False  False  False  False  False  False  False  False  False  
1  False  False  False  False  False  False  False  False  False  False  
2  False  False  False  False  False  False  False  False  False  False  
3  False  False  False  False  False  False  False  False  False  False  
4  False  False  False  False  False  False  False  False  False  False  

[5 rows x 93686 columns]


In [30]:
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix

# Assuming X is your original dataset
X_sparse = csr_matrix(X)  # Convert to sparse format if it's sparse
scaler = StandardScaler(with_mean=False)  # Don't center sparse matrices
X_scaled = scaler.fit_transform(X_sparse)

In [1]:
import pandas as pd
df = pd.read_csv('train.csv')
X = df.drop('sub_category', axis=1)  # Features
y = df['sub_category']  # Target variable
df = pd.read_csv('test.csv')
X = df.drop('sub_category', axis=1)  # Features
y = df['sub_category']  # Target variable

In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data  # Feature set
y = iris.target  # Target variable

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Example: Assume you have already loaded or prepared your data
# Example feature set (X) and target variable (y)
# train_features = some_dataframe_or_numpy_array_of_features
# y_train = some_array_or_series_of_labels

# You need to load or create your features and target variables before proceeding
# Example:
# X = df.drop('target_column', axis=1)  # Features
# y = df['target_column']  # Target variable

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split( iris.data , iris.target, test_size=0.2, random_state=500000)
# Split data into training and validation sets
X_test, X_val, y_test, y_val = train_test_split( iris.data , iris.target, test_size=0.2, random_state=50000)
# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Train the model
model = RandomForestClassifier()
model.fit(X_test, y_test)
# Evaluate the model
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.88      1.00      0.93         7
           2       1.00      0.92      0.96        12

    accuracy                           0.97        30
   macro avg       0.96      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30

Accuracy: 0.9666666666666667


In [55]:
accuracy_percentage = accuracy_score(y_val, y_pred) * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")


Accuracy: 96.67%
