<a href="https://colab.research.google.com/github/kowshii27-rbg/Crime-Data-Analysis-India/blob/main/DAV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Location of the raw CSV. The path sits under /content/drive, so this cell
# presumably needs Google Drive mounted first; no mount call is visible in
# this notebook -- TODO confirm.
DATA_PATH = '/content/drive/MyDrive/crime_dataset_india.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows. A bare last expression uses the notebook's rich
# table display, which is not line-wrapped the way the printed text dump is.
df.head()

   Report Number     Date Reported Date of Occurrence Time of Occurrence  \
0              1  02-01-2020 00:00   01-01-2020 00:00   01-01-2020 01:11   
1              2  01-01-2020 19:00   01-01-2020 01:00   01-01-2020 06:26   
2              3  02-01-2020 05:00   01-01-2020 02:00   01-01-2020 14:30   
3              4  01-01-2020 05:00   01-01-2020 03:00   01-01-2020 14:46   
4              5  01-01-2020 21:00   01-01-2020 04:00   01-01-2020 16:51   

        City  Crime Code Crime Description  Victim Age Victim Gender  \
0  Ahmedabad         576    IDENTITY THEFT          16             M   
1    Chennai         128          HOMICIDE          37             M   
2   Ludhiana         271        KIDNAPPING          48             F   
3       Pune         170          BURGLARY          49             F   
4       Pune         421         VANDALISM          30             F   

    Weapon Used   Crime Domain  Police Deployed Case Closed  Date Case Closed  
0  Blunt Object  Violent Crime

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only adds a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40160 entries, 0 to 40159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Report Number       40160 non-null  int64 
 1   Date Reported       40160 non-null  object
 2   Date of Occurrence  40160 non-null  object
 3   Time of Occurrence  40160 non-null  object
 4   City                40160 non-null  object
 5   Crime Code          40160 non-null  int64 
 6   Crime Description   40160 non-null  object
 7   Victim Age          40160 non-null  int64 
 8   Victim Gender       40160 non-null  object
 9   Weapon Used         34370 non-null  object
 10  Crime Domain        40160 non-null  object
 11  Police Deployed     40160 non-null  int64 
 12  Case Closed         40160 non-null  object
 13  Date Case Closed    20062 non-null  object
dtypes: int64(4), object(10)
memory usage: 4.3+ MB
None


In [None]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Report how many values are missing per column, then replace every missing
# value with 0.
# NOTE(review): the columns with missing values ('Weapon Used',
# 'Date Case Closed') hold strings, so filling with integer 0 leaves them with
# mixed str/int values -- confirm this is intended.
missing_per_column = df.isnull().sum()
print(missing_per_column)
df = df.fillna(0)

# Keep only the columns whose name does not start with 'Unnamed'.
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]


Report Number             0
Date Reported             0
Date of Occurrence        0
Time of Occurrence        0
City                      0
Crime Code                0
Crime Description         0
Victim Age                0
Victim Gender             0
Weapon Used            5790
Crime Domain              0
Police Deployed           0
Case Closed               0
Date Case Closed      20098
dtype: int64


In [None]:
# Replace every object-dtype column with integer codes from a fresh
# LabelEncoder (one independent encoder per column).
object_columns = df.select_dtypes(include='object').columns
for column in object_columns:
    values = df[column]
    # A column holding more than one Python type (e.g. str and int) is cast
    # to str first so the encoder sees a single, sortable type.
    has_mixed_types = values.apply(type).nunique() > 1
    if has_mixed_types:
        values = values.astype(str)
    df[column] = LabelEncoder().fit_transform(values)

In [None]:
# Build a binary target from the row-wise sum of the numeric columns.
# 'Year' is dropped if present. The two derived columns are also excluded so
# that re-running this cell yields the same values instead of folding the
# previous 'Total_Crime' / 'Crime_Level' into the new sum.
# NOTE(review): the printed frame shows the summed columns include
# 'Report Number' and label-encoded dates/categories, so 'Total_Crime' is not
# an actual crime count. 'Crime_Level' is also a deterministic function of the
# columns kept in X (target leakage), which likely explains the near-perfect
# scores further down -- confirm the intended target.
crime_cols = df.select_dtypes(include=np.number).columns.drop(
    ['Year', 'Total_Crime', 'Crime_Level'], errors='ignore')

# Row-wise total over the selected columns.
df['Total_Crime'] = df[crime_cols].sum(axis=1)

# Binary target: 1 when the row total is strictly above the median, else 0.
df['Crime_Level'] = (df['Total_Crime'] > df['Total_Crime'].median()).astype(int)

# Features are every column except the two derived ones.
X = df.drop(columns=['Total_Crime', 'Crime_Level'])
y = df['Crime_Level']
print(df.head())
df.info()  # info() prints its own summary and returns None; no print() needed


   Report Number  Date Reported  Date of Occurrence  Time of Occurrence  City  \
0              1            836                   0                   0     1   
1              2              2                   1                   1     4   
2              3            838                   2                   3    15   
3              4              0                   3                   4    21   
4              5              3                   4                   5    21   

   Crime Code  Crime Description  Victim Age  Victim Gender  Weapon Used  \
0         576                 11          16              1            1   
1         128                 10          37              1            6   
2         271                 13          48              0            1   
3         170                  2          49              0            3   
4         421                 19          30              0            5   

   Crime Domain  Police Deployed  Case Closed  Date Case

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training features only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Training and Prediction

RandomForestClassifier

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the scaled
# training set; fit() returns the estimator, so it can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test_scaled)

Evaluation Metrics and Confusion Matrix

In [None]:
# Compute the random-forest metrics on the held-out split, then print them.
rf_conf_matrix = confusion_matrix(y_test, y_pred)
rf_class_report = classification_report(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", rf_conf_matrix)
print("\nClassification Report:\n", rf_class_report)
print("\nAccuracy Score:", rf_accuracy)

Confusion Matrix:
 [[3993   53]
 [  66 3920]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4046
           1       0.99      0.98      0.99      3986

    accuracy                           0.99      8032
   macro avg       0.99      0.99      0.99      8032
weighted avg       0.99      0.99      0.99      8032


Accuracy Score: 0.9851842629482072


LogisticRegression

In [None]:
# Logistic-regression classifier: iteration cap raised to 1000, fixed seed.
logreg = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [None]:
# Fit on the scaled training data and predict the held-out rows in one chain
# (fit() returns the estimator itself).
y_pred_logreg = logreg.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the predictions with each metric in turn.
conf_matrix, class_report, accuracy = (
    metric(y_test, y_pred_logreg)
    for metric in (confusion_matrix, classification_report, accuracy_score)
)


In [None]:
# Show the logistic-regression metrics computed in the previous cell.
print("Logistic Regression Evaluation:")
for label, value in (
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
    ("\nAccuracy Score:", accuracy),
):
    print(label, value)

Logistic Regression Evaluation:

Confusion Matrix:
 [[4039    7]
 [   0 3986]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4046
           1       1.00      1.00      1.00      3986

    accuracy                           1.00      8032
   macro avg       1.00      1.00      1.00      8032
weighted avg       1.00      1.00      1.00      8032


Accuracy Score: 0.9991284860557769


SVM

In [None]:
# RBF-kernel support vector classifier with a fixed seed.
svm_model = SVC(kernel='rbf', random_state=42)

# Fit on the scaled training data, then predict the held-out rows
# (fit() returns the estimator, so the calls can be chained).
y_pred_svm = svm_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# Evaluation metrics for the SVM.
# These must be computed from y_pred_svm. The earlier version scored y_pred
# (the random-forest predictions), so the "SVM" results were a copy of the
# random-forest results.
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)


In [None]:
# Show the SVM metrics computed in the previous cell.
print("Support Vector Machine (SVM) Evaluation:")
for label, value in (
    ("\nConfusion Matrix:\n", conf_matrix_svm),
    ("\nClassification Report:\n", class_report_svm),
    ("\nAccuracy Score:", accuracy_svm),
):
    print(label, value)


Support Vector Machine (SVM) Evaluation:

Confusion Matrix:
 [[3993   53]
 [  66 3920]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4046
           1       0.99      0.98      0.99      3986

    accuracy                           0.99      8032
   macro avg       0.99      0.99      0.99      8032
weighted avg       0.99      0.99      0.99      8032


Accuracy Score: 0.9851842629482072
