In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
#Ml Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
df = pd.read_csv('crimes.dataset.csv')

In [5]:
print(df.head())
print(df.info())

   Year   Population Violent Crimes  Violent crime rate  \
0  2000  281,421,906      1,425,486               506.5   
1  2001  285,317,559      1,439,480               504.5   
2  2002  287,973,924      1,423,677               494.4   
3  2003  290,788,976      1,383,676               475.8   
4  2004  293,656,842      1,360,088               463.2   

  Murder nonnegligent manslaughter  Murder nonnegligent manslaughter rate  \
0                           15,586                                    5.5   
1                           16,037                                    5.6   
2                           16,229                                    5.6   
3                           16,528                                    5.7   
4                           16,148                                    5.5   

  Rape\n(revised\ndefinition)  Rape\n(revised\ndefinition)\nrate  \
0                         NaN                                NaN   
1                         NaN                 

In [7]:
df.columns = df.columns.str.strip()

In [8]:
df = df.drop(columns=[
    "Rape (legacy definition)",
    "Rape (legacy definition) rate",
    "Rape (revised definition)",
    "Rape (revised definition) rate"
], errors='ignore')

In [9]:
df = df.dropna()

In [10]:
X = df.drop(columns=['Violent crime rate', 'Year'])  # Year can be dropped if not needed
y = df['Violent crime rate']

In [12]:
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(',', '')
    X[col] = pd.to_numeric(X[col], errors='coerce')

In [13]:
X = X.dropna()
y = y[X.index]  

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [16]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Random Forest - RMSE: {rmse:.2f}, R²: {r2:.2f}")

Random Forest - RMSE: 11.81, R²: -8.92


In [18]:
joblib.dump(model, "crime_rate_model.pkl")

['crime_rate_model.pkl']

In [19]:
joblib.dump(scaler, "crime_scaler.pkl")

['crime_scaler.pkl']