In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler 

In [4]:
data=pd.read_csv('FullData.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Sl.No.,State/UT,Literacy %,Unemployment,Sex Ratio,Greed,Loath,Love,Year,Region,Urbanization,NSDP per capita,Population Density,Homeless prop,Alcohol %,Below poverty line,Total Murders
0,0,1,Andhra Pradesh,70.0,4.7,1001.786755,120.475113,50.339367,87.104072,2016.0,S,0.326723,1.826199,0.316234,1.188334,46.40598,46.603568,257.918552
1,1,2,Assam,79.0,6.9,1040.271327,1796.774194,996.774194,225.806452,2016.0,NE,0.147812,1.604666,0.425648,0.386947,48.420071,99.861653,3019.354839
2,2,3,Bihar,68.5,7.2,1086.736819,1008.77193,335.839599,244.360902,2016.0,E,0.117615,0.220935,1.22357,0.395643,40.752563,351.23518,1588.972431
3,3,4,Chattisgarh,75.0,2.9,1006.931408,55.710306,63.6026,61.74559,2016.0,C,0.250648,2.415419,0.205605,0.871133,36.147755,102.16577,181.058496
4,4,5,Gujarat,79.0,3.3,1096.833908,167.907361,209.263854,167.080232,2016.0,W,0.455098,1.974763,0.335153,2.196505,20.798578,100.8377,544.251447


In [5]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

## Modelling

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [14]:
df_encoded = pd.get_dummies(data, columns=['Region'])

In [15]:
features = df_encoded.drop(['Sl.No.','State/UT','Greed','Loath','Love','Total Murders'],axis=1).columns

In [16]:
X = df_encoded[features]
y = df_encoded['Total Murders']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'SVR': SVR()
}

# Evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Display results
results_df = pd.DataFrame(results).T
print(results_df)

                              MAE           MSE         RMSE        R2
Linear Regression      427.092630  4.800344e+05   692.845146  0.505756
Ridge Regression       425.695299  4.818613e+05   694.162334  0.503875
Lasso Regression       427.397314  4.835330e+05   695.365351  0.502154
ElasticNet Regression  398.767142  5.667003e+05   752.794973  0.416525
Random Forest          183.963073  1.413022e+05   375.901836  0.854515
Gradient Boosting      121.266061  4.826991e+04   219.704146  0.950301
SVR                    594.277046  1.037456e+06  1018.556073 -0.068165
