In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
    
from sklearn.preprocessing import LabelEncoder  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics


In [32]:
# Load the crime records from the Excel workbook; the relative path is
# resolved against the notebook's working directory.
data = pd.read_excel('crime.xlsx')

In [33]:
# Preview the first rows to check the column names and typical values.
data.head()

Unnamed: 0,State,Year,Gender,Location,Victims_Above_50_Yrs,Victims_Total,Victims_Upto_10_15_Yrs,Victims_Upto_10_Yrs,Victims_Upto_15_18_Yrs,Victims_Upto_18_30_Yrs,Victims_Upto_30_50_Yrs
0,Delhi,2010,Female Victims,Hauz khas,14,152,2,12,9,74,41
1,Delhi,2011,Female Victims,Vasant vihar,14,130,3,6,2,69,36
2,Delhi,2012,Female Victims,Vasant kunj,23,131,6,8,7,44,43
3,Delhi,2013,Female Victims,Palam,16,125,7,14,7,51,30
4,Delhi,2014,Female Victims,Dwarka,15,158,3,7,3,87,43


In [34]:
# (rows, columns) of the loaded frame.
data.shape

(78, 11)

In [35]:
# Count the missing values in each column.
data.isna().sum()

State                     0
Year                      0
Gender                    0
Location                  0
Victims_Above_50_Yrs      0
Victims_Total             0
Victims_Upto_10_15_Yrs    0
Victims_Upto_10_Yrs       0
Victims_Upto_15_18_Yrs    0
Victims_Upto_18_30_Yrs    0
Victims_Upto_30_50_Yrs    0
dtype: int64

In [36]:
# Summary statistics; by default describe() reports only the numeric columns.
data.describe()

Unnamed: 0,Year,Victims_Above_50_Yrs,Victims_Total,Victims_Upto_10_15_Yrs,Victims_Upto_15_18_Yrs,Victims_Upto_18_30_Yrs,Victims_Upto_30_50_Yrs
count,78.0,78.0,78.0,78.0,78.0,78.0,78.0
mean,2014.384615,15.038462,189.448718,3.0,5.410256,89.051282,72.25641
std,2.834073,19.976247,242.007118,4.321496,8.356405,111.180665,103.824582
min,2010.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,2012.0,2.0,14.25,0.0,0.0,5.25,5.25
50%,2014.0,6.0,75.5,1.0,1.0,29.0,24.0
75%,2017.0,23.0,351.5,5.0,8.0,135.25,99.25
max,2019.0,100.0,831.0,20.0,47.0,367.0,381.0


In [37]:
# Column dtypes and non-null counts.
# NOTE(review): in the recorded run Victims_Upto_10_Yrs is reported as `object`
# while the other victim-count columns are int64 — it presumably holds some
# non-numeric entries; verify and clean it before using it numerically.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   State                   78 non-null     object
 1   Year                    78 non-null     int64 
 2   Gender                  78 non-null     object
 3   Location                78 non-null     object
 4   Victims_Above_50_Yrs    78 non-null     int64 
 5   Victims_Total           78 non-null     int64 
 6   Victims_Upto_10_15_Yrs  78 non-null     int64 
 7   Victims_Upto_10_Yrs     78 non-null     object
 8   Victims_Upto_15_18_Yrs  78 non-null     int64 
 9   Victims_Upto_18_30_Yrs  78 non-null     int64 
 10  Victims_Upto_30_50_Yrs  78 non-null     int64 
dtypes: int64(7), object(4)
memory usage: 6.8+ KB


In [38]:
# Keep only the columns used for modelling, exposing the victim total under
# the name Total_Cases.
dataset = (
    data[['State', 'Year', 'Victims_Total']]
    .rename(columns={'Victims_Total': 'Total_Cases'})
)

In [39]:
# Confirm the reduced frame has the State, Year and Total_Cases columns.
dataset.head()

Unnamed: 0,State,Year,Total_Cases
0,Delhi,2010,152
1,Delhi,2011,130
2,Delhi,2012,131
3,Delhi,2013,125
4,Delhi,2014,158


In [40]:
# Number of rows recorded for each state.
data['State'].value_counts()

Chandigarh    20
Goa           20
Delhi         19
Haryana       19
Name: State, dtype: int64

In [41]:
# The most frequent locations (top of the descending count list).
data['Location'].value_counts().head()

Sonipat            2
Hisar              2
sector 17          2
sector 8           2
Calangute Beach    2
Name: Location, dtype: int64

In [42]:
# Distinct state names present in the modelling frame.
dataset['State'].unique()

array(['Delhi', 'Haryana', 'Chandigarh', 'Goa'], dtype=object)

In [43]:
# Distinct years covered by the modelling frame.
dataset['Year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [44]:
# Encoder used to turn the state names into integer codes.
le = LabelEncoder()

In [45]:
# Echo the encoder object to confirm it was created.
le

LabelEncoder()

In [46]:
# Encode the state names as integer codes (LabelEncoder numbers the classes in
# sorted order). Assigning by column label replaces the column outright, so it
# takes the encoder's integer dtype; writing through `.iloc[:, 0]` can update
# the existing object-dtype column in place and leave it as `object`.
# NOTE(review): these codes are arbitrary, yet the linear model fitted later
# treats them as an ordered numeric feature — consider one-hot encoding.
dataset['State'] = le.fit_transform(dataset['State'])

In [47]:
# Check that State now holds integer codes instead of names.
dataset.head()

Unnamed: 0,State,Year,Total_Cases
0,1,2010,152
1,1,2011,130
2,1,2012,131
3,1,2013,125
4,1,2014,158


In [48]:
# Inspect a random sample of rows; the fixed seed keeps the displayed rows the
# same on every run instead of changing with each execution.
dataset.sample(10, random_state=2)

Unnamed: 0,State,Year,Total_Cases
77,2,2019,28
0,1,2010,152
72,2,2014,35
32,3,2014,669
52,2,2014,14
43,0,2015,4
14,3,2015,156
29,3,2011,655
75,2,2017,33
2,1,2012,131


In [49]:
# Features are every column except the last; the last column is the target.
x = dataset[dataset.columns[:-1]]
y = dataset[dataset.columns[-1]]

In [50]:
# Preview the feature matrix (State code and Year).
x.head()

Unnamed: 0,State,Year
0,1,2010
1,1,2011
2,1,2012
3,1,2013
4,1,2014


In [51]:
# Preview the target series (Total_Cases).
y.head()

0    152
1    130
2    131
3    125
4    158
Name: Total_Cases, dtype: int64

In [52]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [53]:
# Ordinary least-squares regressor with default settings.
# NOTE(review): `re` is also the name of the standard-library regex module;
# a more descriptive name would avoid shadowing it if that module is imported.
re = LinearRegression()

In [54]:
# Fit the regressor on the training split only.
re.fit(x_train, y_train)

LinearRegression()

In [55]:
# Predict Total_Cases for the held-out rows.
y_pred = re.predict(x_test)

In [56]:
# Show the raw predictions; in the recorded run one of them is negative, which
# is not a meaningful case count.
y_pred

array([312.36218098,  45.27727569, 173.00847331, 348.08793101,
       265.01392091, 229.28817089, 155.1455983 ,  45.27727569,
        -8.31134935, 137.28272329, 357.01936851,   9.55152567,
       101.55697327,  54.20871319, 119.41984828, 247.1510459 ])

In [57]:
# Compare the held-out targets with the model's predictions side by side.
# y_test keeps its original row labels, so the frame is indexed by them.
pd.DataFrame(
    {
        "Actual": y_test,
        "Predicted": y_pred,
    }
).head()

Unnamed: 0,Actual,Predicteed
28,675,312.362181
65,11,45.277276
27,458,173.008473
13,143,348.087931
75,33,265.013921


In [58]:
# Mean absolute error on the held-out rows, in the same units as Total_Cases.
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

168.9006737111472

In [59]:
# Coefficient of determination (R^2) on the held-out rows.
metrics.r2_score(y_true=y_test, y_pred=y_pred)

0.038442167671553173