### Perform pre-processing on any real-time dataset from any of the repositories.

In [1]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Location of the raw CSV, relative to the notebook's working directory
dataset_path = 'Dataset/cyber_crimes.csv'

# Read the CSV; pd.read_csv already returns a DataFrame
dataset = pd.read_csv(dataset_path)

# Work on an independent copy so that `dataset` keeps the values exactly as
# loaded, whatever is later done to `df`. Re-wrapping with pd.DataFrame(dataset)
# does not copy by default, so the two names could share underlying data.
df = dataset.copy()
df.head()

Unnamed: 0,S. No,Category,State/UT,2016,2017,2018,Percentage Share of State/UT (2018),Mid-Year Projected Population (in Lakhs) (2018)+,Rate of Total Cyber Crimes (2018)++
0,1,State,Andhra Pradesh,616,931,1207,4.4,520.3,2.3
1,2,State,Arunachal Pradesh,4,1,7,0.0,14.9,0.5
2,3,State,Assam,696,1120,2022,7.4,340.4,5.9
3,4,State,Bihar,309,433,374,1.4,1183.3,0.3
4,5,State,Chhattisgarh,90,171,139,0.5,284.7,0.5


In [3]:
# Count the null entries in each column; the Series is left as the cell's
# last expression so the notebook renders it below the heading
print('Missing Values:')
missing_values = df.isna().sum()
missing_values

Missing Values:


S. No                                               0
Category                                            0
State/UT                                            0
2016                                                0
2017                                                0
2018                                                0
Percentage Share of State/UT (2018)                 0
Mid-Year Projected Population (in Lakhs) (2018)+    0
Rate of Total Cyber Crimes (2018)++                 0
dtype: int64

In [26]:
# Categorical features: the columns whose dtype is the generic object dtype
categorical_feat = [name for name, kind in df.dtypes.items() if kind == 'O']
print('Total categorical features: ', len(categorical_feat))
print('\n', categorical_feat)

Total categorical features:  2

 ['Category', 'State/UT']


In [27]:
# Encoding categorical data
# Every object-dtype column is overwritten in place with integer codes. A new
# LabelEncoder is fitted per column, so a code is only meaningful within the
# column it came from.
# NOTE(review): integer codes give the categories a numeric order that a
# downstream model may treat as meaningful - confirm this is intended.
for f in df.columns:
    if df[f].dtype != 'object':
        continue  # numeric columns are left untouched
    label_encoder = LabelEncoder()
    df[f] = label_encoder.fit_transform(list(df[f].values))
df.head()

Unnamed: 0,S. No,Category,State/UT,2016,2017,2018,Percentage Share of State/UT (2018),Mid-Year Projected Population (in Lakhs) (2018)+,Rate of Total Cyber Crimes (2018)++
0,1,0,1,616,931,1207,4.4,520.3,2.3
1,2,0,2,4,1,7,0.0,14.9,0.5
2,3,0,3,696,1120,2022,7.4,340.4,5.9
3,4,0,4,309,433,374,1.4,1183.3,0.3
4,5,0,6,90,171,139,0.5,284.7,0.5


In [28]:
# Separate the label column from the predictors, then hold out 20% of the rows
# for testing; the fixed random_state makes the shuffle repeatable
target = df['Rate of Total Cyber Crimes (2018)++']
features = df.drop(columns='Rate of Total Cyber Crimes (2018)++')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Preview the training rows with their labels attached. Features and labels
# come out of the split with the same row index, so joining on that index
# pairs each row with its own label.
train_set = X_train.join(y_train)
print("\n\nTraining Set:")
print(train_set.head(10))

# Same preview for the held-out rows
test_set = X_test.join(y_test)
print("\nTest Set:")
print(test_set.head(10))



Training Set:
    S. No  Category  State/UT  2016  2017  2018  \
8       9         0        13    31    56    69   
17     18         0        23     1    10     6   
9      10         0        14    28    63    73   
34     35         1        18     0     0     4   
0       1         0         1   616   931  1207   
4       5         0         6    90   171   139   
29     30         1         0     3     3     7   
15     16         0        21    11    74    29   
19     20         0        25   317   824   843   
5       6         0        10    31    13    29   

    Percentage Share of State/UT (2018)  \
8                                   0.3   
17                                  0.0   
9                                   0.3   
34                                  0.0   
0                                   4.4   
4                                   0.5   
29                                  0.0   
15                                  0.1   
19                                 

In [34]:
# Feature scaling
# The scaler is fitted on the training rows only; the statistics it learned
# there are then applied unchanged to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
# Displaying the scaled training set
# The scaled features are a plain array, so the frame rebuilt from them gets a
# fresh 0..n-1 index, while y_train still carries the shuffled row labels of
# the original frame. Assigning the Series itself would align on those labels
# and attach the wrong target (or NaN) to each row; assigning its raw values
# keeps the pairing positional, which matches the row order of X_train.
train_set_scaled = pd.DataFrame(X_train_scaled, columns=features.columns)
train_set_scaled['Rate of Total Cyber Crimes (2018)++'] = y_train.to_numpy()
print("\n\nScaled Training Set:")
print(train_set_scaled.head(10))



Scaled Training Set:
      S. No  Category  State/UT      2016      2017      2018  \
0 -0.761687 -0.408248 -0.366286 -0.535204 -0.534578 -0.478329   
1  0.102931 -0.408248  0.574632 -0.597129 -0.587275 -0.528751   
2 -0.665618 -0.408248 -0.272194 -0.541397 -0.526559 -0.475127   
3  1.736097  2.449490  0.104173 -0.599193 -0.598731 -0.530352   
4 -1.530236 -0.408248 -1.495387  0.672323  0.467807  0.432480   
5 -1.145961 -0.408248 -1.024928 -0.413420 -0.402836 -0.422304   
6  1.255754  2.449490 -1.589478 -0.593001 -0.595294 -0.527951   
7 -0.089207 -0.408248  0.386448 -0.576487 -0.513958 -0.510343   
8  0.295068 -0.408248  0.762815  0.055142  0.345230  0.141149   
9 -1.049893 -0.408248 -0.648561 -0.535204 -0.583839 -0.510343   

   Percentage Share of State/UT (2018)  \
0                            -0.467742   
1                            -0.533225   
2                            -0.467742   
3                            -0.533225   
4                             0.427204   
5        

In [36]:
# Displaying the scaled test set
# The scaled features are a plain array, so the frame rebuilt from them gets a
# fresh 0..n-1 index, while y_test still carries the shuffled row labels of
# the original frame. Assigning the Series itself would align on those labels
# and attach the wrong target (or NaN) to each row; assigning its raw values
# keeps the pairing positional, which matches the row order of X_test.
test_set_scaled = pd.DataFrame(X_test_scaled, columns=features.columns)
test_set_scaled['Rate of Total Cyber Crimes (2018)++'] = y_test.to_numpy()
print("\nScaled Test Set:")
print(test_set_scaled.head(10))


Scaled Test Set:
      S. No  Category  State/UT      2016      2017      2018  \
0  1.832165  2.449490  0.856907 -0.595065 -0.593003 -0.522348   
1 -0.281344 -0.408248  0.198265 -0.066643 -0.037395  0.058712   
2  0.967548 -0.408248  1.515549  4.848097  5.095965  4.492700   
3  1.351822  2.449490 -1.119020 -0.545525 -0.562072 -0.509543   
4  0.006862 -0.408248  0.480540 -0.518691 -0.554053 -0.474327   
5  1.447891  2.449490 -0.930836 -0.597129 -0.597586 -0.533554   
6  0.487205 -0.408248  1.045090  1.343172  0.895110  0.350043   
7 -0.377412 -0.408248  0.010081 -0.015039 -0.232144 -0.261431   

   Percentage Share of State/UT (2018)  \
0                            -0.511397   
1                             0.056129   
2                             4.487201   
3                            -0.511397   
4                            -0.467742   
5                            -0.533225   
6                             0.361720   
7                            -0.271290   

   Mid-Year Proje