In [437]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from joblib import dump

In [438]:
# Load the full CSV, including the 'Likelihood' column that a later cell splits
# off as the prediction target, and preview the first rows.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,criminal_id,Gender,Age,Family_record,Fin_status,Education,Population,Likelihood,violent,non_violent
0,1,Male,57,Unknown,Middle Class,Post Graduate,Urban,Less Likely,0,2
1,2,Male,24,Yes,Lower Class,School,Urban,Very Likely,2,1
2,3,Male,25,Yes,Middle Class,Graduate,Urban,Very Likely,2,3
3,4,Male,31,No,Below Poverty,School dropout,Rural,Neutral,1,2
4,5,Male,47,No,Middle Class,Graduate,Urban,Neutral,0,3


In [439]:
# Target labels: the 'Likelihood' column with the row labelled 0 removed.
# drop() returns a new Series, so the column held by `data` is left untouched.
# The feature frame built further down drops the same row, which keeps the
# labels and the features on identical index values.
# NOTE(review): row 0 looks like an ordinary record in data.head() — confirm
# that discarding it is intentional.
likelihood = data['Likelihood'].drop(index=0)
likelihood

1      Very Likely
2      Very Likely
3          Neutral
4          Neutral
5      Less Likely
          ...     
96     Less Likely
97     Less Likely
98     Less Likely
99     Less Likely
100    Less Likely
Name: Likelihood, Length: 100, dtype: object

In [440]:
# Reload only the feature columns: every CSV column except the 'Likelihood'
# target. Selecting by name instead of by position keeps the selection correct
# if the column order of the file ever changes.
data = pd.read_csv('data.csv', usecols=lambda column: column != 'Likelihood')

In [441]:
# Work on an independent copy so that later edits to `df` cannot affect `data`.
df = data.copy()
df.head()

Unnamed: 0,criminal_id,Gender,Age,Family_record,Fin_status,Education,Population,violent,non_violent
0,1,Male,57,Unknown,Middle Class,Post Graduate,Urban,0,2
1,2,Male,24,Yes,Lower Class,School,Urban,2,1
2,3,Male,25,Yes,Middle Class,Graduate,Urban,2,3
3,4,Male,31,No,Below Poverty,School dropout,Rural,1,2
4,5,Male,47,No,Middle Class,Graduate,Urban,0,3


In [442]:
# Clean the feature frame in a single non-mutating chain:
#   - drop the row labelled 0 (the same row removed from `likelihood`),
#   - drop the 'criminal_id' identifier column,
#   - strip surrounding whitespace from the 'Education' labels so that
#     otherwise-identical categories are not encoded as different classes.
df = (
    df.drop(index=0, columns=['criminal_id'])
      .assign(Education=lambda frame: frame['Education'].str.strip())
)

In [443]:
# Integer-encode every categorical feature and print each value -> code mapping.
# The column order matches the printed output and leaves `label_encoder` and
# `mapping` holding the 'Fin_status' encoding when the cell finishes.
categorical_columns = ['Gender', 'Education', 'Population', 'Family_record', 'Fin_status']

# One fitted encoder per column, so the mappings stay available after this cell
# (for example to encode new records the same way before calling predict).
encoders = {}

for column in categorical_columns:
    label_encoder = preprocessing.LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    encoders[column] = label_encoder

    mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    for value, encoded_number in mapping.items():
        print(f"'{value}': {encoded_number}")

'Female': 0
'Male': 1
'Graduate': 0
'Illiterate': 1
'Post Graduate': 2
'School': 3
'School dropout': 4
'Rural': 0
'Urban': 1
'No': 0
'Unknown': 1
'Yes': 2
'Below Poverty': 0
'Lower Class': 1
'Middle Class': 2
'Upper Class': 3


In [444]:
# Show the value -> code table of `label_encoder` again, i.e. the encoding of
# whichever column it was fitted on most recently.
classes = label_encoder.classes_
codes = label_encoder.transform(classes)
mapping = dict(zip(classes, codes))
for value, encoded_number in mapping.items():
    print(f"'{value}', {encoded_number}")

'Below Poverty', 0
'Lower Class', 1
'Middle Class', 2
'Upper Class', 3


In [445]:
# Feature frame dimensions as (rows, columns). The row count has to match
# likelihood.shape because both are passed together to train_test_split.
df.shape

(100, 8)

In [446]:
# Number of target labels; must equal the number of rows in df.
likelihood.shape

(100,)

In [447]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    likelihood,
    test_size=0.2,
    random_state=42,
)


In [448]:
# NOTE(review): StandardScaler is imported here rather than in the import cell
# at the top, and it is not used in the cells shown: the scaling steps below are
# commented out, so the models are trained on the unscaled features.
from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#x_train = sc.fit_transform(x_train)
#x_test = sc.transform(x_test)

In [449]:
# Size of the held-out label set produced by the test_size=0.2 split.
y_test.shape

(20,)

In [450]:
# Preview the encoded training features; the row labels are the index values
# the rows had in df before the split.
x_train.head()

Unnamed: 0,Gender,Age,Family_record,Fin_status,Education,Population,violent,non_violent
56,1,38,0,0,4,0,1,0
89,1,39,0,0,3,0,1,1
27,1,34,0,2,3,0,1,0
43,1,33,0,1,3,0,0,1
70,1,28,0,0,4,1,2,1


In [451]:
# Linear baseline model. random_state is fixed because the classifier shuffles
# the training data while fitting; without a seed the fitted model, and the
# accuracy reported below, changes from run to run.
pac = PassiveAggressiveClassifier(random_state=42)
# Fitted on plain arrays (.values), so the model expects plain arrays in the
# column order of x_train at prediction time.
pac.fit(x_train.values, y_train.values)

In [452]:
# Feature names in column order. pac is fitted on x_train.values (no column
# names), so hand-built rows passed to pac.predict must follow this order.
df.columns

Index(['Gender', 'Age', 'Family_record', 'Fin_status', 'Education',
       'Population', 'violent', 'non_violent'],
      dtype='object')

In [453]:
# Same preview as the earlier x_train.head() cell; x_train is not reassigned
# in between, so the rows shown are unchanged.
x_train.head()

Unnamed: 0,Gender,Age,Family_record,Fin_status,Education,Population,violent,non_violent
56,1,38,0,0,4,0,1,0
89,1,39,0,0,3,0,1,1
27,1,34,0,2,3,0,1,0
43,1,33,0,1,3,0,0,1
70,1,28,0,0,4,1,2,1


In [454]:
# Evaluate the PassiveAggressiveClassifier on the held-out rows.
# pac was fitted on x_train.values (a plain array), so it is given a plain array
# here as well; passing the DataFrame makes scikit-learn warn that the input has
# feature names the model was not fitted with.
y_pred = pac.predict(x_test.values)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 45.0%




In [455]:
# Small random forest (3 trees) for comparison with the linear baseline.
# random_state is fixed because the forest relies on random sampling while
# building its trees; without a seed the reported accuracy differs between runs.
classifier2 = RandomForestClassifier(n_estimators=3, random_state=42)
classifier2.fit(x_train, y_train)

In [456]:
# Score the random forest on the held-out rows.
# Note: this overwrites the `score` computed for pac in the earlier cell.
pred2 = classifier2.predict(x_test)
score = accuracy_score(y_true=y_test, y_pred=pred2)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 75.0%


In [457]:
# Persist the fitted PassiveAggressiveClassifier to 'model.joblib'.
# NOTE(review): in the recorded run pac scored 45.0% while classifier2 scored
# 75.0% — confirm that pac is the model meant to be saved.
dump(pac,'model.joblib')

['model.joblib']

In [458]:
# Smoke-test the model on one hand-built record. Values follow the training
# column order and the encodings printed by the label-encoding cell:
#   Gender=1 (Male), Age=21, Family_record=0 (No), Fin_status=2 (Middle Class),
#   Education=0 (Graduate), Population=0 (Rural), violent=8, non_violent=1
sample = [1, 21, 0, 2, 0, 0, 8, 1]
pac.predict([sample])

array(['Very Likely'], dtype='<U11')