### Importing Libraries

In [195]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, confusion_matrix

### Loading Dataset

In [169]:
# Read the labelled training set from the CSV that sits next to the notebook.
train_data = pd.read_csv(filepath_or_buffer="Training Data.csv")

In [170]:
# Display the raw training frame; the rendered output is truncated with an
# ellipsis row, so only the first and last few rows are visible.
train_data

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,251996,8154883,43,13,single,rented,no,Surgeon,Kolkata,West_Bengal,6,11,0
251996,251997,2843572,26,10,single,rented,no,Army_officer,Rewa,Madhya_Pradesh,6,11,0
251997,251998,4522448,46,7,single,rented,no,Design_Engineer,Kalyan-Dombivli,Maharashtra,7,12,0
251998,251999,6507128,45,0,single,rented,no,Graphic_Designer,Pondicherry,Puducherry,0,10,0


In [171]:
# Read the test set; per the preview below it has an `ID` column and no `Risk_Flag`.
test_data = pd.read_csv(filepath_or_buffer="Test Data.csv")

In [172]:
# Display the raw test frame (output is truncated to the head and tail rows).
test_data

Unnamed: 0,ID,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
0,1,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,4,13
1,2,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,5,10
2,3,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,9,14
3,4,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,3,12
4,5,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,13,11
...,...,...,...,...,...,...,...,...,...,...,...,...
27995,27996,9955481,57,13,single,rented,no,Statistician,Eluru[25],Andhra Pradesh,5,10
27996,27997,2917765,47,9,single,rented,no,Technical writer,Ratlam,Madhya Pradesh,9,14
27997,27998,8082415,24,5,single,rented,no,Lawyer,Mira-Bhayandar,Maharashtra,4,13
27998,27999,9474180,51,13,single,rented,yes,Chartered Accountant,Bhilai,Chhattisgarh,13,14


In [173]:
# Read the sample prediction file (columns `id` and `risk_flag` per the preview below).
sample_data = pd.read_csv(filepath_or_buffer="Sample Prediction Dataset.csv")

In [174]:
# Display the sample prediction frame (output is truncated to the head and tail rows).
sample_data

Unnamed: 0,id,risk_flag
0,1,0
1,2,0
2,3,1
3,4,0
4,5,0
...,...,...
27995,27996,0
27996,27997,1
27997,27998,0
27998,27999,0


### Cleaning the dataset

In [175]:
# Cleaning Training Data
# Profession: underscores become spaces (e.g. "Mechanical_engineer").
train_data['Profession'] = train_data['Profession'].str.replace('_', ' ')

# CITY / STATE get the same two-step normalisation:
#   1. drop bracketed suffixes such as "[10]" (non-greedy match up to the first "]"),
#   2. turn hyphens and underscores into spaces.
for place_col in ('CITY', 'STATE'):
    train_data[place_col] = (
        train_data[place_col]
        .str.replace(r'\[.*?\]', '', regex=True)
        .str.replace(r'[-_]', ' ', regex=True)
    )

In [176]:
# Cleaning Test Data
# CITY / STATE: strip bracketed suffixes such as "[25]", then replace hyphens
# and underscores with spaces.
# NOTE(review): unlike the training cell, Profession is not normalised here; the
# previewed test rows already use spaces, but confirm that holds for every row.
for place_col in ('CITY', 'STATE'):
    test_data[place_col] = (
        test_data[place_col]
        .str.replace(r'\[.*?\]', '', regex=True)
        .str.replace(r'[-_]', ' ', regex=True)
    )

# Align the identifier column name with the training frame ('ID' -> 'Id').
test_data = test_data.rename(columns={'ID': 'Id'})

### Combining training and test data

In [177]:
# Add a Risk_Flag column to the test frame, taken from the sample prediction
# file. Values are aligned on the row index of the two frames, not on the id.
# NOTE(review): these labels come from "Sample Prediction Dataset.csv"; they are
# presumably placeholder predictions rather than ground truth. Confirm before
# trusting any metric computed against them.
test_data = test_data.assign(Risk_Flag=sample_data['risk_flag'])

In [178]:
# Stack the training rows on top of the test rows (row-wise concatenation is the
# default). Original index labels are kept, so labels repeat across the two parts.
combined_df = pd.concat([train_data, test_data])

In [179]:
# Integer-encode the categorical columns on the combined frame so that training
# and test rows share one code mapping per column. Each fitted encoder is kept in
# `encoder_dict` under its column name.
cols_to_encode = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
encoder_dict = {name: LabelEncoder() for name in cols_to_encode}

for col, encoder in encoder_dict.items():
    combined_df[col] = encoder.fit_transform(combined_df[col])

In [180]:
# Display the combined frame after encoding; the categorical columns now hold
# integer codes (output is truncated to the head and tail rows).
combined_df

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,1,2,0,33,250,13,3,13,0
1,2,7574516,40,10,1,2,0,43,226,14,9,13,0
2,3,3991815,66,4,0,2,0,47,8,12,4,10,0
3,4,6256451,41,2,1,2,1,43,53,17,2,12,1
4,5,5768871,47,11,1,2,0,11,295,22,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,27996,9955481,57,13,1,2,0,44,89,0,5,10,0
27996,27997,2917765,47,9,1,2,0,47,248,13,9,14,1
27997,27998,8082415,24,5,1,2,0,30,189,14,4,13,0
27998,27999,9474180,51,13,1,2,1,7,46,4,13,14,0


### Splitting the combined data back into train and test datasets

In [181]:
# The training rows were concatenated first, so the first `rows_in_train_data`
# positions of `combined_df` are the training part and the rest are the test part.
rows_in_train_data = train_data.shape[0]
train_data, test_data = (
    combined_df.iloc[:rows_in_train_data],
    combined_df.iloc[rows_in_train_data:],
)

In [182]:
# Columns that are not model inputs: the target itself and the row identifier.
non_feature_cols = ['Risk_Flag', 'Id']

# Features
X_train = train_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)

# Targets
y_train = train_data['Risk_Flag']
y_test = test_data['Risk_Flag']

In [183]:
# Print the column names, non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252000 entries, 0 to 251999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   Income             252000 non-null  int64
 1   Age                252000 non-null  int64
 2   Experience         252000 non-null  int64
 3   Married/Single     252000 non-null  int64
 4   House_Ownership    252000 non-null  int64
 5   Car_Ownership      252000 non-null  int64
 6   Profession         252000 non-null  int64
 7   CITY               252000 non-null  int64
 8   STATE              252000 non-null  int64
 9   CURRENT_JOB_YRS    252000 non-null  int64
 10  CURRENT_HOUSE_YRS  252000 non-null  int64
dtypes: int64(11)
memory usage: 23.1 MB


In [184]:
# Fit a logistic-regression classifier with scikit-learn's default settings
# (`fit` returns the estimator itself, so the call can be chained).
# NOTE(review): the features are passed in unscaled and the categorical columns
# are integer codes; the confusion matrix at the end of this notebook has no
# predictions of class 1. Consider scaling / class weighting and re-evaluating.
model = LogisticRegression().fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

### Accuracy

In [185]:
# Share of test rows whose predicted label equals the Risk_Flag value in `y_test`.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8716785714285714


### Confusion Matrix

In [196]:
# Build the confusion matrix of true vs. predicted labels.
# The result is stored under its own name on purpose: assigning it to
# `confusion_matrix` would shadow the function imported from sklearn.metrics,
# and re-running this cell would then fail with
# "TypeError: 'numpy.ndarray' object is not callable" until the import cell is
# executed again.
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[24407,     0],
       [ 3593,     0]])