**NAME : ABHINAV KRISHNA B**

**IMPORT LIBRARIES AND LOAD THE TRAIN AND TEST DATASETS**

In [None]:
import pandas as pd

# Read the training and test splits from the /content/drive/MyDrive folder
train_path = '/content/drive/MyDrive/fraudTrain.csv'
test_path = '/content/drive/MyDrive/fraudTest.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first rows of each split
for split in (train_df, test_df):
    print(split.head())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

**DROP UNNECESSARY COLUMNS FOR PREDICTION**

In [None]:


# Columns excluded from modelling (judged not useful or too high-cardinality);
# the same list is removed from the training and the test split.
columns_to_drop = [
    'Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip',
    'trans_num', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'unix_time',
]
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)




**DATA PREPROCESSING: DATE-TIME FEATURE EXTRACTION FROM YYYY-MM-DD HH:MM:SS TIMESTAMPS**

In [None]:
# Parse the transaction timestamp of each split and derive calendar features from it
for frame in (train_df, test_df):
    parsed = pd.to_datetime(frame['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')
    frame['trans_date_trans_time'] = parsed
    stamp = parsed.dt  # datetime accessor shared by all derived columns below
    frame['hour'] = stamp.hour
    frame['day'] = stamp.day
    frame['day_of_week'] = stamp.dayofweek
    frame['month'] = stamp.month
    frame['quarter'] = stamp.quarter




# Clean the dataframe by dropping unnecessary columns and handle 'merchant' column for both datasets
def clean_df(df):
    """Drop the raw timestamp and ``dob`` columns and strip ``fraud_`` from merchant names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``trans_date_trans_time``, ``dob`` and ``merchant`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two dropped columns and with every literal
        ``'fraud_'`` occurrence removed from ``merchant``. The input frame is
        not modified.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    cleaned = df.drop(columns=['trans_date_trans_time', 'dob'])
    # Vectorised literal replacement (regex=False). Unlike a per-row
    # ``x.replace`` inside ``apply``, a missing merchant stays missing instead
    # of raising AttributeError.
    cleaned['merchant'] = cleaned['merchant'].str.replace('fraud_', '', regex=False)
    return cleaned

# Run the same cleaning step on the training and the test split
train_df, test_df = clean_df(train_df), clean_df(test_df)



**CONVERSION OF CREDIT AMOUNT INTO SEGREGATED INTERVALS USING BINNING**

In [None]:

# Bin the 'amt' column for both datasets.
# The bin edges are derived once from the training amounts and then reused for
# the test split, so a given amount maps to the same bin index in both frames
# (cutting each frame with an integer `bins` would derive different edges from
# each frame's own min/max).
num_bins = 300
_, amt_bin_edges = pd.cut(train_df['amt'], bins=num_bins, right=False, retbins=True)
# Open the outer edges so test amounts outside the training range still fall
# into the first/last bin instead of becoming NaN. Training bin indices are
# unaffected by this.
amt_bin_edges[0] = float('-inf')
amt_bin_edges[-1] = float('inf')
for df in [train_df, test_df]:
    df['amt'] = pd.cut(df['amt'], bins=amt_bin_edges, labels=False, right=False)

**MAP CATEGORICAL ATTRIBUTES TO BINARY FOR PREDICTION**

In [None]:

def map_gender(df):
    """Replace the ``gender`` column with two 0/1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``gender`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``gender_M`` (1 where gender equals ``'M'``) and
        ``gender_F`` (1 where gender equals ``'F'``) appended, in that order,
        and ``gender`` removed. Any other value gets 0 in both columns.
        The input frame is left untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``gender`` column.
    """
    gender = df['gender']
    # ``assign`` builds a new frame, so the caller's frame does not silently
    # gain the indicator columns as a side effect.
    return df.assign(
        gender_M=gender.eq('M').astype('int64'),
        gender_F=gender.eq('F').astype('int64'),
    ).drop(columns=['gender'])

# Encode gender the same way in the training and the test split
train_df, test_df = map_gender(train_df), map_gender(test_df)



**LABEL ENCODING FOR STRING ATTRIBUTES LIKE MERCHANT, JOB AND CATEGORY OF CREDIT**

In [None]:
from sklearn.preprocessing import LabelEncoder
def encode(df, encoders=None):
    """Integer-encode the ``merchant``, ``category`` and ``job`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three string columns. It is modified in place and
        returned.
    encoders : dict, optional
        Mapping of column name -> fitted ``LabelEncoder``. A column that is not
        yet in the mapping is fitted on ``df`` and stored there; a column that
        is already present is reused without refitting. When omitted, fresh
        encoders are fitted on ``df`` for every column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns replaced by int64 codes.
        A label that a reused encoder has not seen is encoded as -1.
    """
    if encoders is None:
        encoders = {}
    for column in ('merchant', 'category', 'job'):
        if column not in encoders:
            encoders[column] = LabelEncoder().fit(df[column])
        # ``classes_`` is sorted, so the categorical code of a label equals the
        # integer LabelEncoder assigns to it; labels outside ``classes_`` map
        # to -1 instead of raising.
        known_labels = encoders[column].classes_
        df[column] = pd.Categorical(df[column], categories=known_labels).codes.astype('int64')
    return df


# Fit the encoders on the training split only and reuse them for the test
# split, so the same merchant/category/job string gets the same code in both
# frames (fitting each split separately gives each its own numbering).
label_encoders = {}
train_df = encode(train_df, label_encoders)
test_df = encode(test_df, label_encoders)

**DATASET AFTER PREPROCESSING**

In [None]:
print(train_df.head())
print(test_df.head())

   merchant  category  amt  job  is_fraud  hour  day  day_of_week  month  \
0       514         8    0  370         0     0    1            1      1   
1       241         4    1  428         0     0    1            1      1   
2       390         0    2  307         0     0    1            1      1   
3       360         2    0  328         0     0    1            1      1   
4       297         9    0  116         0     0    1            1      1   

   quarter  gender_M  gender_F  
0        1         0         1  
1        1         0         1  
2        1         1         0  
3        1         1         0  
4        1         1         0  
   merchant  category  amt  job  is_fraud  hour  day  day_of_week  month  \
0       319        10    0  275         0    12   21            6      6   
1       591        10    0  392         0    12   21            6      6   
2       611         5    0  259         0    12   21            6      6   
3       222         9    0  407         0

**TRAINING AND TESTING SPLIT OF DATASET**

In [None]:
from sklearn.model_selection import train_test_split

# Training split: the 'is_fraud' label and the remaining feature columns
y_train = train_df['is_fraud']
X_train = train_df.drop(columns=['is_fraud'])

# Test split: same separation of label and features
y_test = test_df['is_fraud']
X_test = test_df.drop(columns=['is_fraud'])


**NORMALIZATION FOR BETTER ACCURACY**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


**MODEL TRAINING AND TESTING PHASE: LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a logistic-regression classifier with default settings on the training split
lr_model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the test split
y_pred_lr = lr_model.predict(X_test)

# Report the confusion matrix, per-class metrics and overall accuracy
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", lr_accuracy)


Logistic Regression Results:
[[553059    515]
 [  2145      0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Accuracy: 0.995213408215303


**DECISION TREE CLASSIFIER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree model.
# A fixed random_state pins the tree's internal randomness so the fitted tree,
# and therefore the metrics printed below, are the same on every run.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions on the test split
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model: confusion matrix, per-class metrics and overall accuracy
print("Decision Tree Results:")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Results:
[[550430   3144]
 [  1101   1044]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.25      0.49      0.33      2145

    accuracy                           0.99    555719
   macro avg       0.62      0.74      0.66    555719
weighted avg       1.00      0.99      0.99    555719

Accuracy: 0.9923612473210381


**RANDOM FOREST CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest model.
# A fixed random_state pins the bootstrap sampling and per-split feature
# selection so the forest, and therefore the metrics printed below, are the
# same on every run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test split
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model: confusion matrix, per-class metrics and overall accuracy
print("Random Forest Results:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Results:
[[552437   1137]
 [  1120   1025]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.47      0.48      0.48      2145

    accuracy                           1.00    555719
   macro avg       0.74      0.74      0.74    555719
weighted avg       1.00      1.00      1.00    555719

Accuracy: 0.9959385948653906


**SAVE THE TRUE AND PREDICTED FRAUD LABELS TO A CSV FILE**

In [None]:
# Pair the true labels with the Random Forest predictions in a separate frame.
# test_df itself is left untouched: adding the prediction column to it would
# make a re-run of the feature/target split treat 'is_fraud_pred' as a feature.
predictions_df = test_df[['is_fraud']].assign(is_fraud_pred=y_pred_rf)

# Save to CSV (columns: is_fraud, is_fraud_pred; no index column)
predictions_df.to_csv('creditcard_test_predictions.csv', index=False)
