<a href="https://colab.research.google.com/github/konduruchandra/Customer-Transaction-Prediction/blob/main/Customer_Trans_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score,StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the data: read the zipped CSV, treating a literal '?' as a missing value.
df = pd.read_csv(
    filepath_or_buffer="/content/PRCP-1003-CustTransPred.zip",
    na_values='?',
)
df

NameError: name 'pd' is not defined

In [None]:
df.head()

Unnamed: 0,1,0,3,"Braund, Mr. Owen Harris",male,22,1.1,0.1,A/5 21171,7.25,Unnamed: 10,S
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
3,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
4,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   1                        890 non-null    int64  
 1   0                        890 non-null    int64  
 2   3                        890 non-null    int64  
 3   Braund, Mr. Owen Harris  890 non-null    object 
 4   male                     890 non-null    object 
 5   22                       713 non-null    float64
 6   1.1                      890 non-null    int64  
 7   0.1                      890 non-null    int64  
 8   A/5 21171                890 non-null    object 
 9   7.25                     890 non-null    float64
 10  Unnamed: 10              204 non-null    object 
 11  S                        888 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [None]:
print(df.isnull().sum())

1                            0
0                            0
3                            0
Braund, Mr. Owen Harris      0
male                         0
22                         177
1.1                          0
0.1                          0
A/5 21171                    0
7.25                         0
Unnamed: 10                686
S                            2
dtype: int64


In [None]:
print("Missing Values:\n",df.isnull().sum().sum())

Missing Values:
 865


In [None]:
# Data overview: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns, with the quartiles spelled out explicitly.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

                1           0           3          22         1.1         0.1  \
count  890.000000  890.000000  890.000000  713.000000  890.000000  890.000000   
mean   446.500000    0.384270    2.307865   29.709916    0.522472    0.382022   
std    257.065167    0.486696    0.836220   14.533827    1.103247    0.806409   
min      2.000000    0.000000    1.000000    0.420000    0.000000    0.000000   
25%    224.250000    0.000000    2.000000   20.000000    0.000000    0.000000   
50%    446.500000    0.000000    3.000000   28.000000    0.000000    0.000000   
75%    668.750000    1.000000    3.000000   38.000000    1.000000    0.000000   
max    891.000000    1.000000    3.000000   80.000000    8.000000    6.000000   

             7.25  
count  890.000000  
mean    32.232246  
std     49.714317  
min      0.000000  
25%      7.925000  
50%     14.454200  
75%     31.000000  
max    512.329200  


In [None]:
df.shape

(890, 12)

In [None]:
df['target'].value_counts(normalize=True)


KeyError: 'target'

In [None]:
# Bar chart of how many rows fall into each target class; the title and axis
# labels are set on the Axes that seaborn draws into.
ax = sns.countplot(data=df, x="target")
ax.set_title("Target Class Distribution", fontweight="bold", color="Darkblue")
ax.set_xlabel("Targets - 0 & 1", size='12')
ax.set_ylabel("Number of Counts", size='12')
plt.tight_layout()

In [None]:
# Feature matrix: every column except the row identifier and the label.
X = df.drop(columns=['ID_code', 'target'])
X


In [None]:
# Label vector: the 'target' column.
y = df.loc[:, 'target']
y

In [None]:
# Standardise the feature columns: learn the scaling statistics from X, then
# apply them to X.
scaler = StandardScaler()
x_scaled = scaler.fit(X).transform(X)
x_scaled

In [None]:
X_train,X_test,y_train,y_test = train_test_split(x_scaled, y, test_size=0.3, stratify=y, random_state=42)


In [None]:
#Model Evaluation
# Candidate classifiers, compared by the mean ROC AUC of a 5-fold stratified
# cross-validation on the training split.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded so the reported score does not drift between runs.
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM' : SVC(probability=True),
    # Keyword names corrected (were `use_label_enoder` / `eval_metrics`); the
    # misspelt versions were never applied as the intended settings.
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Mean cross-validated AUC per model name.
results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
    results[name] = scores.mean()
    print(f"{name} AUC: {results[name]:.3f}")


In [None]:
# Hyperparameter tuning: exhaustive search over a small XGBoost grid, scored
# by ROC AUC with 3-fold cross-validation on the training split.
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.01, 0.1],
)

grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='roc_auc', cv=3)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

# Keep the best-scoring estimator for evaluation and prediction.
best_model = grid.best_estimator_

In [None]:
# Score the tuned model on the held-out split: positive-class probabilities
# feed the ROC AUC, hard labels feed accuracy, the report and the matrix.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nROC AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap with integer cell labels.
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='d', cbar=True)

In [None]:
# Tabulate the cross-validated scores, highest mean AUC first.
model_df = pd.DataFrame(
    {'Model': list(results.keys()), 'CV AUC Score': list(results.values())}
).sort_values(by='CV AUC Score', ascending=False)
print("\nModel Comparison:\n", model_df)

In [None]:
# Generate a predicted class for every customer in df. This covers the whole
# frame, i.e. rows that were used for training as well as the held-out rows.
ids = df['ID_code']
X_all = df.drop(columns=['ID_code', 'target'])

# Apply the already-fitted scaler before predicting with the tuned model.
X_all_scaled = scaler.transform(X_all)
y_pred_all = best_model.predict(X_all_scaled)

customer_predictions = pd.DataFrame(
    {'ID_code': ids, 'Predicted_Transaction': y_pred_all}
)

# Show the first 100 predictions.
customer_predictions.head(100)
