In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('E://m//CUSTOMER CHURN dataset//Churn_Modelling.csv') 

In [3]:
print(df.head())
print(df.info())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [4]:
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [5]:
print(df.isnull().sum())


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [6]:
df.dropna(inplace=True)


In [7]:
X = df.drop(['Exited'], axis=1) 
y = df['Exited']


In [8]:

print(df.columns)


Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [9]:
X = df.drop(['Exited'], axis=1) 
y = df['Exited']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
import pandas as pd
df = pd.read_csv('E://m//CUSTOMER CHURN dataset//Churn_Modelling.csv')
df_numeric = df.apply(pd.to_numeric, errors='coerce')
print(df_numeric.isnull().sum()) 
df_cleaned = df_numeric.dropna()
df_cleaned = df_numeric.fillna(0)

RowNumber              0
CustomerId             0
Surname            10000
CreditScore            0
Geography          10000
Gender             10000
Age                    0
Tenure                 0
Balance                0
NumOfProducts          0
HasCrCard              0
IsActiveMember         0
EstimatedSalary        0
Exited                 0
dtype: int64


In [12]:
import pandas as pd
df = pd.read_csv('E://m//CUSTOMER CHURN dataset//Churn_Modelling.csv')
print(df.dtypes)

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object


In [13]:
non_numeric_columns = df.select_dtypes(include=['object']).columns
for col in non_numeric_columns:
    print(f"Non-numeric data in {col}:")
    print(df[col].unique())


Non-numeric data in Surname:
['Hargrave' 'Hill' 'Onio' ... 'Kashiwagi' 'Aldridge' 'Burbidge']
Non-numeric data in Geography:
['France' 'Spain' 'Germany']
Non-numeric data in Gender:
['Female' 'Male']


In [14]:
df.replace('p:An', np.nan, inplace=True)
df.dropna(inplace=True)

In [15]:
from sklearn.preprocessing import StandardScaler
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
X_train_numeric = df[numeric_columns] 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)

In [26]:
print(df.head(10))


   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   
5          6    15574012       Chu          645     Spain    Male   44   
6          7    15592531  Bartlett          822    France    Male   50   
7          8    15656148    Obinna          376   Germany  Female   29   
8          9    15792365        He          501    France    Male   44   
9         10    15592389        H?          684    France    Male   27   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0       

In [16]:
X_train_cleaned = df[numeric_columns]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_cleaned)


In [17]:
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)


In [18]:
import pandas as pd
data = pd.read_csv('E://m//CUSTOMER CHURN dataset//Churn_Modelling.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1] 


In [19]:
print(data.isnull().sum())


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [20]:
y = data.iloc[:, -1].values.flatten()

In [21]:
data['Surname'] = data['Surname'].astype(str).apply(lambda x: float(x) if x.isnumeric() else np.nan)

In [22]:
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier
data = pd.read_csv('E://m//CUSTOMER CHURN dataset//Churn_Modelling.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
model = DecisionTreeClassifier()
y_pred = cross_val_predict(model, X, y, cv=4) 
evaluate_model(y, y_pred)

ValueError: could not convert string to float: 'Johnstone'